From aab87747b89b64107677056a3d4874d8f5ee7bbf Mon Sep 17 00:00:00 2001 From: Martin Mokrejš Date: Tue, 21 Nov 2017 17:11:28 +0100 Subject: sci-biology/SEECER: execute jellyfish1 instead of jellyfish Also I wrote a few cleanup patches to expose THREADS variable and cleanup the code. Package-Manager: Portage-2.3.14, Repoman-2.3.6 --- sci-biology/SEECER/SEECER-0.1.3-r2.ebuild | 7 ++- .../SEECER/files/rename_jellyfish_binary.patch | 11 ++++ sci-biology/SEECER/files/run_jellyfish.sh.patch | 72 ++++++++++++++++++++++ sci-biology/SEECER/files/run_seecer.sh.patch | 42 +++++++++++++ 4 files changed, 130 insertions(+), 2 deletions(-) create mode 100644 sci-biology/SEECER/files/rename_jellyfish_binary.patch create mode 100644 sci-biology/SEECER/files/run_jellyfish.sh.patch create mode 100644 sci-biology/SEECER/files/run_seecer.sh.patch (limited to 'sci-biology/SEECER') diff --git a/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild b/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild index 60862d8c8..0b7ec3bfa 100644 --- a/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild +++ b/sci-biology/SEECER/SEECER-0.1.3-r2.ebuild @@ -5,7 +5,7 @@ EAPI=6 inherit eutils -DESCRIPTION="SEquencing Error Corrector for RNA-Seq reads" +DESCRIPTION="SEquence Error Corrector for RNA-Seq reads" HOMEPAGE="http://sb.cs.cmu.edu/seecer/" SRC_URI=" http://sb.cs.cmu.edu/seecer/downloads/"${P}".tar.gz @@ -22,12 +22,15 @@ DEPEND=" sci-libs/gsl:0= sci-biology/seqan:0=" RDEPEND="${DEPEND} - =sci-biology/jellyfish-1.1.11" + =sci-biology/jellyfish-1.1.11-r1" S="${S}"/SEECER PATCHES=( "${FILESDIR}"/remove-hardcoded-paths.patch + "${FILESDIR}"/run_seecer.sh.patch + "${FILESDIR}"/run_jellyfish.sh.patch + "${FILESDIR}"/rename_jellyfish_binary.patch ) src_prepare(){ diff --git a/sci-biology/SEECER/files/rename_jellyfish_binary.patch b/sci-biology/SEECER/files/rename_jellyfish_binary.patch new file mode 100644 index 000000000..c6548cee1 --- /dev/null +++ b/sci-biology/SEECER/files/rename_jellyfish_binary.patch @@ -0,0 +1,11 @@ +--- SEECER/bin/run_seecer.sh.ori 2017-11-21 16:56:28.808767468 +0100 ++++ SEECER/bin/run_seecer.sh 2017-11-21 16:57:07.469835728 +0100 +@@ -26,7 +26,7 @@ + + + BINDIR='' #this can be hardcoded to /absolute/path/to/SEECER/bin/ +-JF="jellyfish" #this may be hardcoded to /absolute/path/to/jellyfish/bin/ ++JF="jellyfish1" #this may be hardcoded to /absolute/path/to/jellyfish/bin/jellyfish + + K=17 + SEECER_PARAMS="" diff --git a/sci-biology/SEECER/files/run_jellyfish.sh.patch b/sci-biology/SEECER/files/run_jellyfish.sh.patch new file mode 100644 index 000000000..7631f5a4c --- /dev/null +++ b/sci-biology/SEECER/files/run_jellyfish.sh.patch @@ -0,0 +1,72 @@ +--- SEECER-0.1.3/bin/run_jellyfish.sh.ori 2017-11-21 16:41:54.164599838 +0100 ++++ SEECER-0.1.3/bin/run_jellyfish.sh 2017-11-21 16:46:28.022166903 +0100 +@@ -1,18 +1,45 @@ + #!/bin/bash ++ ++# Usage: run_jellyfish.sh jellyfish_binpath tempfile_prefix kmersize mincount tmpdir infile1 [infile2] threads + JF=$1 + LCOUNT=$4 + TMPDIR=$5 ++THREADS=${8:-32} + + if [ -z "$JF" ]; then + echo "No path to jellyfish binary provided, exiting."; + exit 255; + fi + ++# Usage: jellyfish count [options] file:path+ ++# ++# Count k-mers or qmers in fasta or fastq files ++# ++# Options (default value in (), *required): ++# -m, --mer-len=uint32 *Length of mer ++# -s, --size=uint64 *Hash size ++# -t, --threads=uint32 Number of threads (1) ++# -o, --output=string Output prefix (mer_counts) ++# -c, --counter-len=Length in bits Length of counting field (7) ++# --out-counter-len=Length in bytes Length of counter field in output (4) ++# -C, --both-strands Count both strand, canonical representation (false) ++# -p, --reprobes=uint32 Maximum number of reprobes (62) ++# -r, --raw Write raw database (false) ++# -q, --quake Quake compatibility mode (false) ++# --quality-start=uint32 Starting ASCII for quality values (64) ++# --min-quality=uint32 Minimum quality. A base with lesser quality becomes an N (0) ++# -L, --lower-count=uint64 Don't output k-mer with count < lower-count ++# -U, --upper-count=uint64 Don't output k-mer with count > upper-count ++# --invalid-char=warn|ignore|error How to treat invalid characters. The char is changed to a N. (warn) ++# --matrix=Matrix file Hash function binary matrix ++# --timing=Timing file Print timing information ++# --stats=Stats file Print stats ++# + if [ "$#" -eq "4" ]; + then +-$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t 32 --both-strands $6 || exit 255 ++$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t $THREADS --both-strands $6 || exit 255 + else +-$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t 32 --both-strands $6 $7 || exit 255 ++$JF count -m $3 -o $TMPDIR/jf_tmp -c 3 -s 10000000 -t $THREADS --both-strands $6 $7 || exit 255 + fi; + + # merge +@@ -25,5 +52,21 @@ + rm $TMPDIR/jf_tmp_* + fi + ++# ++# Usage: jellyfish dump [options] db:path ++# ++# Dump k-mer counts ++# ++# By default, dump in a fasta format where the header is the count and ++# the sequence is the sequence of the k-mer. The column format is a 2 ++# column output: k-mer count. ++# ++# Options (default value in (), *required): ++# -c, --column Column format (false) ++# -t, --tab Tab separator (false) ++# -L, --lower-count=uint64 Don't output k-mer with count < lower-count ++# -U, --upper-count=uint64 Don't output k-mer with count > upper-count ++# -o, --output=string Output file ++# + $JF dump --lower-count=$LCOUNT -o $2 -c $TMPDIR/jf_merged_$3 || exit 255 + rm $TMPDIR/jf_merged_$3 diff --git a/sci-biology/SEECER/files/run_seecer.sh.patch b/sci-biology/SEECER/files/run_seecer.sh.patch new file mode 100644 index 000000000..a20c7917f --- /dev/null +++ b/sci-biology/SEECER/files/run_seecer.sh.patch @@ -0,0 +1,42 @@ +--- SEECER/bin/run_seecer.sh.old 2013-10-02 18:55:24.000000000 +0200 ++++ SEECER/bin/run_seecer.sh 2017-11-21 16:24:24.065584149 +0100 +@@ -33,6 +33,7 @@ + SeecerStep=1 + LCOUNT=3 + TMPDIR='' ++THREADS=32 + + usage=$(cat << EOF + # This script runs the SEECER pipeline of 4 steps: +@@ -54,11 +55,12 @@ + -j : specify the location of JELLYFISH binary (default = $JF). + -p : specify extra SEECER parameters (default = ''). + -s : specify the starting step ( default = 1). Values = 1,2,3,4. ++ -c : number of threads (default = 32). + -h : help message + EOF + ); + +-while getopts ":j:p:k:s:t:h" opt; do ++while getopts ":j:p:k:s:t:c:h" opt; do + case $opt in + t) + TMPDIR=$OPTARG +@@ -75,6 +77,8 @@ + s) + SeecerStep=$OPTARG + ;; ++ c) ++ THREADS=$OPTARG + \?) + echo "Invalid option: -$OPTARG" >&2 + echo "$usage" +@@ -170,7 +177,7 @@ + then + echo "++ Step 2: Running JELLYFISH to count kmers ..." + echo +- bash "${BINDIR}"run_jellyfish.sh $JF $TMPDIR/counts_${K}_${LCOUNT} $K $LCOUNT $TMPDIR $Read1_N $Read2_N || exit 255 ++ bash "${BINDIR}"run_jellyfish.sh $JF $TMPDIR/counts_${K}_${LCOUNT} $K $LCOUNT $TMPDIR $Read1_N $Read2_N $THREADS || exit 255 + fi; + + if [ ! -r $TMPDIR/counts_${K}_${LCOUNT} ]; -- cgit v1.2.3-65-gdbad