diff options
author | Martin Mokrejš <mmokrejs@fold.natur.cuni.cz> | 2016-02-10 15:50:27 +0100 |
---|---|---|
committer | Martin Mokrejš <mmokrejs@fold.natur.cuni.cz> | 2016-02-10 15:50:27 +0100 |
commit | e870de5e7451b02cb563d5567dcb7ddccced0eab (patch) | |
tree | 56a7b3bc40c3f203c494ca667e97a01ac18abf08 | |
parent | sci-biology/SnpEff: version bump (diff) | |
download | sci-e870de5e7451b02cb563d5567dcb7ddccced0eab.tar.gz sci-e870de5e7451b02cb563d5567dcb7ddccced0eab.tar.bz2 sci-e870de5e7451b02cb563d5567dcb7ddccced0eab.zip |
sci-biology/full_lengther_next: version bump
Package-Manager: portage-2.2.26
-rw-r--r-- | sci-biology/full_lengther_next/Manifest | 2 | ||||
-rw-r--r-- | sci-biology/full_lengther_next/files/download_fln_dbs.rb | 260 | ||||
-rw-r--r-- | sci-biology/full_lengther_next/full_lengther_next-0.5.6.ebuild (renamed from sci-biology/full_lengther_next/full_lengther_next-0.0.8.ebuild) | 5 |
3 files changed, 266 insertions, 1 deletion
diff --git a/sci-biology/full_lengther_next/Manifest b/sci-biology/full_lengther_next/Manifest index 6b1755016..04acafb04 100644 --- a/sci-biology/full_lengther_next/Manifest +++ b/sci-biology/full_lengther_next/Manifest @@ -1 +1 @@ -DIST full_lengther_next-0.0.8.gem 38912 SHA256 2808de8e04aea2118176ad04298a33e49e60048c8ef7d0f332d98308ae2d4664 SHA512 73c299f564c1580d737be49426e1363b171fcccea81b849f7311da328b6011b99847f28685266f6faebcb128d5f63c787dc27028f009ed25641e1b66ae6cfcf6 WHIRLPOOL 672ce60224cc431f2565b40ca583f9377ffc1c93b55e8a6b7f8055e467be7753edcdfd35f716316d37cc29c4c56759dc8c4a26a1b25ffcfaadd361218cc3422a +DIST full_lengther_next-0.5.6.gem 2038784 SHA256 3e2afada6feada3e4503679a113d9b898061beb27e989c2fe40d35b331f9a417 SHA512 af81e72e3f2e21a0cd013ded12df69d7b877986eb4392691931d93c25acce87c5dec04346b5efa37ccb43a0f3aceabaca7f4cf2a99d5ae679325b18329548b7b WHIRLPOOL 2313280bf69fe2c1b34f81125fa3a1882bf410b106bf53c35e0fe410dea5b38f97cf23bdb7e1bdaa8eb0c52dfe5a53c335904e021431068b8ee99043e457ac8d diff --git a/sci-biology/full_lengther_next/files/download_fln_dbs.rb b/sci-biology/full_lengther_next/files/download_fln_dbs.rb new file mode 100644 index 000000000..90eacbbb8 --- /dev/null +++ b/sci-biology/full_lengther_next/files/download_fln_dbs.rb @@ -0,0 +1,260 @@ +#!/usr/bin/env ruby + +# 15-2-2011 Noe Fernandez-Pozo +# Script to download Full-LengtherNext databases. +# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL. + +require 'net/ftp' +require 'open-uri' + +class FtpClient + +def initialize +end + +def connect(server) + @server=server +end + +def login + +end + +def chdir(dir) + @dir=dir +end + +def getbinaryfile(file,output_file) + if !File.exists?(output_file) && !File.exists?(output_file.gsub('.gz','')) + puts " - Downloading" + cmd="wget #{@server}/#{@dir}/#{file} -O #{output_file}" + system(cmd) + else + puts "File #{output_file}, or #{output_file.gsub('.gz','')} already exists. 
Skip download" + end + +end + +def close +end + +end +################################################### Functions + +def download_ncrna(formatted_db_path) + + if !File.exists?(File.join(formatted_db_path, "nc_rna_db")) + Dir.mkdir(File.join(formatted_db_path, "nc_rna_db")) + end + + puts "Downloading ncRNA database" + open(File.join(formatted_db_path, "nc_rna_db/ncrna_fln_100.fasta.zip"), "wb") do |my_file| + my_file.print open('http://www.scbi.uma.es/downloads/FLNDB/ncrna_fln_100.fasta.zip').read + end + puts "\nncRNA database downloaded" + + ncrna_zip=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta.zip') + ncrna_out_dir=File.join(formatted_db_path,'nc_rna_db') + system("unzip", ncrna_zip, "-d", ncrna_out_dir) + system("rm", ncrna_zip) + + puts "\nncRNA database decompressed" + + ncrna_fasta=File.join(formatted_db_path,'nc_rna_db','ncrna_fln_100.fasta') + system("makeblastdb", "-in", ncrna_fasta, "-dbtype", "nucl", "-parse_seqids") + + puts "\nncRNA database completed" +end + +def conecta_uniprot(my_array, formatted_db_path) + + #$ftp = Net::FTP.new() + $ftp = FtpClient.new() + + if !File.exists?(formatted_db_path) + Dir.mkdir(formatted_db_path) + end + + $ftp.connect('ftp://ftp.uniprot.org') + + $ftp.login + + puts "connected to UniProt" + + my_array.each do |db_group| + puts "Downloading #{db_group}" + download_uniprot(db_group, formatted_db_path) + end + + varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz') + $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete") + $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out) + + puts "isoform files downloaded" + + $ftp.close + +end + +def download_uniprot(uniprot_group, formatted_db_path) + + sp_out=File.join(formatted_db_path,"uniprot_sprot_#{uniprot_group}.dat.gz") + tr_out=File.join(formatted_db_path,"uniprot_trembl_#{uniprot_group}.dat.gz") + $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions") + puts " from 
ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_#{uniprot_group}.dat.gz" + puts " from ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_trembl_#{uniprot_group}.dat.gz" + $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out) + $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out) + + puts "#{uniprot_group} files downloaded" + +end + +def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path) + + puts "filtering sequences from #{file_name}" + + # UniProtKB fragments with FT NON_CONS and FT NON_TER features. + # + # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines: + # FT NON_TER 1 1 + # FT NON_TER 29 29 + # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line: + # FT NON_CONS 1683 1684 + # + # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'. 
+ + newseq=false + print_seq=true + id='' + description = '' + organism_name = '' + seq = '' + organelle = '' + + file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/ + db_name = $1 + output_name = $2 + db_name.sub!('sprot','sp') + db_name.sub!('trembl','tr') + + if !File.exists?(File.join(formatted_db_path, "#{db_name}_#{output_name}")) + Dir.mkdir(File.join(formatted_db_path, "#{db_name}_#{output_name}")) + end + + output_file = File.new(File.join(formatted_db_path, "#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta"), "w") + + File.open(file_name).each_line do |line| + if (newseq == false) + if (line =~ /^AC\s+(\w+);/) + id=$1 + newseq = true + description = '' + organism_name = '' + seq = '' + print_seq = true + organelle = '' + end + else + if (line =~ /^DE\s+(.+)\;*/) + if (description == '') + description = $1 + description.sub!(/RecName: Full=/,'sp=') + description.sub!(/SubName: Full=/,'tr=') + end + if (line =~ /Flags: Fragment/) + # puts "#{id} #{line}" + print_seq=false + end + elsif (line =~ /^OS\s+(.+)/) + organism_name = $1 + elsif (line =~ /^OG\s+(.+)/) + organelle = $1 + elsif (line =~ /^FT\s+NON_TER\s+/) + print_seq=false + # puts "#{id} NON_TER" + elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/) + print_seq=false + # puts "#{id} NON_CONS" + elsif (line =~ /^\s+([\w\s]+)/) + seq += $1 + elsif (line =~ /^\/\//) + seq.gsub!(/\s*/,'') + if (seq !~ /^M/i) + print_seq=false + end + newseq = false + + if (print_seq) + output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" + if (!isoform_hash[id].nil?) + output_file.puts isoform_hash[id] + end + end + end + end + end + output_file.close +end + +def load_isoform_hash(file) + + isoform_hash = {} + my_fasta = '' + acc = '' + File.open(file).each do |line| + line.chomp! + if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/) + if (isoform_hash[acc].nil?) 
+ isoform_hash[acc]= "#{my_fasta}\n" + else + isoform_hash[acc]+= "#{my_fasta}\n" + end + my_fasta = "#{$1}\n" + acc = $2 + else + my_fasta += line + end + end + + return isoform_hash +end + +################################################### MAIN + +ROOT_PATH=File.dirname(__FILE__) + +if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB']) + formatted_db_path = ENV['BLASTDB'] +else # otherwise use ROOTPATH + DB + formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs")) +end + +ENV['BLASTDB']=formatted_db_path +puts "Databases will be downloaded at: #{ENV['BLASTDB']}" +puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n" + +my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"] +# my_array = ["plants","human"] # used for a shoter test + +conecta_uniprot(my_array, formatted_db_path) +system('gunzip '+File.join(formatted_db_path,'*.gz')) + +isoform_hash = {} +isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta")) + +download_ncrna(formatted_db_path) + +my_array.each do |db_group| + + filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_sprot_#{db_group}.dat"), isoform_hash, formatted_db_path) + filter_incomplete_seqs(File.join(formatted_db_path, "uniprot_trembl_#{db_group}.dat"), isoform_hash, formatted_db_path) + + sp_fasta=File.join(formatted_db_path,"sp_#{db_group}","sp_#{db_group}.fasta") + tr_fasta=File.join(formatted_db_path,"tr_#{db_group}","tr_#{db_group}.fasta") + system("makeblastdb -in #{sp_fasta} -dbtype 'prot' -parse_seqids") + system("makeblastdb -in #{tr_fasta} -dbtype 'prot' -parse_seqids") + +end + +puts "download_fln_dbs.rb has finished" diff --git a/sci-biology/full_lengther_next/full_lengther_next-0.0.8.ebuild b/sci-biology/full_lengther_next/full_lengther_next-0.5.6.ebuild index 0465bc02e..52412c00a 100644 --- 
a/sci-biology/full_lengther_next/full_lengther_next-0.0.8.ebuild +++ b/sci-biology/full_lengther_next/full_lengther_next-0.5.6.ebuild @@ -28,3 +28,8 @@ RDEPEND="${DEPEND} >=sci-biology/scbi_mapreduce-0.0.29 >=sci-biology/scbi_plot-0.0.6 >=dev-ruby/xml-simple-1.0.12" + +#src_prepare(){ +# cp "${FILESDIR}"/download_fln_dbs.rb all/full_lengther_next-0.0.8/bin || die +# chmod a+rx all/full_lengther_next-0.0.8/bin/*.rb || die +#} |