--- est_annot.pm.ori 2011-02-21 17:19:51.000000000 +0100 +++ est_annot.pm 2011-02-23 00:06:03.000000000 +0100 @@ -37,7 +37,7 @@ sub store_blast_report($$$); #stores a blast report into the mysql database sub annotate_unigenes_from_db($); #adds to the unigenes table the best hits from the blast reports sub do_blast_annotation (\@$); #annotates a unigene acording to the blast results obtained for a single database -sub hit_is_not_annoymous ($$); #checks if a hit has one of the annonymous tags +sub hit_is_not_anonymous ($$); #checks if a hit has one of the annonymous tags sub store_estscan_orfs($); #Stores the ESTScan results sub run_hmmer_for_unigenes ($); #runs hmmer for all the unigenes sub parse_and_store_hmmer($); #parses the hmmer result and stores it in the mysql database @@ -208,10 +208,10 @@ my $dbh=open_db($db) or return 0; my $st="INSERT INTO $table VALUES (?,?,?,?,?,?);"; - my $sth=$dbh->prepare($st); + my $sth=$dbh->prepare($st) or die localtime().": Error preparing INSERT INTO $table VALUES (?,?,?,?,?,?): $DBI::errstr\n"; my $num_micros=0; - open IN, "<$in_file"; + open IN, "<$in_file" or die ocaltime().": Error opening input file $in_file: $!\n"; my $seq_name; my $num_micros_seq=0; while (my $line=){ @@ -279,7 +279,7 @@ my $cmd="$bin -u $max_unit_length -v $min_unit_length -L $min_length_of_ssr $in_file > $out_file"; not (system ($cmd)) or ( print localtime().": Something went wrong running Sputnik to find microsatellites\n" and - print localtime().": Command was: $cmd\n" and + print localtime().": Command was: $cmd: $!\n" and return 0 ); return 1; @@ -322,7 +322,7 @@ my $sth=prepare_st($dbh,$st) or return 0; execute_st($sth) or return 0; open OUT,">$outf" or ( - print localtime().": There was a problem opening the file $outf as output\n" and + print localtime().": There was a problem opening the file $outf as output: $!\n" and return 0 ); while (my @field = $sth->fetchrow_array){ @@ -371,8 +371,9 @@ } my $table=read_config($config,'hmmer_table',''); my $dbh=open_db($db) or return 0; - my $st="INSERT INTO $table VALUES ('',?,?,?,?,?,?,?,'');"; - my $sth=$dbh->prepare($st); + my $st="INSERT INTO $table VALUES (NULL,?,?,?,?,?,?,?,NULL);"; + print localtime().": Trying to prepare \"$st\" for db=$db and table=$table\n"; + my $sth=$dbh->prepare($st) or print localtime().": Error preparing $st statement: $DBI::errstr\n"; my $ok=1; @@ -427,7 +428,7 @@ my $out=read_config($config,'hmmer_res',''); my $micros=read_config($config,'micros',''); unless ($bin and $db and $out and $micros){ - print localtime().": Some of the parameters needed by HMMER are missing\n"; + print localtime().": Some of the parameters needed by HMMER are missing: $bin, $db, $out, $micros\n"; return 0; } @@ -439,7 +440,7 @@ $cmd="$pvmd_bin $host_file &"; print localtime().": Launching pvm: $cmd\n"; not (system($cmd)) or ( - print localtime().": There was a problem launching the pvm environment\n" and + print localtime().": There was a problem launching the pvm environment: $!\n" and return 0 ); $cmd="$bin --cpu $micros --pvm $db $in > $out"; @@ -452,7 +453,7 @@ stop_pvm($config); } if ($not_ok){ - print localtime().": There was a problem running HMMER\n"; + print localtime().": There was a problem running HMMER: $!\n"; return 0; } @@ -510,8 +511,8 @@ #begin_orf mediumint unsigned not null, #end_orf mediumint unsigned not null, #reverse tinyint(1) not null - my $st="INSERT INTO $table VALUES ('',?,?,?,?,?,?);"; - my $sth=$dbh->prepare($st); + my $st="INSERT INTO $table VALUES (NULL,?,?,?,?,?,?);"; + my $sth=$dbh->prepare($st) or die localtime().": Error preparing $st statement: $DBI::errstr\n"; my $no_exit=1; my $ok=1; @@ -544,7 +545,7 @@ } $sth->execute() or $ok=0; unless ($ok){ - print localtime().": There was a problem executing the statement $st\n"; + print localtime().": There was a problem executing the statement: $st: $DBI::errstr\n"; return 0; } }else{ @@ -583,7 +584,7 @@ my $cmd="$bin $seq_file -M $matrix -t $out_pep -o $out_dna\n"; not(system($cmd)) or ( print localtime().": There has been a problem running ESTScan.\n" and - print localtime().": Command was:\n$cmd\n" and + print localtime().": Command was:\n$cmd: $!\n" and return 0 ); @@ -698,7 +699,7 @@ if (not ($putative_e_value) or ($e_value < $putative_e_value)){ #we evaluate if this is a good hit #is greater than the $inf_limit - if (($e_value<$inf_limit) and hit_is_not_annoymous($hit,$config)){ + if (($e_value<$inf_limit) and hit_is_not_anonymous($hit,$config)){ #this is a good hit, we should annotate the unigene $putative_e_value=$e_value; #we remove the first word and the trailing information of the hit @@ -718,7 +719,7 @@ return $putative_annot; } -sub hit_is_not_annoymous ($$){ +sub hit_is_not_anonymous ($$){ my $hit=shift; my $config=shift; my $ok=1; @@ -730,7 +731,7 @@ if ($hit =~ /$tag/i) {return 0} } - # hit is also annonymous if it has no description + # hit is also anonymous if it has no description # JF, 2007-01-10 if ($hit =~ /^\t$/) {return 0} @@ -747,7 +748,7 @@ my $table=read_config($config,'unigenes_table2','') or return 0; my $st="UPDATE $table SET annotation=NULL;"; my $dbh=open_db($db) or ( - print localtime().": Unable to open database $db\n" and + print localtime().": Unable to open database $db: $DBI::errstr\n" and return 0 ); my $sth=prepare_st($dbh,$st) or return 0; @@ -776,7 +777,7 @@ my $cmd="cp $seq_file $blast_dir".$db_name; my $fail=system($cmd); if ($fail){ - print localtime().": There was a problem with the command: $cmd\n"; + print localtime().": There was a problem with the command: $cmd: $!\n"; return 0; } #format the database @@ -792,7 +793,7 @@ } $fail=system($cmd); #now the database is formatted if ($fail){ - print localtime().": There was a problem with the command: $cmd\n"; + print localtime().": There was a problem with the command: $cmd: $!\n"; print localtime().": Logfile is $log\n"; return 0; } @@ -986,7 +987,7 @@ my $in = new Bio::SearchIO(-format => 'blast',-file => $in_file); open OUT,">$out_file" or $ok=0; unless ($ok) { - print localtime().": The out file: $out_file could not be opened.\n"; + print localtime().": The out file: $out_file could not be opened: $!\n"; return 0; } @@ -1092,7 +1093,7 @@ #prints a wellcome message sub wellcome_annot () { - print localtime().": Wellcome to the EST annotation pipe, have a pleasant jouney.\n\n"; + print localtime().": Welcome to the EST annotation pipe, have a pleasant journey.\n\n"; } @@ -1112,6 +1113,7 @@ my $dbh=open_db($db) or return 0; #if the database is superunigenes previous blast superunigenes should be removed if ($blast_db =~ /superunigenes$/){ + print localtime().": Deleting previous BLAST results: DELETE FROM blast_result WHERE db='superunigenes'\n"; my $st="DELETE FROM blast_result WHERE db='superunigenes'"; my $sth=$dbh->prepare($st) or return 0; execute_st($sth) or return 0; @@ -1129,18 +1131,19 @@ if ($ext){$temp_in.=$separator.$ext} #we create a new in file with the sequences that still has no blast stored in the database - #if the blast is alredy done, there is no need to repeat it + #if the blast is already done, there is no need to repeat it my $st; if (read_config($config,'mysql_compress','')){ $st="SELECT UNCOMPRESS(blast_result) FROM blast_result WHERE query_seq=? AND db='$blast_db'"; }else{ $st="SELECT blast_result FROM blast_result WHERE query_seq=? AND db='$blast_db'"; } + print localtime().": Extracting from $in_file those FASTA entries which have no rows matching in blast_result.query_seq using Bio::SeqIO into $temp_in\n"; my $sth=$dbh->prepare($st) or return 0; my $num_blast_todo=0; my $in_seq = Bio::SeqIO->new('-file' => "<$in_file",'-format' => "fasta") or return 0; my $out_seq = Bio::SeqIO->new('-file' => ">$temp_in",'-format' => "fasta") or return 0; - open OUT, ">$blast_file_from_db"; + open OUT, ">$blast_file_from_db" or print localtime().": Cannot write to $blast_file_from_db file: $!\n"; while (my $seq=$in_seq->next_seq){ my $seq_name=$seq->id; $sth->bind_param(1,$seq_name); @@ -1162,10 +1165,11 @@ #we run the blast #we need to check for every blast if this blast has already been done to avoid repeating blast from different modules if (!$num_blast_todo){ - print localtime().": All BLASTs are already on the database\n"; + print localtime().": All BLASTs are already in the database: \$num_blast_todo=$num_blast_todo\n"; my $cmd="mv $blast_file_from_db $out_file"; !system ($cmd) or return 0; - return 1 + print localtime().": Renamed $blast_file_from_db to $out_file\n"; + return 1; } run_local_blast($temp_in,$in_type,$blast_file_from_blast,$blast_db,$config) or return 0; $dbh=open_db($db) or return 0; @@ -1173,14 +1177,18 @@ #we split the result and we store it on the database #we should do a minimal parsing but we don't use bioperl to speed up the process #we just asume that every blast begins with ^BLAST and that the query is ^Query= xxxxxxxx - open IN,"<$blast_file_from_blast" or return 0; + if (! -r $blast_file_from_blast) { + print localtime().": File $blast_file_from_blast is not readable: $!\n"; + return 0; + } + open IN,"<$blast_file_from_blast" or die localtime().": Cannot open $blast_file_from_blast file: $!\n"; my $more=1; my $blast_result=""; my $query=""; if (read_config($config,'mysql_compress','')){ - $st="INSERT INTO blast_result (query_seq,db,blast_result) VALUES (?,?,COMPRESS(?))"; + $st="INSERT DELAYED INTO blast_result (query_seq,db,blast_result) VALUES (?,?,COMPRESS(?))"; }else{ - $st="INSERT INTO blast_result (query_seq,db,blast_result) VALUES (?,?,?)"; + $st="INSERT DELAYED INTO blast_result (query_seq,db,blast_result) VALUES (?,?,?)"; } $sth=$dbh->prepare($st) or return 0; while ($more){ @@ -1211,6 +1219,7 @@ #now we have two files with blast results, the one generated by the blast and the one made by stored blast #we should merge them to obtain the output file my $cmd="cat $blast_file_from_blast $blast_file_from_db > $out_file"; + print localtime().": Merging computed blast output with the one already in database: 'cat $blast_file_from_blast $blast_file_from_db > $out_file'\n"; !system ($cmd) or return 0; return 1; @@ -1241,7 +1250,7 @@ print localtime().": There was a problem executing the statement:\n" and print localtime().": Query: $query\n" and print localtime().": Blast db: $blast_db\n" and - print localtime().": $st\n" and + print localtime().": $st: $DBI::errstr\n" and return 0 ); return 1; @@ -1267,25 +1276,55 @@ my $db;my $db_type; my $env_blastdir; my $program; if (not($db_name =~ /unigenes\.*\d*/) and $db_name ne 'superunigenes'){ + print localtime().": BLAST database $db_name does not match REGEXP with /unigenes\.*\\d*/ nor =='superunigenes',\n\t\t\tfetching its location from table db (sourced from databases.csv)\n"; $db=${$$db_info{$db_name}}{'local_blast_name'}; $db_type=${$$db_info{$db_name}}{'kind'}; + print localtime().": Inferred $db_type datatype from table db (sourced from databases.csv)\n"; }else{ + # unigenes and superunigenes are DNA datasets generated from this EST dataset + # albeit it is ugly these files were written into BLASTDB directory where typically site-wide, public databases exist if ($ENV{'BLASTDB'}) { #if blastdb enviroment is set + print localtime().": BLAST database $db_name will be looked up via BLASTDB=".$ENV{'BLASTDB'}."\n"; #$env_blastdir = $ENV{'BLASTDB'}; #$ENV{'BLASTDB'} = ''; $db=$db_name; #db is just db name }else{ #otherwise + print localtime().": Getting rid of the trailing '_dna' if possible\n"; $db_name =~ s/_dna$//; # get rid of '_dna' in db name $db=read_config($config,'blast_dir','').$db_name; #we should add the path to the database + print localtime().": BLAST database $db was looked up via est_pipe.conf variable \$blast_dir\n"; } $db_type='dna'; + print localtime().": Forced 'dna' datatype because unigenes and superunigenes are DNA datasets generated from this EST dataset.\n\t\t\t However, it is ugly that these files were written into BLASTDB defined directory where typically site-wide, public databases exist.\n"; } unless (check_blastdb($ENV{'BLASTDB'},$db)) { print localtime().": BLAST database $db cannot be found at $ENV{'BLASTDB'}\n"; + } else { + if (-r $ENV{'BLASTDB'}."/".$db) { + print localtime().": BLAST database $db exists in $ENV{'BLASTDB'}\n"; + if ($db_type eq 'dna') { + if (! -r $ENV{'BLASTDB'}."/".$db.".nin") { + system("cd $ENV{'BLASTDB'} && formatdb -p F -i $db") or die "Failed to formatdb -p F -i $db\n"; + } else { + print localtime().": BLAST database ".$ENV{'BLASTDB'}."/".$db." already formatted, good\n"; + } + } elsif ($db_type eq 'pep') { + if (! -r $ENV{'BLASTDB'}."/".$db.".pin") { + system("cd $ENV{'BLASTDB'} && formatdb -p T -i $db") or die "Failed to formatdb -p T -i $db\n"; + } else { + print localtime().": BLAST database ".$ENV{'BLASTDB'}."/".$db." already formatted, good\n"; + } + } else { + print localtime().": Do not know whether $db is a pep/dna database, not checking whether it is formatted by formatdb, it is your job\n"; + } + } else { + print localtime().": Cannot find BLAST database ".$ENV{'BLASTDB'}."/".$db."\n"; + return 0; + } } if (!$db_type){ - print localtime().": Database type (dna or pep) not defined for database $db_name\n"; + print localtime().": Database type (dna or pep) not defined for database $db_name in databases.csv and subsequently in table db.\n"; return 0; } $program = ${$$db_info{$db_name}}{'blast_program'}; @@ -1294,6 +1333,7 @@ elsif ($in_type eq 'pep' and $db_type eq 'dna'){$program='tblastn'} elsif ($in_type eq 'dna' and $db_type eq 'pep'){$program='blastx'} elsif ($in_type eq 'pep' and $db_type eq 'pep'){$program='blastp'} + else { print localtime().": Wrong combination of input data type=$in_type and db.kind=$in_type, do not know which of blastn/tblastn/blastx/blastp to use\n"; return 0; } }elsif($program ne'blastn' and $program ne'tblastn' and $program ne'blastx' and $program ne'blastp' and $program ne'tblastx'){ print localtime().": BLAST program called $program is not allowed\n"; return 0; @@ -1330,7 +1370,7 @@ $ok=not (system("rm $out_file")); unless ($ok){ print localtime().": The BLAST output file: $out_file\n"; - print localtime().": exists and can not be removed\n"; + print localtime().": exists and can not be removed: $!\n"; return 0; }; } @@ -1346,7 +1386,7 @@ my $fail=system($cmd); if ($fail){ print localtime().": Something wen't wrong with the mpiBLAST\n"; - print localtime().": Command was: $cmd\n"; + print localtime().": Command was: $cmd: $!\n"; return 0; } } else { @@ -1357,7 +1397,7 @@ my $fail=system($cmd); if ($fail){ print localtime().": Something wen't wrong with the BLAST\n"; - print localtime().": Command was: $cmd\n"; + print localtime().": Command was: $cmd: $!\n"; return 0; } }else{ @@ -1371,7 +1411,7 @@ #we create the input stream my $in = Bio::SeqIO->new(-file => $in_file , -format => 'Fasta') or $ok=0; unless ($ok){ - print localtime().": There was a problem opening the file $in_file\n"; + print localtime().": There was a problem opening the file $in_file: $!\n"; } my $seq; my $blast_report; while ($seq=$in->next_seq()) { @@ -1380,13 +1420,13 @@ #input file there could be some memory trouble $blast_report= $factory->blastall($seq); unless ($blast_report) { - print localtime().": There was a problem running the BLAST search\n"; + print localtime().": There was a problem running the BLAST search: $!\n"; return 0; } - system("cat $temp_file >> $out_file") + system("cat $temp_file >> $out_file") or die "Cannot append to $out_file: $!\n"; } - #we remove the temporal file - system("rm $temp_file"); + #we remove the temporary file + system("rm $temp_file") or die "Cannot remove temporary file $temp_file: $!\n"; #my $blast_report = $factory->blastall($in_file); } } @@ -1425,7 +1465,7 @@ my $blast_dbs=read_config($config,'blast_dbs',''); my @blast_dbs=split(/,/,$blast_dbs); unless (@blast_dbs) { - print localtime().": There are no BLAST databases defined in the config file\n"; + print localtime().": There are no BLAST databases defined in the config file under \$blast_dbs\n"; return 0; } #how are the report files named? @@ -1441,7 +1481,7 @@ } my $db=read_config($config,'db_path',''); #complete path to the mysql db unless (check_table($db,$table)){ - print localtime().": Table $table for store the BLAST reports is not in database $db\n"; + print localtime().": Table $table to store the BLAST reports is not in created in the database $db\n"; return 0; } @@ -1461,7 +1501,7 @@ my $config=shift; my $blast_db=shift; my $in_file=shift; - print localtime().": Storing BLAST report: $in_file into the database\n"; + print localtime().": Storing BLAST report from $in_file into the database\n"; my $table=read_config($config,'blast_reports_table',''); my $db=read_config($config,'db_path',''); #complete path to the mysql db @@ -1477,13 +1517,13 @@ #| identity | float | | | 0 | | #| similarity | float | | | 0 | | #| e_value - my $st="INSERT INTO $table (blast_result_id,subj_acc,description,location_begin_query,location_end_query,location_query,location_begin_subj,location_end_subj,location_subj,identity,similarity,e_value) VALUES (?,?,?,?,?,?,?,?,?,?,?,?);"; + my $st="INSERT DELAYED INTO $table (blast_result_id,subj_acc,description,location_begin_query,location_end_query,location_query,location_begin_subj,location_end_subj,location_subj,identity,similarity,e_value) VALUES (?,?,?,?,?,?,?,?,?,?,?,?);"; my $sth=$dbh->prepare($st) or return 0; my $st_blast_result="SELECT blast_result_id FROM blast_result WHERE query_seq=? AND db=?"; my $sth_blast_result=$dbh->prepare($st_blast_result) or return 0; unless ($sth){ print localtime().": There was a problem preparing statement:\n"; - print localtime().": $st\n"; + print localtime().": $st: $DBI::errstr\n"; close_db($dbh); return 0; } @@ -1493,8 +1533,10 @@ } open IN, "<$in_file" or $ok=0; unless ($ok) { - print localtime()." There was a problem opening file $in_file\n"; + print localtime()." There was a problem opening file $in_file: $!\n"; return 0; + } else { + print localtime().": Parsing $in_file with BLAST plaintext output\n"; } my $line;my $query;my $hit; while ($ok){ @@ -1513,7 +1555,7 @@ if ($array[0]){ $blast_result_id=$array[0] }else{ - print localtime().": BLAST result ID has not been found in the database\n"; + print localtime().": BLAST result blast_result_id has not been found in the database using query_seq=$query AND db=$blast_db\n"; return 0; } @@ -1558,6 +1600,7 @@ print localtime().": query: $query\n"; print localtime().": blast_db: $blast_db\n"; print localtime().": hit: $hit\n"; + print localtime().": The error was: $DBI::errstr\n"; return 0; } }