diff options
Diffstat (limited to 'sci-biology/repbase/files/clean')
-rwxr-xr-x | sci-biology/repbase/files/clean | 59 |
1 files changed, 59 insertions, 0 deletions
#!/usr/bin/perl
#
# clean - filter a sequence library in '>'-header (FASTA-style) format.
# Shipped as sci-biology/repbase/files/clean (mode 100755).
#
# Usage: clean <infile> <outfile>
#
# The input is read as a series of records: one header line starting with
# '>' followed by any number of sequence lines.  Sequence lines are
# upper-cased and joined into a single string.  Each record is then handled
# as follows:
#
#   * empty sequence                      -> reported on STDOUT, dropped
#   * sequence contains characters other
#     than ACGTNWSYRMHKXDVB               -> reported on STDOUT together with
#                                            the sequence stripped of ACGTN,
#                                            dropped
#   * otherwise every W S Y R M H K X D V B is rewritten to N; if the result
#     contains "NNNN" the record is reported on STDOUT and dropped, else the
#     header line and the one-line sequence are written to <outfile>.
#
# Exits non-zero if the arguments are missing or a file cannot be opened,
# written or closed.

use strict;
use warnings;

# Separator printed before every diagnostic message.
my $SEPARATOR = "---------------------------------------------------------------------------------------\n";

# Validate one record and either write it to $out or report why it was
# rejected.
#
#   $out      - output filehandle for accepted records
#   $name     - header line of the record, including its trailing newline;
#               "" means no header has been seen yet
#   $sequence - upper-cased sequence with newlines already removed
#
# Dies if writing to $out fails.
sub flush_record
{
    my ($out, $name, $sequence) = @_;

    if ($name eq "")
    {
        # Data seen before the first header belongs to no record; do not let
        # it leak into the first real record.
        warn "Ignoring sequence data found before the first header line\n"
            if $sequence ne "";
        return;
    }

    if ($sequence eq "")
    {
        print $SEPARATOR;
        print "Empty sequence: " . $name . "\n";
        return;
    }

    if ($sequence !~ m/^[ACGTNWSYRMHKXDVB]*$/)
    {
        print $SEPARATOR;
        print "Invalid entry: " . $name;
        # Show only the characters that are not plain ACGTN.
        $sequence =~ s/[ACGTN]//g;
        print "Reduced sequence:\n";
        print $sequence . "\n";
        return;
    }

    $sequence =~ s/[WSYRMHKXDVB]/N/g;
    if ($sequence =~ m/NNNN/)
    {
        print $SEPARATOR;
        print "Too many N's: " . $name;
        return;
    }

    print {$out} $name, $sequence, "\n"
        or die "Cannot write record: $!\n";
    return;
}

# Extra arguments are ignored, as before; only the first two are used.
@ARGV >= 2
    or die "Usage: $0 <infile> <outfile>\n";
my ($infile, $outfile) = @ARGV;

open(my $lib, '<', $infile)
    or die "Cannot open '$infile' for reading: $!\n";
open(my $clib, '>', $outfile)
    or die "Cannot open '$outfile' for writing: $!\n";

my $name     = "";
my $sequence = "";

while (my $line = <$lib>)
{
    if ($line =~ m/^>/)
    {
        # A new header terminates the record collected so far.
        flush_record($clib, $name, $sequence);
        $name     = $line;
        $sequence = "";
    }
    else
    {
        $line =~ s/\n//g;
        $sequence .= uc($line);
    }
}

# The final record has no following header to trigger its processing, so
# flush it explicitly once the input is exhausted.
flush_record($clib, $name, $sequence);

close($lib)
    or die "Cannot close '$infile': $!\n";
# Buffered write errors only surface at close time.
close($clib)
    or die "Cannot close '$outfile': $!\n";