summary refs log tree commit diff
diff options
context:
space:
mode:
Diffstat (limited to 'sci-biology/repbase/files/clean')
-rwxr-xr-x sci-biology/repbase/files/clean 59
1 files changed, 59 insertions, 0 deletions
diff --git a/sci-biology/repbase/files/clean b/sci-biology/repbase/files/clean
new file mode 100755
index 0000000..37ee59c
--- /dev/null
+++ b/sci-biology/repbase/files/clean
@@ -0,0 +1,59 @@
+#!/usr/bin/perl -w
+
+# Usage: clean INFILE OUTFILE
+# Filters a FASTA library. Sequences are upper-cased, the ambiguity codes
+# WSYRMHKXDVB are turned into N, and a record is written to OUTFILE unless
+# it then contains a run of four N's. Empty, N-rich and otherwise invalid
+# records are reported on STDOUT and left out of OUTFILE.
+
+use strict;
+
+my $rule = "---------------------------------------------------------------------------------------\n";
+
+die "Usage: $0 INFILE OUTFILE\n" unless @ARGV >= 2;
+my ($infile, $outfile) = @ARGV;
+
+# Checked three-argument opens: an unreadable input must not silently
+# produce an empty output library.
+open(my $lib, "<", $infile) or die "Cannot read $infile: $!\n";
+open(my $clib, ">", $outfile) or die "Cannot write $outfile: $!\n";
+
+# Validate one record ($name keeps its trailing newline) and either write
+# it to the cleaned library or report on STDOUT why it was rejected.
+sub flush_record {
+    my ($name, $sequence) = @_;
+    return if $name eq "";    # no header seen yet
+    if ($sequence eq "") {
+        print $rule, "Empty sequence: " . $name . "\n";
+    }
+    elsif ($sequence =~ m/^[ACGTNWSYRMHKXDVB]*$/) {
+        $sequence =~ s/[WSYRMHKXDVB]/N/g;
+        if ($sequence =~ m/NNNN/) {
+            print $rule, "Too many N's: " . $name;
+        }
+        else {
+            print {$clib} $name, $sequence . "\n";
+        }
+    }
+    else {
+        $sequence =~ s/[ACGTN]//g;    # leave only the offending characters
+        print $rule, "Invalid entry: " . $name, "Reduced sequence:\n", $sequence . "\n";
+    }
+    return;
+}
+
+my ($name, $sequence) = ("", "");
+while (my $line = <$lib>) {
+    if ($line =~ m/^>/) {
+        flush_record($name, $sequence);
+        # Starting from an empty sequence also drops any text that
+        # precedes the first header instead of gluing it onto that record.
+        ($name, $sequence) = ($line, "");
+    }
+    else {
+        chomp $line;
+        $sequence .= uc($line);
+    }
+}
+flush_record($name, $sequence);    # the last record has no header after it
+close($clib) or die "Cannot finish writing $outfile: $!\n";