#!/usr/bin/perl
#
# GenBank2fasta - reformat a GenBank-style FASTA file.
#
# Reads the FASTA file given with -i, shortens every header line and
# re-wraps every sequence at 80 columns, writing the result to STDOUT.
#
# Options:
#   -i file   input FASTA file (required; must exist)
#   -a        reduce each name to the 4th '|'-separated header field only
#   -g        append "|GC=NN.NN%" to each name
#   -r        emit the reverse complement of each sequence
#
# When no usable -i file is supplied the usage text is printed instead.

use strict;
use warnings;

# Output is accumulated and flushed once it grows past this many characters.
my $FLUSH_THRESHOLD = 10000;

# Number of bases written per output line.
my $LINE_WIDTH = 80;

my $in_file;
my $accession_only     = 0;    # -a
my $append_gc          = 0;    # -g
my $reverse_complement = 0;    # -r

for my $k (0 .. $#ARGV) {
    my $arg = $ARGV[$k];
    if ($arg eq '-a') {
        $accession_only = 1;
    }
    elsif ($arg eq '-g') {
        $append_gc = 1;
    }
    elsif ($arg eq '-r') {
        $reverse_complement = 1;
    }
    elsif ($arg eq '-i') {
        # The first -i whose following argument names an existing file wins.
        # The bounds check avoids reading past the end of @ARGV when -i is
        # the last argument.
        if (!defined $in_file && $k < $#ARGV && -e $ARGV[$k + 1]) {
            $in_file = $ARGV[$k + 1];
        }
    }
}

if (defined $in_file && length $in_file) {
    open(my $fh, '<', $in_file) or die("could not open $in_file!\n");

    my $buffer   = '';
    my $name;
    my $sequence = '';

    while (my $line = <$fh>) {
        chomp($line);
        $line =~ s/\r\z//;    # tolerate CRLF line endings

        if ($line =~ m/^>/) {
            # A new header closes the previous record (if it had any bases).
            if (length $sequence) {
                $buffer .= process($name, $sequence);
                if (length($buffer) > $FLUSH_THRESHOLD) {
                    print($buffer);
                    $buffer = '';
                }
            }
            $sequence = '';
            $name     = $line;
        }
        else {
            $sequence .= $line;
        }
    }

    # Flush the last record; skip it entirely when the file held no data so
    # that an empty input does not produce a bogus header.
    if (defined $name || length $sequence) {
        $buffer .= process($name, $sequence);
    }
    print($buffer);
    close($fh);
}
else {
    print("\nGenBank2fasta a script for processing GenBank files.\n\n");
    print("USAGE: GenBank2fasta -i file [-a] [-g] [-r]\n");
    print("-a\treduces names to GenBank accessions only\n");
    print("-g\tappends percent GC content to each name\n");
    print("-i\tspecifies a GenBank fasta file (required)\n");
    print("-r\tproduces the reverse complement of each sequence\n\n");
}

exit(0);

# process($name, $sequence)
#
# Build one output FASTA record.
#
#   $name      the raw header line (including the leading '>'); it is split
#              on '|' and fields 4 and 5 (indices 3 and 4) are used.
#   $sequence  the concatenated sequence text for the record.
#
# Returns the record as a string: a '>' header line followed by the
# sequence wrapped at $LINE_WIDTH columns, terminated by a newline.
# Honours the -a, -g and -r flags captured from the command line.
sub process {
    my ($name, $raw_sequence) = @_;

    my @labels = split(/\|/, defined $name ? $name : '');
    my $accession = defined $labels[3] ? $labels[3] : '';

    my $output = '>';
    if ($accession_only) {
        $output .= $accession;
    }
    else {
        # Field 5 starts with a space, so element 0 of the split is empty
        # and elements 1 and 2 are the first two words of the description.
        my @bits = split(/ /, defined $labels[4] ? $labels[4] : '');
        $output .= join('_',
            map { defined $_ ? $_ : '' } (@bits[1, 2], $accession));
    }

    # Upper-case and drop every character that is not a nucleotide code.
    my $sequence = uc(defined $raw_sequence ? $raw_sequence : '');
    $sequence =~ tr/ACGTNVDBHWMRKSY//cd;

    if ($append_gc) {
        # Counts C, G and every ambiguity code in this list; tr/// with an
        # empty replacement list only counts and leaves the string intact.
        my $gc_count = ($sequence =~ tr/CGNVDBHMRKSY//);
        my $total    = length($sequence);
        # Divide by the true length; guard the empty-sequence case rather
        # than skewing every result with a +1 in the denominator.
        my $gc_percent = $total ? 100 * ($gc_count / $total) : 0;
        $output .= '|GC=' . sprintf("%.2f", $gc_percent) . '%';
    }

    if ($reverse_complement) {
        $sequence =~ tr/ACGTNVDBHWMRKSY/TGCANBHVDWKYMSR/;
        $sequence = scalar reverse($sequence);
    }

    # Header, then one line per chunk of up to $LINE_WIDTH bases; an empty
    # sequence yields just the header line.
    my @chunks = ($sequence =~ m/.{1,$LINE_WIDTH}/g);
    return join("\n", $output, @chunks) . "\n";
}