Выравнивание все-против-всех

Perl-скрипт для проведения выравнивания последовательностей в FASTA-файле 

 

Простой скрипт для проведения выравнивания всех-против-всех последовательностей для одного FASTA-файла.  Используется пакет EMBOSS для глобального выравнивания (алгоритм Нидлмана-Вунша) или локального выравнивания (алгоритм Смита-Уотермана). На выходе - матрица сходства между последовательностями в tab-delimited формате. 

 

#
# given a FASTA file conducts all-vs-all alignment, using EMBOSS NEEDLE or WATER program
#
use Getopt::Long; #http://perldoc.perl.org/Getopt/Long.html
use Bio::SeqIO;
use Bio::SeqFeatureI;
use Tie::IxHash;


sub help{
print "Alignment\n";
print "Usage:\n";
print " -f FASTA file\n";
print " -m global or local alignment\n";
print " -r reverse each query sequence\n";

}

#--------------------- main part --------------------------------
# GLOBAL VARIABLES

$ALIGNMODE="global";
$OUT_FNAME=undef;
$TRY_REVERSE=undef;
$file=undef; #file to align
GetOptions ('f=s' => \$file,
'm=s' => \$ALIGNMODE, #possible: global,local
'reverse' => \$TRY_REVERSE, #reverse each query sequence.
'-out' => \$OUT_FNAME,
'help|?|h' => \$help );
if($help) { help(); exit;}
if(!($file)) { die "please specify -f FNAME"; }
if (-e $OUT_FNAME) { die "file $OUT_FNAME already exists\n"; }
# if (($TRY_REVERSE)&&(-e "$OUT_FNAME.rev")) { die "file $OUT_FNAME.rev already exists\n"; }

if ((!($ALIGNMODE) eq 'local')||(!($ALIGNMODE) eq 'global'))
{$ALIGNMODE="global"; print "Unknown alignment mode! Set to $ALIGNMODE\n";}


open(OUTFILE,">$OUTFILE");
# if($TRY_REVERSE) {open(OUTFILE_REV,">$OUTFILE.rev"); }
print "#using $ALIGNMODE alignment\n";

%SEQUENCES=(); # key - IDs, values - sequences
tie %SEQUENCES, "Tie::IxHash";

#read FASTA file in a hash
my $FastaFile = Bio::SeqIO->new( -file => $file, -format => 'fasta');
while (my $faaseq = $FastaFile->next_seq) {
my $descr = $faaseq->primary_id;
my $seq = $faaseq->seq;
if ($TRY_REVERSE) {
my $seqobj = Bio::Seq->new(-seq =>$seq);
my $rev=$seqobj->revcom; # via BioPerl - reverse and complement
$seq=$rev->primary_seq->seq;
} # via BioPerl - reverse and complement
$SEQUENCES{$descr}=$seq;
}

#print output header
foreach $key(keys %SEQUENCES){ print "$key\t"}; print "\n";

foreach $key(keys %SEQUENCES){ #main cycle: align each sequence with
%SCORE=();
tie %SCORE, "Tie::IxHash"; # to get keys in insertion order
%SCORE=%SEQUENCES; # key - sequence_ids, values - similarity

open (TMPFILE, ">$key".'.faa') or print("Error opening file"); #create temp file current sequence
print TMPFILE ">$key\n";
print TMPFILE "$SEQUENCES{$key}\n";
close(TMPFILE);

if($ALIGNMODE eq "global"){ #run Needleman-Wunsch global alignment from EMBOSS with default parameters
$cmdstr="needle $key.faa $file -gapopen 10 -gapextend 0.5 -outfile $key.tmp";
}
else { #run Smith-Waterman local alignment from EMBOSS with default parameters
$cmdstr="water $key.faa $file -gapopen 10 -gapextend 0.5 -outfile $key.tmp";
}
system($cmdstr);

open(INFILE,"$key.tmp"); #NB: reading entire file as a string
local $/ = undef;
$FILE=;

#regexp for parsing sequence IDs and identity from NEEDLE output
$regexp='#.*\n# 1: (\w*)\n# 2: (\w*)\n.*\n.*\n.*\n.*\n.*\n# Identity:\s*(\d*)/(\d*)';

while($FILE =~ m/$regexp/g){ #NB: parsing globally
#$id1=$1;
$id2=$2;
$ident=$3; # Warning: don't `last' out of this loop
$length=$4;
$SCORE{$id2}=$ident/$length;
}
close(INFILE);

foreach $key2(keys %SCORE) { print sprintf("%.4f",$SCORE{$key2}),"\t"};
print "\n";

$cmdstr="del $key.tmp"; system($cmdstr);
$cmdstr="del $key.faa"; system($cmdstr);
}

close(OUTFILE); if($TRY_REVERSE) { close(OUTFILE_REV); }
print "#Work is finished.";