Обновлено 24.03.2009 Автор: Administrator
Perl-скрипт для проведения выравнивания последовательностей в FASTA-файле
Простой скрипт для проведения выравнивания всех-против-всех последовательностей для одного FASTA-файла. Используется пакет EMBOSS для глобального выравнивания (алгоритм Нидлмана-Вунша) или локального выравнивания (алгоритм Смита-Уотермана). На выходе - матрица сходства между последовательностями в tab-delimited формате.
#
# given a FASTA file conducts all-vs-all alignment, using EMBOSS NEEDLE or WATER program
#
use Getopt::Long; #http://perldoc.perl.org/Getopt/Long.html
use Bio::SeqIO;
use Bio::SeqFeatureI;
use Tie::IxHash;
sub help{
print "Alignment\n";
print "Usage:\n";
print " -f FASTA file\n";
print " -m global or local alignment\n";
print " -r reverse each query sequence\n";
}
#--------------------- main part --------------------------------
# GLOBAL VARIABLES
$ALIGNMODE="global";
$OUT_FNAME=undef;
$TRY_REVERSE=undef;
$file=undef; #file to align
GetOptions ('f=s' => \$file,
'm=s' => \$ALIGNMODE, #possible: global,local
'reverse' => \$TRY_REVERSE, #reverse each query sequence.
'-out' => \$OUT_FNAME,
'help|?|h' => \$help );
if($help) { help(); exit;}
if(!($file)) { die "please specify -f FNAME"; }
if (-e $OUT_FNAME) { die "file $OUT_FNAME already exists\n"; }
# if (($TRY_REVERSE)&&(-e "$OUT_FNAME.rev")) { die "file $OUT_FNAME.rev already exists\n"; }
if ((!($ALIGNMODE) eq 'local')||(!($ALIGNMODE) eq 'global'))
{$ALIGNMODE="global"; print "Unknown alignment mode! Set to $ALIGNMODE\n";}
open(OUTFILE,">$OUTFILE");
# if($TRY_REVERSE) {open(OUTFILE_REV,">$OUTFILE.rev"); }
print "#using $ALIGNMODE alignment\n";
%SEQUENCES=(); # key - IDs, values - sequences
tie %SEQUENCES, "Tie::IxHash";
#read FASTA file in a hash
my $FastaFile = Bio::SeqIO->new( -file => $file, -format => 'fasta');
while (my $faaseq = $FastaFile->next_seq) {
my $descr = $faaseq->primary_id;
my $seq = $faaseq->seq;
if ($TRY_REVERSE) {
my $seqobj = Bio::Seq->new(-seq =>$seq);
my $rev=$seqobj->revcom; # via BioPerl - reverse and complement
$seq=$rev->primary_seq->seq;
} # via BioPerl - reverse and complement
$SEQUENCES{$descr}=$seq;
}
#print output header
foreach $key(keys %SEQUENCES){ print "$key\t"}; print "\n";
foreach $key(keys %SEQUENCES){ #main cycle: align each sequence with
%SCORE=();
tie %SCORE, "Tie::IxHash"; # to get keys in insertion order
%SCORE=%SEQUENCES; # key - sequence_ids, values - similarity
open (TMPFILE, ">$key".'.faa') or print("Error opening file"); #create temp file current sequence
print TMPFILE ">$key\n";
print TMPFILE "$SEQUENCES{$key}\n";
close(TMPFILE);
if($ALIGNMODE eq "global"){ #run Needleman-Wunsch global alignment from EMBOSS with default parameters
$cmdstr="needle $key.faa $file -gapopen 10 -gapextend 0.5 -outfile $key.tmp";
}
else { #run Smith-Waterman local alignment from EMBOSS with default parameters
$cmdstr="water $key.faa $file -gapopen 10 -gapextend 0.5 -outfile $key.tmp";
}
system($cmdstr);
open(INFILE,"$key.tmp"); #NB: reading entire file as a string
local $/ = undef;
$FILE=;
#regexp for parsing sequence IDs and identity from NEEDLE output
$regexp='#.*\n# 1: (\w*)\n# 2: (\w*)\n.*\n.*\n.*\n.*\n.*\n# Identity:\s*(\d*)/(\d*)';
while($FILE =~ m/$regexp/g){ #NB: parsing globally
#$id1=$1;
$id2=$2;
$ident=$3; # Warning: don't `last' out of this loop
$length=$4;
$SCORE{$id2}=$ident/$length;
}
close(INFILE);
foreach $key2(keys %SCORE) { print sprintf("%.4f",$SCORE{$key2}),"\t"};
print "\n";
$cmdstr="del $key.tmp"; system($cmdstr);
$cmdstr="del $key.faa"; system($cmdstr);
}
close(OUTFILE); if($TRY_REVERSE) { close(OUTFILE_REV); }
print "#Work is finished.";
< Предыдущая | Следующая > |
---|