Hey guys, I've been trying to figure out why this script has been running painfully slowly. It takes a 256 GB node about 14 hours to produce 3051 results, which definitely shouldn't be the case. I was hoping you guys could look at it and see if a) any of the changes I've made are screwing it up and b) if there's anywhere it could be doing something superfluous that is causing it to take so very long. We've done much more intense things much more quickly so it definitely shouldn't be taking this long, so it's almost certainly a problem with the script:
#!/usr/bin/env perl
###############################################################################
#
# calc.dnds.pl
#
# Calculates pairwise dn/ds ratios of all input sequences. Note: the sequences
# have to be alligned and in fasta format. In addition its assumed they start in
# frame.
#
# Copyright (C) 2012 Mads Albertsen
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
#pragmas
use strict;
use warnings;
#core Perl modules
use Getopt::Long;
#Bioperl modules
use Bio::Seq;
use Bio::AlignIO;
use Bio::Align::DNAStatistics;
use Bio::SeqIO;
#locally-written modules
BEGIN {
select(STDERR);
$| = 1;
select(STDOUT);
$| = 1;
}
# get input params
my $global_options = checkParams();
my $inputfile;
my $outputfile;
my $tempheader;
my %header;
my %sequence;
$inputfile = &overrideDefault("inputfile.txt",'inputfile');
$outputfile = &overrideDefault("outputfile.txt",'outputfile');
######################################################################
# CODE HERE
######################################################################
### declare 2 position variables for sequences to compare (to be incremented by 2 each loop)
my $i1 = 1;
my $i2 = 2;
# use Bio::AlignIO to read in the alignment
my $str = Bio::AlignIO->new('-file' => $inputfile); #To get the length of the sequences. (Assumes that tey are all equal length)
my $aln = $str->next_aln();
my $seqlength = $aln->length/3;
my $stats = Bio::Align::DNAStatistics->new(); #Create the stats variable.
my $in = Bio::AlignIO->new(-format => 'fasta', -file => $inputfile); #Add the input fasta file.
my $alnobj = $in->next_aln;
### open file specified in command line for writing (if omitted on command line, "output.txt" will be used.
open (MYFILE, '>>'.$outputfile);
### print column headers one time at top of output. move inside loop to print above each individual result
print MYFILE "comparison\tAA.length\tdN/dS\t#dN\t#dS\t%AA.sim\n";
### start loop using total pairs of sequences to be compared from getpairs sub as number of times to loop
for (my $i=1;$i<=972793;$i++){
my $alnobj2 = $alnobj->select($i1, $i2); #Add objects that need to be aligned. By taking each sequence from the in file one by one.
my ($seq1id,$seq2id) = map { $_->display_id } $alnobj2->each_seq;
#create the combinations of sequences that need to be compared.
### changed from calc all stats to only one pair
my $results = $stats->calc_KaKs_pair($alnobj2,$seq1id,$seq2id); #Calculate all stats.
my %Nd;
my %Sd;
for my $an (@$results){ #Get the results that are needed for the output
for (sort keys %$an ){
next if /Seq/;
if ($_ eq "N_d"){
my $tempcomp = $an->{'Seq1'}." vs ".$an->{'Seq2'};
$Nd{$tempcomp} = $an->{$_};
}
if ($_ eq "S_d"){
my $tempcomp = $an->{'Seq1'}." vs ".$an->{'Seq2'};
$Sd{$tempcomp} = $an->{$_};
}
}
foreach my $key (sort keys %Nd){
my $tempcalc2 = (1-$Nd{$key}/$seqlength)*100;
print MYFILE "$key\t$seqlength\t",sprintf("%.2f","\t",sprintf("%.0f",$Nd{$key}),"\t",sprintf("%.0f",$Sd{$key}),"\t",sprintf("%.1f",$tempcalc2),"\n";
}
### increment positions to next pair
$i1 = $i1 + 2;
$i2 = $i2 + 2;
}
### close output file handle
close (MYFILE);
######################################################################
# TEMPLATE SUBS
######################################################################
sub checkParams {
#-----
# Do any and all options checking here...
#
my @standard_options = ( "help|h+", "inputfile|i:s", "outputfile|o:s");
my %options;
# Add any other command line options, and the code to handle them
#
GetOptions( \%options, @standard_options );
#if no arguments supplied print the usage and exit
#
exec("pod2usage $0") if (0 == (keys (%options) ));
# If the -help option is set, print the usage and exit
#
exec("pod2usage $0") if $options{'help'};
# Compulsosy items
#if(!exists $options{'infile'} ) { print "**ERROR: $0 : \n"; exec("pod2usage $0"); }
return \%options;
}
sub overrideDefault
{
#-----
# Set and override default values for parameters
#
my ($default_value, $option_name) = @_;
if(exists $global_options->{$option_name})
{
return $global_options->{$option_name};
}
return $default_value;
}
sub getpairs
{
my $seq_in = Bio::SeqIO->new(
-format => 'fasta',
-file => $inputfile,
);
# loads the whole file into memory - be careful
# if this is a big file, then this script will
# use a lot of memory
my $seq;
my @seq_array;
while( $seq = $seq_in->next_seq() ) {
push(@seq_array,$seq);
}
# count elements
# @seq_array = sort { $a->length <=> $b->length } @seq_array;
my $count = 0;
foreach my $seq ( @seq_array ) {
$count++;
}
#dispose of array to free up resources
undef @seq_array;
return $count/2;
}
__DATA__
=head1 NAME
calc.dsdn.pl
=head1 COPYRIGHT
copyright (C) 2012 Mads Albertsen
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 DESCRIPTION
=head1 SYNOPSIS
script.pl -i [-h]
[-help -h] Displays this basic usage information
[-inputfile -i] Aligned 'in frame' fasta file.
=cut
I appreciate any help you can give, thanks very much!
Sounds silly, but how large is your file? In the
getpairs
subroutine it says:Hey, the file is a little over a gig. I have however removed the getpairs sub from the actual code. He had it in the original script but its been removed.
Could you highlight then please in your code above which parts have been edited? Maybe highlight it in yellow. It would make troubleshooting a lot easier as the original script shouldn't be affected by your problem?!
Hey it's not letting me highlight more than one change but I added the
and removed a
tempcalc
. The for loop is a replacement for the getpairs sub. And it allowed me to grab the first two sequences as opposed to comparing one sequence against every sequence. Originally, it'd go 1 compared to 2, 3, 4, 5, 6, etc as opposed to 1 compared to 2, 3 compared to 4, 5 compared to 6 which is what I wanted. I'm sorry if this is asinine, I've just begun learning perl.And yeah, the original script wasn't designed for such large files and it functioned slightly differently. The original script also had the memory dump problem.