|
use threads; |
|
use Thread::Queue; |
|
# use YourUtils qw/logmsg getNumCPUs/; |
|
my $seqs=qualityTrimFastq("raw.fastq",{numcpus=>4,min_avg_quality=>20}); |
|
for(@$seqs){ |
|
print $_; # => an entire 8-line entry ($id$seq\n+\n$qual\n, for each of the two reads in the pair) |
|
} |
|
|
|
# reads a shuffled paired end fastq and trims it |
|
# and cleans it |
|
# Author: Lee Katz <lkatz@cdc.gov> |
|
sub qualityTrimFastq($;$){ |
|
my($fastq,$settings)=@_; |
|
$$settings{numcpus}||=LKUtils::getNumCPUs(); |
|
# trimming |
|
$$settings{min_quality}||=30; |
|
$$settings{bases_to_trim}||=10; # max number of bases that will be trimmed on either side |
|
# cleaning |
|
$$settings{min_avg_quality}||=20; |
|
$$settings{min_avg_quality_window}||=70; |
|
$$settings{min_length}||=62; # twice hash length sounds good |
|
|
|
my (@t,$t); |
|
my $Q=Thread::Queue->new; |
|
for(0..$$settings{numcpus}-1){ |
|
$t[$t++]=threads->create(sub{ |
|
logmsg "Launching thread TID".threads->tid; |
|
my(@entryOut); |
|
while(defined(my $entry=$Q->dequeue)){ |
|
my($id1,$seq1,undef,$qual1,$id2,$seq2,undef,$qual2)=split(/\s*\n\s*/,$entry); |
|
my %read1=(id=>$id1,seq=>$seq1,qual=>$qual1,length=>length($seq1)); |
|
my %read2=(id=>$id2,seq=>$seq2,qual=>$qual2,length=>length($seq2)); |
|
# quality trim: read 1, 3' read |
|
my($numToTrim3,$numToTrim5,@qual); |
|
$numToTrim3=$numToTrim5=0; |
|
@qual=map(ord($_)-33,split(//,$read1{qual})); |
|
for(my $i=0;$i<$$settings{bases_to_trim};$i++){ |
|
$numToTrim3++ if($qual[$i]<$$settings{min_quality}); |
|
last if($qual[$i]>=$$settings{min_quality}); |
|
} |
|
|
|
$read1{seq}=substr($read1{seq},$numToTrim3,$read1{length}-$numToTrim3-$numToTrim5); |
|
$read1{qual}=substr($read1{qual},$numToTrim3,$read1{length}-$numToTrim3-$numToTrim5); |
|
# quality trim: read 1, 5' read |
|
$numToTrim3=$numToTrim5=0; |
|
@qual=map(ord($_)-33,split(//,$read2{qual})); |
|
for(my $i=0;$i<$$settings{bases_to_trim};$i++){ |
|
$numToTrim3++ if($qual[$i]<$$settings{min_quality}); |
|
last if($qual[$i]>=$$settings{min_quality}); |
|
} |
|
for(my $i=$read2{length}-$$settings{bases_to_trim};$i<@qual;$i++){ |
|
$numToTrim5++ if($qual[$i]<$$settings{min_quality}); |
|
last if($qual[$i]>=$$settings{min_quality}); |
|
} |
|
$read2{seq}=substr($read2{seq},$numToTrim3,$read2{length}-$numToTrim3-$numToTrim5); |
|
$read2{qual}=substr($read2{qual},$numToTrim3,$read2{length}-$numToTrim3-$numToTrim5); |
|
|
|
#cleaning stage |
|
# TODO keep singletons |
|
next if(length($read1{seq})<$$settings{min_length}); |
|
next if(length($read2{seq})<$$settings{min_length}); |
|
# avg quality |
|
my $qual_is_good=1; |
|
QUALITY_CHECK: |
|
for my $qualStr ($read1{qual},$read2{qual}){ |
|
my $length=length($qualStr)-$$settings{min_avg_quality_window};; |
|
my @qual=map(ord($_)-33,split(//,$qualStr)); |
|
for(my $i=0;$i<$length;$i++){ |
|
my @subQual=@qual[$i..($$settings{min_avg_quality_window}+$i-1)]; |
|
my $avgQual=sum(@subQual)/$$settings{min_avg_quality_window}; |
|
if($avgQual<$$settings{min_avg_quality}){ |
|
#print "$i: ".join(".",@subQual)."\n$avgQual <=> $$settings{min_avg_quality}\n";die; |
|
$qual_is_good=0; |
|
last QUALITY_CHECK; |
|
} |
|
} |
|
} |
|
next if(!$qual_is_good); |
|
|
|
my $entryOut="$read1{id}\n$read1{seq}\n+\n$read1{qual}\n$read2{id}\n$read2{seq}\n+\n$read2{qual}\n"; |
|
push(@entryOut,$entry); |
|
} |
|
return \@entryOut; |
|
}); |
|
} |
|
|
|
my $entryCount=0; |
|
open(FQ,'<',$fastq) or die "Could not open $fastq for reading: $!"; |
|
while(my $entry=<FQ>){ |
|
$entry.=<FQ> for(1..7); # 8 lines total for paired end entry |
|
$Q->enqueue($entry); |
|
$entryCount++; |
|
logmsg "Finished loading $entryCount pairs" if($entryCount%100000==0); |
|
#if($entryCount>99999){ |
|
# logmsg "DEBUG: only $entryCount loaded"; |
|
# last; |
|
#} |
|
} |
|
close FQ; |
|
$Q->enqueue(undef) for(1..$$settings{numcpus}); |
|
|
|
# status update |
|
while(my $p=$Q->pending){ |
|
logmsg "$p entries left to process"; |
|
sleep 10; |
|
} |
|
|
|
my @entry; |
|
for(@t){ |
|
my $tmp=$_->join; |
|
push(@entry,@$tmp); |
|
} |
|
|
|
my $numCleaned=$entryCount-@entry; |
|
logmsg "$numCleaned entries out of $entryCount removed due to low quality"; |
|
return \@entry; |
|
} |
what's wrong with the stuff in fastx toolkit?
I'd also suggest that whilst the code golf is fine, and posting code to ask for help/suggestions is fine, dumping code snippets here when there are a dozen implementations elsewhere is not what BioStar should be used for. Stick it on github.
I don't know who Richard Mott is but it would be helpful if you describe algorithms in English, e.g. "this trims all base pairs to the right of the first position where quality drops more than 'limit'"
It would be better to choose an existing software to do Illumina trimming rather than reinventing the wheel. Considering up-to-date precision and speed benchmarks, I'd recommend atria.