Hi,
I have amplicon sequencing data from Illumina TrueSight Myeloid cancer panel. I have exact coordinates of each amplicon and primer sequences. And I have implemented Java code using htsjdk.samtools library to strip primer sequences from paired end mapped data.
The idea is that if there are overlapping amplicons, then primer sequences would possibly interfere with any mutation detection in this region.
My only fear is that my code is unrevised and prone to bugs that might be very silent. Especially when dealing with insertions/deletions within the region of clipping. Is there some tool or data set to test my code against?
Thanks Vojtech.
P.S: I am enclosing part of my code to softclip part of aligned read.
/**
* Mask samRecord up to position from
* @param samRecord
* @param from
* @return Clipping indel score. Number of insertions/deletions in clipped region.
* @throws InvalidActivityException
* @throws InvalidAlgorithmParameterException
*/
private int maskSoftStart(SAMRecord samRecord, int from) throws InvalidActivityException, InvalidAlgorithmParameterException
{
int indelScore = 0;
Cigar c = samRecord.getCigar();
List<CigarElement> cel = c.getCigarElements();
int cigarPosition = samRecord.getUnclippedStart() - 1;
Cigar newcigar;
List<CigarElement> newcigarelm = new ArrayList<CigarElement>();
if(samRecord.getAlignmentStart() - 1 >= from)
{
log.info("Nothing to mask");
return indelScore;
}
boolean containsValidOperator = false;
int adjustClippedStart = from;
int softClippingLength = 0;
for(int i = 0; i != cel.size(); i++)
{
CigarElement ce = cel.get(i);
CigarOperator co = ce.getOperator();
int length = ce.getLength();
if(length > 0)
{
switch(co)
{
case D:
if(cigarPosition >= from)
{
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
newcigarelm.add(ce);
containsValidOperator = true;
}else if(from - cigarPosition < length)
{
int x = from - cigarPosition;
indelScore += x;
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
newcigarelm.add(new CigarElement(length - x, CigarOperator.D));
containsValidOperator = true;
}else
{
indelScore += length;
}
cigarPosition += length;
break;
case I:
if(cigarPosition >= from)
{
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
newcigarelm.add(ce);
containsValidOperator = true;
}else
{
indelScore += length;
softClippingLength += length;
}
break;
case M:
if(cigarPosition >= from)
{
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
newcigarelm.add(ce);
containsValidOperator = true;
}else if(from - cigarPosition >= length)
{
softClippingLength += length;
}else
{
int x = from - cigarPosition;
softClippingLength += x;
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
newcigarelm.add(new CigarElement(length - x, CigarOperator.M));
softClippingLength = 0;
containsValidOperator = true;
}
cigarPosition += length;
break;
case S:
if(cigarPosition >= from)
{
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength + length, CigarOperator.S));
softClippingLength = 0;
}else
{
newcigarelm.add(ce);
}
}else
{
softClippingLength += length;
if(from - cigarPosition <= length)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
}
cigarPosition += length;
break;
case H:
if(cigarPosition >= from)//on the end
{
if(softClippingLength != 0)
{
newcigarelm.add(new CigarElement(softClippingLength, CigarOperator.S));
softClippingLength = 0;
}
newcigarelm.add(ce);
}else
{//On the beginning
newcigarelm.add(ce);
}
cigarPosition += length;
break;
case N:
case P:
case EQ:
case X:
throw new InvalidActivityException("Unsupported token " + co.name() + ".");
}
}
}
if(softClippingLength != 0)
{
throw new InvalidActivityException("Soft clipping inconsistency.");
}
newcigar = new Cigar(newcigarelm);
samRecord.setCigar(newcigar);
samRecord.setAlignmentStart(adjustClippedStart+1);
if(!containsValidOperator)
{
throw new InvalidAlgorithmParameterException("Whole cigar masking");
}
return indelScore;
}
Have a look at skewer (or any other published trimmer). It is made to trim adapters from NGS data, no need to implement these NGS standard things yourself.