The following code I have developed with PYSAM looks for reads completely soft clipped
import pysam
import re
import sys
import os
if len(sys.argv) != 2:
print("Usage: python improve_bam.py input_bam_path")
sys.exit(1)
# Input BAM file path provided as a command-line argument
input_bam_path = sys.argv[1]
# Generate the output BAM file path
output_directory = os.path.dirname(input_bam_path)
input_filename = os.path.basename(input_bam_path)
output_filename = input_filename.replace(".bam", "_improved.bam")
output_bam_path = os.path.join(output_directory, output_filename)
# Additional parameters
FLAG = 4 # Set FLAG to indicate an unmapped read
RNAME = -1 # Set RNAME to -1 to indicate unmapped
MAPQ = 0
CIGAR = "*" # Set CIGAR to '*' to indicate no cigar string
RNEXT = -1 # Set RNEXT to -1 to indicate no "next read"
PNEXT = -1 # Set PNEXT to -1 to indicate no "next position"
TLEN = 0 # Set TLEN to 0 to indicate no template length
#POS = 0
print(f"Flag: {FLAG}")
print(f"MAPQ: {MAPQ}")
print(f"RNAME: {RNAME}")
print(f"RNEXT: {RNEXT}")
print(f"PNEXT: {PNEXT}")
print(f"TLEN: {TLEN}")
# Open input BAM file
input_bam = pysam.AlignmentFile(input_bam_path, "rb")
# Open output BAM file for writing
output_bam = pysam.AlignmentFile(output_bam_path, "wb", header=input_bam.header)
total_reads = 0
changed_reads = 0
for read in input_bam:
total_reads += 1
if not read.is_unmapped:
# Check if the CIGAR string matches the pattern: <number>S
cigar_string = read.cigarstring
if cigar_string and re.match(r'^\d+S$', cigar_string):
# Print the read before modification
print("Before modification:")
print(f"Flag: {read.flag}")
print(f"MAPQ: {read.mapping_quality}")
print(f"RNAME: {read.rname}")
print(f"RNEXT: {read.rnext}")
print(f"PNEXT: {read.pnext}")
print(f"TLEN: {read.tlen}")
print(f"POS: {read.pos}")
# Perform the necessary modifications
read.flag = FLAG
read.mapping_quality = MAPQ
read.rname = RNAME
read.rnext = RNEXT
read.pnext = PNEXT
read.tlen = TLEN
read.cigarstring = CIGAR
# read.pos = POS
if read.rname == -1:
read.is_unmapped = True
read.pos = 0
# Print the read after modification
print("After modification:")
print(f"Flag: {read.flag}")
print(f"MAPQ: {read.mapping_quality}")
print(f"RNAME: {read.rname}")
print(f"RNEXT: {read.rnext}")
print(f"PNEXT: {read.pnext}")
print(f"TLEN: {read.tlen}")
print(f"POS: {read.pos}")
changed_reads += 1
# Write the modified or unmodified read to the output BAM file
output_bam.write(read)
# Close the input and output BAM files
input_bam.close()
output_bam.close()
# Print the total and changed reads counts
print("Total reads in the BAM file:", total_reads)
print("Changed reads:", changed_reads)
This seems to work and a normal output is like this
Before modification:
Flag: 0
MAPQ: 0
RNAME: 16
RNEXT: -1
PNEXT: -1
TLEN: 0
POS: 27116631
After modification:
Flag: 4
MAPQ: 0
RNAME: -1
RNEXT: -1
PNEXT: -1
TLEN: 0
POS: 0
Apart from a new BAM file (called _improved.bam)
HOWEVER, (and this is what I don't understand) when I do a validation in the _improved.bam file I get this error
ERROR::INVALID_ALIGNMENT_START:Record 1699229, Read name ML-PT1-10:12:000H00CHL:1:13107:9070:8182, Alignment start should be 0 because reference name = *.
If I convert the 2 BAM files to a SAM format and I look at that specific read I get this
BEFORE change:
ML-PT1-10:12:000H00CHL:1:13107:9070:8182 256 chr17 7669612 0 151S * 0 0 CCTAAGAGCAATCAGTGGGGAACAAGAAGTGGAGAATGTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG AAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFA00AFFF0FFFFFFFFFAFFFFFFFFFFFFFF000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF SA:Z:chr2,32916401,+,113S38M,0,0; RG:Z:MiniSeq_TruSight_Tumor_15_FFPE_and_control_samples_-27687666_HD729_S5andS6_L001_R1_001 UQ:i:74 AS:i:30
After the change:
ML-PT1-10:12:000H00CHL:1:13107:9070:8182 4 * 1 0 * * 0 0 CCTAAGAGCAATCAGTGGGGAACAAGAAGTGGAGAATGTCATGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTGAAAAAAAAAAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG AAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFA00AFFF0FFFFFFFFFAFFFFFFFFFFFFFF000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF SA:Z:chr2,32916401,+,113S38M,0,0; RG:Z:MiniSeq_TruSight_Tumor_15_FFPE_and_control_samples_-27687666_HD729_S5andS6_L001_R1_001 UQ:i:74 AS:i:30
That "1" in the fourth field should be "0" shouldn't be?? Why this has not been modified? Why did PYSAM report to me that this was 0 when it is not?
To give some context, what I am looking for is to NULL these reads to PASS validation and also to not get an error when this BAM is used by PISCES.