this does not necessarily answer your question, but becoming familiar with parsing CIGAR is sometimes quite useful
Here is a short code that can help (js/typescript)
https://cmdcolin.github.io/posts/2022-02-06-sv-sam#appendix-b---cigar-parsing
// @param cigar: CIGAR string in text form
// @returns an array of elements like ['30','M', '2','I', '50','M', '40','D'] which you can consume in a loop two elements at a time
function parseCigar(cigar: string) {
return cigar.split(/([MIDNSHPX=])/)
}
// this function does nothing, but is informative for how to parse interpret a
// CIGAR string
// @param cigar:CIGAR string from record
// @param readSeq: the SEQ from record
// @param refSeq: the reference sequence underlying the read
function interpretCigar(cigar: string, readSeq: string, refSeq: string) {
const opts = parseCigar(cigar)
let qpos = 0 // query position, position on the read
let tpos = 0 // target position, position on the reference sequence
for (let i = 0; i < ops.length; i += 2) {
const length = +opts[i]
const operator = opts[i + 1]
// do things. refer to the CIGAR chart in SAMv1.pdf for which operators
// "consume reference" to see whether to increment
if (op === 'M' || op === '=') {
// matches consume query and reference
const refMatch = refSeq.slice(tpos, tpos + len)
const readMatch = readSeq.slice(qpos, qpos + len)
for (let i = 0; i < len; i++) {
if (refMatch[i] !== readMatch[i]) {
// SNP at this position
}
}
qpos += len
tpos += len
}
if (op === 'I') {
// insertions only consume query
// sequence of the insertion from the read is
const insSeq = readSeq.slice(qpos, qpos + len)
qpos += len
}
if (op === 'D') {
// deletions only consume reference
// sequence of the deletion from the reference is
const delSeq = refSeq.slice(tpos, tpos + len)
tpad += len
}
if (op === 'N') {
// skips only consume reference
// skips are similar to deletions but are related to spliced alignments
tpad += len
}
if (op === 'X') {
// mismatch using the extended CIGAR format
// could lookup the mismatch letter in a string containing the reference
const mismatch = refSeq.slice(tpos, tpos + len)
qpos += len
tpos += len
}
if (op === 'H') {
// does not consume query or reference
// hardclip is just an indicator
}
if (op === 'S') {
// softclip consumes query
// below gets the entire soft clipped portion
const softClipStr = readSeq.slice(qpos, qpos + len)
qpos += len
}
}
}
Not exactly what you were looking for, but take a look here. It's a script I use for extracting large insertions based on CIGAR strings (in PAF files in my case). Could be a starting point for writing your own script (which sounds pretty useful BTW).