Below is a Python API solution using the pyvcf
submodule from the fuc
package I wrote.
Imagine you have the following data:
>>> from fuc import pyvcf
>>> data = {
... 'CHROM': ['chr1', 'chr1', 'chr1', 'chr1'],
... 'POS': [100, 101, 102, 103],
... 'ID': ['.', '.', '.', '.'],
... 'REF': ['G', 'T', 'A', 'A'],
... 'ALT': ['A', 'C', 'T', 'C'],
... 'QUAL': ['.', '.', '.', '.'],
... 'FILTER': ['.', '.', '.', '.'],
... 'INFO': ['.', '.', '.', '.'],
... 'FORMAT': ['GT:GQ:DP', 'GT:GQ:DP', 'GT:GQ:DP', 'GT'],
... 'A': ['0/0:48:3', '0/0:19:3', './.:.:.', '0/1'],
... 'B': ['0/1:3:5', '0/1:20:8', '0/0:29:12', '0/0'],
... 'C': ['0/0:16:3', '1/0:32:8', '0/1:25:9', '0/0'],
... }
>>> vf = pyvcf.VcfFrame.from_dict([], data)
>>> vf.df
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C
0 chr1 100 . G A . . . GT:GQ:DP 0/0:48:3 0/1:3:5 0/0:16:3
1 chr1 101 . T C . . . GT:GQ:DP 0/0:19:3 0/1:20:8 1/0:32:8
2 chr1 102 . A T . . . GT:GQ:DP ./.:.:. 0/0:29:12 0/1:25:9
3 chr1 103 . A C . . . GT 0/1 0/0 0/0
We can then define a custom method (one_row
) to be applied to each row to convert genotypes calls with GQ < 20 to missing:
>>> def one_row(r):
... format_list = r.FORMAT.split(':')
... # This method automatically finds the appropriate missing value ('./.', '.', './.:.:.', etc.).
... missval = pyvcf.row_missval(r)
...
... if 'GQ' in format_list:
... i = format_list.index('GQ')
... else:
... # This row doesn't have GQ, return the entire row as is.
... return r
...
... def one_gt(g):
... gq = g.split(':')[i]
...
... if gq.isnumeric():
... gq = int(gq)
... else:
... # This genotype doesn't have a numeric GQ value (probably has missing value '.'), keep it as is.
... return g
...
... if gq >= 20:
... # This genotype has GQ >= 20, keep it as is.
... return g
... else:
... return missval
...
... r[9:] = r[9:].apply(one_gt)
...
... return r
...
We then apply the method:
>>> vf.df = vf.df.apply(one_row, axis=1)
>>> vf.df
CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C
0 chr1 100 . G A . . . GT:GQ:DP 0/0:48:3 ./.:.:. ./.:.:.
1 chr1 101 . T C . . . GT:GQ:DP ./.:.:. 0/1:20:8 1/0:32:8
2 chr1 102 . A T . . . GT:GQ:DP ./.:.:. 0/0:29:12 0/1:25:9
3 chr1 103 . A C . . . GT 0/1 0/0 0/0
Note that you can easily read and write VCF files too:
vf = pyvcf.VcfFrame.from_file('in.vcf')
vf.to_file('out.vcf')
Let me know in the comment if you have any questions.