Entering edit mode
3.0 years ago
Thomas
•
0
I have to correct and remove some fields in the info column of a VCF file. Pysam seems the logical choice, but I fail to write a new file. Adding SAMPLE to a pysam.VariantFile seems to be the issue.
Any insight highly appreciated!
Code
import pysam
import re
def build_new_header(vcf_header:pysam.VariantHeader) -> pysam.VariantHeader:
"""
Convert Header to list.
Splitting on newline causes an empty field at the end. Remove with list(filter(None))
Remove all INFO lines not present in the new file
Adding #CHROM causes an error, skip
"""
tmp = list(filter(None,str(vcf_header).split("\n")))
new_header = pysam.VariantHeader()
for line in tmp:
new_line = line
if re.match('^##INFO=<ID=TRANSCRIPT', line):
continue
if re.search('^#CHROM', line):
continue
new_header.add_line(str(new_line))
return new_header
def update_info(file_in: pysam.VariantFile, file_out: pysam.VariantFile, new_info_column: dict) -> None:
"""Change the INFO column, then write to new VCF"""
for i in file_in:
i.info.update(new_info_column)
print(i.info)
file_out.write(i)
path_vcf_in = "test.sorted_bcf_2.vcf"
path_vcf_out = "test.new.vcf"
vcf_in = pysam.VariantFile(path_vcf_in, 'r')
new_header = build_new_header(vcf_in.header)
vcf_out = pysam.VariantFile(path_vcf_out,'w',header=new_header)
print(vcf_in.header)
print(vcf_out.header)
print("Formats:", list(vcf_out.header.formats))
### Change the INFO column, then write to new VCF
# update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })
### Create record, then add samples
# record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
# record.samples['42']['GT'] = (0,1)
# record.samples['42']['GQ'] = 53.29
# record.samples['42']['AD'] = 0,2
# record.samples['42']['DP'] = 2
test.sorted_bcf_2.vcf / vcf_in.header
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=TRANSCRIPT,Number=1,Type=String,Description="Canonical Transcript">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 42
1 752719 751991 A C 54.29 PASS GENE_ID=.;GENE_NAME=.;GENOTYPE=Homozygous;TRANSCRIPT=ENST6666 GT:GQ:AD:DP 1/1:53.29:0,2:2
1 752721 71976 C G 53.29 PASS GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666; GT:GQ:AD:DP 1/1:53.29:0,2:2
1 754182 70250 A T 61.77 PASS GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666; GT:GQ:AD:DP 1/1:61.77:0,2:2
1 754192 7844 A G 61.77 PASS GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666; GT:GQ:AD:DP 1/1:61.77:0,2:2
new_header / vcf_out.header. TRANSCRIPT is removed, sample ID (42) not present
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM POS ID REF ALT QUAL FILTER INFO
Error when running update_info
Formats: ['GT', 'GQ', 'AD', 'DP']
<pysam.libcbcf.VariantRecordInfo object at 0x7fec422b6528>
Traceback (most recent call last):
File "post.py", line 39, in <module>
update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })
File "post.py", line 27, in update_info
file_out.write(i)
File "pysam/libcbcf.pyx", line 4400, in pysam.libcbcf.VariantFile.write
File "pysam/libcbcf.pyx", line 4426, in pysam.libcbcf.VariantFile.write
ValueError: Invalid VariantRecord. Number of samples does not match header (1 vs 0)
Error when trying to create new record
Formats: ['GT', 'GQ', 'AD', 'DP']
Traceback (most recent call last):
File "post.py", line 42, in <module>
record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
File "pysam/libcbcf.pyx", line 4398, in pysam.libcbcf.VariantFile.new_record
File "pysam/libcbcf.pyx", line 2096, in pysam.libcbcf.VariantHeader.new_record
File "pysam/libcbcf.pyx", line 2864, in pysam.libcbcf.VariantRecordSamples.__getitem__
IndexError: invalid sample index
Maybe not a solution to your problem, and I haven't checked your use case in the documentation, but I like cyvcf2 for handling VCF files from python. I hope that module can be helpful for you...