Question

Pysam & VCF: How to add sample information to a new record?

0

Entering edit mode

3.0 years ago

Thomas • 0

I have to correct and remove some fields in the info column of a VCF file. Pysam seems the logical choice, but I fail to write a new file. Adding SAMPLE to a pysam.VariantFile seems to be the issue.

Any insight highly appreciated!

Code

import pysam
import re

def build_new_header(vcf_header:pysam.VariantHeader) -> pysam.VariantHeader:
    """
    Convert Header to list.
    Splitting on newline causes an empty field at the end. Remove with list(filter(None))
    Remove all INFO lines not present in the new file
    Adding #CHROM causes an error, skip
    """
    tmp = list(filter(None,str(vcf_header).split("\n")))
    new_header = pysam.VariantHeader()
    for line in tmp:
        new_line = line
        if re.match('^##INFO=<ID=TRANSCRIPT', line):
            continue
        if re.search('^#CHROM', line):
            continue
        new_header.add_line(str(new_line))
    return new_header

def update_info(file_in: pysam.VariantFile, file_out: pysam.VariantFile, new_info_column: dict) -> None:
    """Change the INFO column, then write to new VCF"""
    for i in file_in:
        i.info.update(new_info_column)
        print(i.info)
        file_out.write(i)

path_vcf_in      = "test.sorted_bcf_2.vcf"
path_vcf_out     = "test.new.vcf"
vcf_in           = pysam.VariantFile(path_vcf_in, 'r')
new_header       = build_new_header(vcf_in.header)

vcf_out          = pysam.VariantFile(path_vcf_out,'w',header=new_header)
print(vcf_in.header)
print(vcf_out.header)
print("Formats:", list(vcf_out.header.formats))
### Change the INFO column, then write to new VCF
# update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })

### Create record, then add samples
# record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
# record.samples['42']['GT'] = (0,1)
# record.samples['42']['GQ'] = 53.29
# record.samples['42']['AD'] = 0,2
# record.samples['42']['DP'] = 2

test.sorted_bcf_2.vcf / vcf_in.header

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=TRANSCRIPT,Number=1,Type=String,Description="Canonical Transcript">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  42
1   752719  751991  A   C   54.29   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=Homozygous;TRANSCRIPT=ENST6666   GT:GQ:AD:DP 1/1:53.29:0,2:2
1   752721  71976   C   G   53.29   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:53.29:0,2:2
1   754182  70250   A   T   61.77   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:61.77:0,2:2
1   754192  7844    A   G   61.77   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:61.77:0,2:2

new_header / vcf_out.header. TRANSCRIPT is removed, sample ID (42) not present

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO

Error when running update_info

Formats: ['GT', 'GQ', 'AD', 'DP']
<pysam.libcbcf.VariantRecordInfo object at 0x7fec422b6528>
Traceback (most recent call last):
  File "post.py", line 39, in <module>
    update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })
  File "post.py", line 27, in update_info
    file_out.write(i)
  File "pysam/libcbcf.pyx", line 4400, in pysam.libcbcf.VariantFile.write
  File "pysam/libcbcf.pyx", line 4426, in pysam.libcbcf.VariantFile.write
ValueError: Invalid VariantRecord.  Number of samples does not match header (1 vs 0)

Error when trying to create new record

Formats: ['GT', 'GQ', 'AD', 'DP']
Traceback (most recent call last):
  File "post.py", line 42, in <module>
    record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
  File "pysam/libcbcf.pyx", line 4398, in pysam.libcbcf.VariantFile.new_record
  File "pysam/libcbcf.pyx", line 2096, in pysam.libcbcf.VariantHeader.new_record
  File "pysam/libcbcf.pyx", line 2864, in pysam.libcbcf.VariantRecordSamples.__getitem__
IndexError: invalid sample index

pysam python vcf • 3.5k views

ADD COMMENT • link updated 3.0 years ago by WouterDeCoster 47k • written 3.0 years ago by Thomas • 0

0

Entering edit mode

Maybe not a solution to your problem, and I haven't checked your use case in the documentation, but I like cyvcf2 for handling VCF files from python. I hope that module can be helpful for you...

ADD REPLY • link 3.0 years ago by WouterDeCoster 47k