Pysam & VCF: How to add sample information to a new record?
0
0
Entering edit mode
3.0 years ago
Thomas • 0

I have to correct and remove some fields in the info column of a VCF file. Pysam seems the logical choice, but I fail to write a new file. Adding SAMPLE to a pysam.VariantFile seems to be the issue.

Any insight highly appreciated!

Code

import pysam
import re

def build_new_header(vcf_header:pysam.VariantHeader) -> pysam.VariantHeader:
    """
    Convert Header to list.
    Splitting on newline causes an empty field at the end. Remove with list(filter(None))
    Remove all INFO lines not present in the new file
    Adding #CHROM causes an error, skip
    """
    tmp = list(filter(None,str(vcf_header).split("\n")))
    new_header = pysam.VariantHeader()
    for line in tmp:
        new_line = line
        if re.match('^##INFO=<ID=TRANSCRIPT', line):
            continue
        if re.search('^#CHROM', line):
            continue
        new_header.add_line(str(new_line))
    return new_header

def update_info(file_in: pysam.VariantFile, file_out: pysam.VariantFile, new_info_column: dict) -> None:
    """Change the INFO column, then write to new VCF"""
    for i in file_in:
        i.info.update(new_info_column)
        print(i.info)
        file_out.write(i)

path_vcf_in      = "test.sorted_bcf_2.vcf"
path_vcf_out     = "test.new.vcf"
vcf_in           = pysam.VariantFile(path_vcf_in, 'r')
new_header       = build_new_header(vcf_in.header)

vcf_out          = pysam.VariantFile(path_vcf_out,'w',header=new_header)
print(vcf_in.header)
print(vcf_out.header)
print("Formats:", list(vcf_out.header.formats))
### Change the INFO column, then write to new VCF
# update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })

### Create record, then add samples
# record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
# record.samples['42']['GT'] = (0,1)
# record.samples['42']['GQ'] = 53.29
# record.samples['42']['AD'] = 0,2
# record.samples['42']['DP'] = 2

test.sorted_bcf_2.vcf / vcf_in.header

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=TRANSCRIPT,Number=1,Type=String,Description="Canonical Transcript">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  42
1   752719  751991  A   C   54.29   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=Homozygous;TRANSCRIPT=ENST6666   GT:GQ:AD:DP 1/1:53.29:0,2:2
1   752721  71976   C   G   53.29   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:53.29:0,2:2
1   754182  70250   A   T   61.77   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:61.77:0,2:2
1   754192  7844    A   G   61.77   PASS    GENE_ID=.;GENE_NAME=.;GENOTYPE=.;TRANSCRIPT=ENST6666;   GT:GQ:AD:DP 1/1:61.77:0,2:2

new_header / vcf_out.header. TRANSCRIPT is removed, sample ID (42) not present

##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##assembly=GRCh37
##contig=<ID=1>
##INFO=<ID=GENE_ID,Number=1,Type=Integer,Description="ID">
##INFO=<ID=GENE_NAME,Number=1,Type=String,Description="Official gene name">
##INFO=<ID=GENOTYPE,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Heterozygous, Hemizygous, Homozygous">
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Score of the quality of the read of the variant.">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed.">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Number of good quality reads.">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO

Error when running update_info

Formats: ['GT', 'GQ', 'AD', 'DP']
<pysam.libcbcf.VariantRecordInfo object at 0x7fec422b6528>
Traceback (most recent call last):
  File "post.py", line 39, in <module>
    update_info(vcf_in,vcf_out,{"GENE_ID":666, "GENE_NAME":"Luciferase" })
  File "post.py", line 27, in update_info
    file_out.write(i)
  File "pysam/libcbcf.pyx", line 4400, in pysam.libcbcf.VariantFile.write
  File "pysam/libcbcf.pyx", line 4426, in pysam.libcbcf.VariantFile.write
ValueError: Invalid VariantRecord.  Number of samples does not match header (1 vs 0)

Error when trying to create new record

Formats: ['GT', 'GQ', 'AD', 'DP']
Traceback (most recent call last):
  File "post.py", line 42, in <module>
    record = vcf_out.new_record(contig="1", pos=752719, id="71976", ref="A", alts="G", qual=53.29, filter="PASS", info={"GENE_ID":666,"GENE_NAME":"Luciferase"})
  File "pysam/libcbcf.pyx", line 4398, in pysam.libcbcf.VariantFile.new_record
  File "pysam/libcbcf.pyx", line 2096, in pysam.libcbcf.VariantHeader.new_record
  File "pysam/libcbcf.pyx", line 2864, in pysam.libcbcf.VariantRecordSamples.__getitem__
IndexError: invalid sample index
pysam python vcf • 3.5k views
ADD COMMENT
0
Entering edit mode

Maybe not a solution to your problem, and I haven't checked your use case in the documentation, but I like cyvcf2 for handling VCF files from python. I hope that module can be helpful for you...

ADD REPLY

Login before adding your answer.

Traffic: 2446 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6