Entering edit mode
8 months ago
Adyasha
•
0
Hello everyone,
I have annotation file like this
less -S Sars_cov_2.ASM985889v3.101.gtf | head -20
#!genome-build ASM985889v3
#!genome-version ASM985889v3
#!genome-date 2020-01
#!genome-build-accession NCBI:GCA_009858895.3
MN908947.3 ensembl gene 266 21555 . + . gene_id "ENSSASG00005000002"; gene_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3 ensembl transcript 266 21555 . + . gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl exon 266 21555 . + . gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSSASE00005000002"; exon_version "1";
MN908947.3 ensembl CDS 266 21552 . + 0 gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000002"; protein_version "1";
MN908947.3 ensembl start_codon 266 268 . + 0 gene_id "ENSSASG00005000002"; gene_version "1"; transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl stop_codon 21553 21555 . + 0 gene_id "ENSSASG00005000002"; gene_version "1";
transcript_id "ENSSAST00005000002"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1ab"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl gene 266 13483 . + . gene_id "ENSSASG00005000003"; gene_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3 ensembl transcript 266 13483 . + . gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl exon 266 13483 . + . gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id "ENSSASE00005000003"; exon_version "1";
MN908947.3 ensembl CDS 266 13480 . + 0 gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000003"; protein_version "1";
MN908947.3 ensembl start_codon 266 268 . + 0 gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl stop_codon 13481 13483 . + 0 gene_id "ENSSASG00005000003"; gene_version "1"; transcript_id "ENSSAST00005000003"; transcript_version "1"; exon_number "1"; gene_name "ORF1ab"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "ORF1a"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl gene 21563 25384 . + . gene_id "ENSSASG00005000004"; gene_version "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding";
MN908947.3 ensembl transcript 21563 25384 . + . gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding";
MN908947.3 ensembl exon 21563 25384 . + . gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; exon_number "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding"; exon_id
"ENSSASE00005000004"; exon_version "1";
MN908947.3 ensembl CDS 21563 25381 . + 0 gene_id "ENSSASG00005000004"; gene_version "1"; transcript_id "ENSSAST00005000004"; transcript_version "1"; exon_number "1"; gene_name "S"; gene_source "ensembl"; gene_biotype "protein_coding"; transcript_name "S"; transcript_source "ensembl"; transcript_biotype "protein_coding"; protein_id "ENSSASP00005000004"; protein_version "1";
I have a reference genome :
sequence.fasta
and a bam file :
ILS_W_V_558_S2_R1_001_val.bam
that looks like this :
samtools view ILS_W_V_558_S2_R1_001_val.bam | head
NB551648:137:HN725BGXL:1:13307:9948:2995 97 NC_045512.2 37 42 36M = 17969 17968
CAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACG AAAAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
NB551648:137:HN725BGXL:1:12301:12947:7621 97 NC_045512.2 39 42 36M = 28343 28340 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
NB551648:137:HN725BGXL:1:13104:10907:7606 161 NC_045512.2 39 42 36M = 28306 28303 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA AAAAAEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
NB551648:137:HN725BGXL:1:21308:15644:16840 161 NC_045512.2 39 42 36M = 28401 28398 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
NB551648:137:HN725BGXL:1:22104:26696:1445 99 NC_045512.2 39 42 36M = 99 96 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA AAAAAEEAEEEEEA/EEEEEEAEEEEEEAEEEEAEA AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:CP
NB551648:137:HN725BGXL:1:23201:19089:2498 161 NC_045512.2 39 42 36M = 28293 28290 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA AAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
NB551648:137:HN725BGXL:1:23206:4111:17048 161 NC_045512.2 39 42 36M = 28334 28331 ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAA /AAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:36 YS:i:0 YT:Z:DP
I want to see gene specific coverage from the bam file. The result should have the gene_name in 1st column and its Coverage in 2nd column
I tried this command :
import subprocess
# Parse the GTF file and extract gene information
def parse_gtf(gtf_file):
gene_info = {}
with open(gtf_file, 'r') as f:
for line in f:
if line.startswith('#'):
continue
parts = line.strip().split('\t')
if parts[2] == 'gene':
gene_id = parts[8].split(';')[0].split('"')[1]
gene_name = parts[8].split(';')[3].split('"')[1]
gene_info[gene_id] = gene_name
return gene_info
# Calculate gene-specific coverage from the BAM file
def calculate_gene_coverage(gene_info, bam_file):
gene_coverage = {}
for gene_id, gene_name in gene_info.items():
region = f'{gene_id}'
command = f'samtools depth -r {region} {bam_file}'
result = subprocess.run(command, shell=True, capture_output=True, text=True)
if result.returncode == 0:
coverage = sum(int(line.split()[2]) for line in result.stdout.strip().split('\n'))
gene_coverage[gene_name] = coverage
return gene_coverage
#main_command
def main():
gtf_file = 'Sars_cov_2.ASM985889v3.101.gtf'
bam_file = 'ILS_W_V_558_S2_R1_001_val.bam'
# Parse GTF file to get gene information
gene_info = parse_gtf(gtf_file)
# Print gene information
print("Gene Information:")
for gene_id, gene_name in gene_info.items():
print(f"{gene_id}: {gene_name}")
# Calculate gene-specific coverage
gene_coverage = calculate_gene_coverage(gene_info, bam_file)
# Output gene-specific coverage
print("\nGene-specific Coverage:")
for gene_name, coverage in gene_coverage.items():
print(f'{gene_name}\t{coverage}')
if __name__ == "__main__":
main()
I have got this output :
Gene Information:
ENSSASG00005000002: ensembl
ENSSASG00005000003: ensembl
ENSSASG00005000004: ensembl
ENSSASG00005000006: ensembl
ENSSASG00005000010: ensembl
ENSSASG00005000007: ensembl
Gene-specific Coverage:
I am unable to understand this.
Please help.
Also if is there any other quick way to find gene specific coverage from the bam file then please share. I dont know the gene location co-ordinates.
Thank you
This is not going to work no matter what program you use since the chromosome identifiers in your GTF (
MN908947.3
) do not match your BAM file (NC_045512.2
). So find a annotation file that matches your reference genome."Also if is there any other quick way to find gene specific coverage" can you give some example so that its bit more clear what do you mean by gene specific coverage ?
Thank you, here I am giving you some example , suppose I have bam files .
now I want to see Spike gene coverage in each sample.
I want output like