|
#!/usr/bin/env python |
|
""" |
|
coding_lengths.py |
|
Kamil Slowikowski |
|
February 7, 2014 |
|
|
|
Count the number of coding base pairs in each Gencode gene. |
|
|
|
Gencode coordinates, including all exons with Ensembl identifiers. |
|
(Gencode release 17 corresponds to hg19) |
|
|
|
ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz |
|
ftp://ftp.sanger.ac.uk/pub/gencode/release_17/gencode.v17.annotation.gtf.gz |
|
|
|
chr1 HAVANA gene 11869 14412 . + . gene_id "ENSG00000223972.4"; |
|
chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.4"; |
|
chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.4"; |
|
chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.4"; |
|
chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.4"; |
|
|
|
|
|
NCBI mapping from Entrez GeneID to Ensembl identifiers. |
|
|
|
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2ensembl.gz |
|
|
|
9606 1 ENSG00000121410 NM_130786.3 ENST00000263100 NP_570602.2 ENSP00000263100 |
|
9606 2 ENSG00000175899 NM_000014.4 ENST00000318602 NP_000005.2 ENSP00000323929 |
|
9606 3 ENSG00000256069 NR_040112.1 ENST00000543404 - - |
|
9606 9 ENSG00000171428 NM_000662.5 ENST00000307719 NP_000653.3 ENSP00000307218 |
|
9606 9 ENSG00000171428 XM_005273679.1 ENST00000517492 XP_005273736.1 ENSP00000429407 |
|
|
|
Output: |
|
|
|
Ensembl_gene_identifier GeneID length |
|
ENSG00000000005 64102 1339 |
|
ENSG00000000419 8813 1185 |
|
ENSG00000000457 57147 3755 |
|
ENSG00000000938 2268 3167 |
|
|
|
USAGE: |
|
coding_lengths.py -g FILE -n FILE [-o FILE] |
|
|
|
OPTIONS: |
|
-h Show this help message. |
|
-g FILE Gencode annotation.gtf.gz file. |
|
-n FILE NCBI gene2ensembl.gz file. |
|
-o FILE Output file (gzipped). |
|
""" |
|
|
|
|
|
import GTF # https://gist.github.com/slowkow/8101481 |
|
|
|
from docopt import docopt |
|
import pandas as pd |
|
import gzip |
|
import time |
|
import sys |
|
from contextlib import contextmanager |
|
|
|
|
|
def main(args): |
|
# Input files. |
|
GENCODE = args['-g'] |
|
NCBI_ENSEMBL = args['-n'] |
|
|
|
# Output file prefix. |
|
GENE_LENGTHS = args['-o'] or "ncbi_ensembl_coding_lengths.txt.gz" |
|
|
|
with log("Reading the Gencode annotation file: {}".format(GENCODE)): |
|
gc = GTF.dataframe(GENCODE) |
|
|
|
# Select just exons of protein coding genes, and columns that we want to use. |
|
idx = (gc.feature == 'exon') & (gc.transcript_type == 'protein_coding') |
|
exon = gc.ix[idx, ['seqname','start','end','gene_id','gene_name']] |
|
|
|
# Convert columns to proper types. |
|
exon.start = exon.start.astype(int) |
|
exon.end = exon.end.astype(int) |
|
|
|
# Sort in place. |
|
exon.sort(['seqname','start','end'], inplace=True) |
|
|
|
# Group the rows by the Ensembl gene identifier (with version numbers.) |
|
groups = exon.groupby('gene_id') |
|
|
|
with log("Calculating coding region (exonic) length for each gene..."): |
|
lengths = groups.apply(count_bp) |
|
|
|
with log("Reading NCBI mapping of Entrez GeneID "\ |
|
"to Ensembl gene identifier: {}".format(NCBI_ENSEMBL)): |
|
g2e = pd.read_table(NCBI_ENSEMBL, |
|
compression="gzip", |
|
header=None, |
|
names=['tax_id', 'GeneID', |
|
'Ensembl_gene_identifier', |
|
'RNA_nucleotide_accession.version', |
|
'Ensembl_rna_identifier', |
|
'protein_accession.version', |
|
'Ensembl_protein_identifier']) |
|
|
|
# Create a new DataFrame with gene lengths and EnsemblID. |
|
ensembl_no_version = lengths.index.map(lambda x: x.split(".")[0]) |
|
ldf = pd.DataFrame({'length': lengths, |
|
'Ensembl_gene_identifier': ensembl_no_version}, |
|
index=lengths.index) |
|
|
|
# Merge so we have EntrezGeneID with length. |
|
m1 = pd.merge(ldf, g2e, on='Ensembl_gene_identifier') |
|
m1 = m1[['Ensembl_gene_identifier', 'GeneID', 'length']].drop_duplicates() |
|
|
|
with log("Writing output file: {}".format(GENE_LENGTHS)): |
|
with gzip.open(GENE_LENGTHS, "wb") as out: |
|
m1.to_csv(out, sep="\t", index=False) |
|
|
|
|
|
def count_bp(df): |
|
"""Given a DataFrame with the exon coordinates from Gencode for a single |
|
gene, return the total number of coding bases in that gene. |
|
Example: |
|
>>> import numpy as np |
|
>>> n = 3 |
|
>>> r = lambda x: np.random.sample(x) * 10 |
|
>>> d = pd.DataFrame([np.sort([a,b]) for a,b in zip(r(n), r(n))], columns=['start','end']).astype(int) |
|
>>> d |
|
start end |
|
0 6 9 |
|
1 3 4 |
|
2 4 9 |
|
>>> count_bp(d) |
|
7 |
|
|
|
Here is a visual representation of the 3 exons and the way they are added: |
|
|
|
123456789 Length |
|
0 ---- 4 |
|
1 -- 2 |
|
2 ------ 6 |
|
======= 7 |
|
""" |
|
start = df.start.min() |
|
end = df.end.max() |
|
bp = [False] * (end - start + 1) |
|
for i in range(df.shape[0]): |
|
s = df.iloc[i]['start'] - start |
|
e = df.iloc[i]['end'] - start + 1 |
|
bp[s:e] = [True] * (e - s) |
|
return sum(bp) |
|
|
|
|
|
@contextmanager |
|
def log(message): |
|
"""Log a timestamp, a message, and the elapsed time to stderr.""" |
|
start = time.time() |
|
sys.stderr.write("{} # {}\n".format(time.asctime(), message)) |
|
yield |
|
elapsed = int(time.time() - start + 0.5) |
|
sys.stderr.write("{} # done in {} s\n".format(time.asctime(), elapsed)) |
|
sys.stderr.flush() |
|
|
|
|
|
if __name__ == '__main__': |
|
args = docopt(__doc__) |
|
main(args) |
Thanks a lot, save my time writing a perl script. :)
Nice to hear that it's useful to others
Thank you for this script !
Really Really tanks!!
you're welcome. maybe you should make your post a comment instead of an answer
Thanks so much, this saves me a lot of time!
Hi Irsan,
Thanks for a very helpful chunk of code! It solved two of my problems. First I got the length for calculating FPKM which I need later in my analysis, secondly it gave an idea how to quickly count the number of exons per gene which I also needed.
Just a quick comment. Instead of using lapply I simply run:
Which took only about 2 seconds and returned a data.frame which I could easly merge with the rest of my data.
updating comment error resolved after starting R new session.
Hi all,
I'm getting an error in this line:
Did anyone encounter this ?
Best
Hi,
Did you by any chance resolve this? I'm very interested in getting this code to work as well.
Best, Asta
I think this error comes from R running "reduce" from dplyr (if you have it loaded) instead of from GenomicRanges as we want it to do. You can substitute "reduce" with "GenomicRanges::reduce" to fix it. So the command will be:
Using the code above For zebrafish, what you can do is:
The code above was working two days ago, but did not work yesterday. Here is the thing, I guess, one of the packages I have downloaded before GenomicFeatures prevented the code above to work properly (although there were no warning, package masking). Where do I know this from? Simple, today the code is working :). Please, be careful about this.
Hi! your code looks great! I want to calculate the and I got to this post via github page but I am confused. Is this code useful to calculate the length only or the meanFragmentLength?
Thank you!
Looks like we're on the same path
yeah but I am sure you know how to do it, whereas I am just go more confused :/
Nope, I kinda gave up. I got to the image on what fragment size is (What is the difference between a Read and a Fragment in RNA-seq?) but cannot figure out how to connect that to the picard metrics output. For the moment, I'd say use insert size for fragment length especially since you trimmed adapters using trimmomatic anyway.
Can different genes that share the same exons cause similar problem?
Because you reduce overlapping exons for each individual gene seperately, they will contribute to both gene sizes like the way you want
How about total non-overlapping introns per gene lengths? Introns are grouped by transcripts.How could I group them by genes? Thanks!
Oef, I have to look that up in the GenomicRanges manual. Their is an introns() accessor, that might bring you somewhere
Hi Irsan, How can you collect exons per gene_name, instead of gene id? I am using a gtf file Thank you
Do you mean gene symbol in stead of gene name? Like TP53/RB1/E2F3?
Do you mean gene symbol in stead of gene name? Like TP53/RB1/E2F3?
Yes, i am using annotation gtf file from Ensembl. I want to get gene symbol, instead of Ensembl gene ID. Do you know how? Thanks
Many ensembl gene ids do not have an official HGNC symbol (that is what you are looking for). I think the gene symbols are not even in the gtf-file. So you have to work with Ensembl gene IDs and at the end annotate them with gene symbol with for example bioconductor package org.Hs.eg.db
I used gtf file for human from here but the
makeTranscriptDbFromGFF
gave me the following error,is there any suggestions? probably a point to another gtf file?
Download GTF-file from UCSC table browser
didn't work - I decided using http://bioconductor.org/packages/2.10/data/annotation/html/BSgenome.Hsapiens.UCSC.hg19.html
I've tried this and it might not be the best idea. When creating a GTF-file, the UCSC table browser fills in the gene_ID field with the transcript_ID, no matter whether the UCSC, RefSeq or Ensembl track is used, which defeats the purpose of your script. I have no idea why UCSC would do this, seems like a very dangerous oversight on their part.
Thanks for the script though, it's saved me a lot of time.
The last part:
Seems to be taking extraordinarily long to run. Any faster method?
sum(width(reduce(exons.list.per.gene))) works directly and should be faster
Thank you so much for sharing the code! I am a beginner, just wondering if you could help with how to input the length information into the dds data. Many thanks!
Hi Irsan
Thank you very much for providing us with the results of your studies.
The study I am analyzing is a single-end sequence.
Can I use your code with these conditions?
Hi,
To reduce the time taken to upload the gtf file. Install using bioconductor the TxDb libraries:
best
Would you recommend using the gencode comprehensive annotations file or the basic annotations file to do this since they produce different results (see here and here)? Here they use the comprehensive one.