I think there is something amiss with either Biomart or Ensembl.
# Reproduce the duplicated-transcript report: download transcript coordinates
# for every human transcript from the Ensembl BioMart and print the transcript
# IDs that occur more than once in the returned table.
library(biomaRt)  # fixed: the package is named "biomaRt", not "biomarRt"

# Transcript-level attributes requested from BioMart.
.A1_ATTRIBS <- c(
  "ensembl_transcript_id",
  "chromosome_name",
  "strand",
  "transcript_start",
  "transcript_end"
)

# Connect to the Ensembl mart, human gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")

# Empty filter and value, so the query is not restricted to a subset.
filters <- values <- ""
bm_table <- getBM(.A1_ATTRIBS, filters = filters, values = values, mart = mart)

# duplicated() flags the second and later occurrences of an ID, so every ID
# printed here appears at least twice in bm_table.
transcripts_tx_name <- bm_table$ensembl_transcript_id
print(sort(transcripts_tx_name[duplicated(transcripts_tx_name)]))
[1] "ENST00000244174" "ENST00000262640" "ENST00000285718" "ENST00000286448" "ENST00000302805" "ENST00000313871"
[7] "ENST00000326153" "ENST00000331035" "ENST00000334060" "ENST00000334651" "ENST00000340131" "ENST00000355432"
[13] "ENST00000355805" "ENST00000359512" "ENST00000361536" "ENST00000369423" "ENST00000369437" "ENST00000381177"
[19] "ENST00000381180" "ENST00000381184" "ENST00000381187" "ENST00000381192" "ENST00000381218" "ENST00000381222"
[25] "ENST00000381223" "ENST00000381229" "ENST00000381233" "ENST00000381241" "ENST00000381261" "ENST00000381297"
[31] "ENST00000381317" "ENST00000381333" "ENST00000381401" "ENST00000381469" "ENST00000381500" "ENST00000381507"
[37] "ENST00000381509" "ENST00000381524" "ENST00000381529" "ENST00000381566" "ENST00000381567" "ENST00000381575"
[43] "ENST00000381578" "ENST00000381625" "ENST00000381657" "ENST00000381663" "ENST00000390665" "ENST00000391707"
[49] "ENST00000399012" "ENST00000399966" "ENST00000400701" "ENST00000400841" "ENST00000411342" "ENST00000412290"
[55] "ENST00000412516" "ENST00000412936" "ENST00000414513" "ENST00000415337" "ENST00000416733" "ENST00000417535"
[61] "ENST00000419094" "ENST00000419737" "ENST00000420411" "ENST00000420865" "ENST00000421233" "ENST00000422618"
[67] "ENST00000424344" "ENST00000425740" "ENST00000427886" "ENST00000429181" "ENST00000430235" "ENST00000430536"
[73] "ENST00000430562" "ENST00000430923" "ENST00000431238" "ENST00000431582" "ENST00000431919" "ENST00000432272"
[79] "ENST00000432318" "ENST00000432523" "ENST00000432757" "ENST00000434938" "ENST00000435581" "ENST00000436474"
[85] "ENST00000437244" "ENST00000441131" "ENST00000443019" "ENST00000443929" "ENST00000444280" "ENST00000445062"
[91] "ENST00000445777" "ENST00000445785" "ENST00000445792" "ENST00000447472" "ENST00000447786" "ENST00000448477"
[97] "ENST00000449611" "ENST00000452144" "ENST00000453953" "ENST00000455739" "ENST00000456370" "ENST00000460206"
[103] "ENST00000460621" "ENST00000460672" "ENST00000461007" "ENST00000461691" "ENST00000462195" "ENST00000463317"
[109] "ENST00000463763" "ENST00000464205" "ENST00000464935" "ENST00000467626" "ENST00000468169" "ENST00000469624"
[115] "ENST00000474361" "ENST00000474865" "ENST00000475167" "ENST00000475259" "ENST00000475594" "ENST00000475859"
[121] "ENST00000476066" "ENST00000477110" "ENST00000477636" "ENST00000477940" "ENST00000478256" "ENST00000478825"
[127] "ENST00000479401" "ENST00000479438" "ENST00000479687" "ENST00000481245" "ENST00000482170" "ENST00000482293"
[133] "ENST00000482405" "ENST00000483079" "ENST00000483286" "ENST00000483543" "ENST00000484026" "ENST00000484364"
[139] "ENST00000484415" "ENST00000484611" "ENST00000485332" "ENST00000486791" "ENST00000488344" "ENST00000489233"
[145] "ENST00000491683" "ENST00000492963" "ENST00000493312" "ENST00000494962" "ENST00000494969" "ENST00000496011"
[151] "ENST00000496301" "ENST00000496630" "ENST00000497752" "ENST00000498153" "ENST00000501036" "ENST00000503018"
[157] "ENST00000507418" "ENST00000509780" "ENST00000515319" "ENST00000527459" "ENST00000534940" "ENST00000540897"
[163] "ENST00000554971"
If I go to http://www.biomart.org/biomart/martview/
and search for ENST00000244174,
I get two duplicate entries, so a temporary and less-than-ideal solution is to skip these duplicated transcripts (run the code above the fold first):
# Negated membership operator: TRUE for each element of `x` that has no match
# in `y`, FALSE otherwise.
`%!in%` <- function(x, y) {
  is.na(match(x, y))
}

# Return the elements of `x` whose value occurs exactly once in `x`; every
# copy of a repeated value is dropped (unlike unique(), which keeps one copy).
really_unique <- function(x) {
  repeated_values <- x[duplicated(x)]
  keep <- x %!in% repeated_values
  x[keep]
}
# Workaround: build the transcript database from the Ensembl human dataset,
# passing only the transcript IDs that occur exactly once in the BioMart table
# so the duplicated IDs are skipped.
# NOTE(review): makeTranscriptDbFromBiomart() is not defined in this script;
# it needs the GenomicFeatures package to be attached first.
makeTranscriptDbFromBiomart(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl",
  transcript_ids = really_unique(transcripts_tx_name)
)
Can you give the output of sessionInfo()?
version 2.13.1 (2011-07-08) Platform: x86_64-pc-linux-gnu (64-bit)
locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=C LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] stats graphics grDevices utils datasets methods base
other attached packages: [1] GenomicFeatures_1.4.5 GenomicRanges_1.4.
version 2.13.1 (2011-07-08) Platform: x86_64-pc-linux-gnu (64-bit)
locale: [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=C LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages: [1] stats graphics grDevices utils datasets methods base
other attached packages: [1] GenomicFeatures_1.4.5 GenomicRanges_1.4.
I have updated the question, Jeremy.