I have a bunch of FPKM normalized mRNA sequencing from TCGA-PAAD, which covers pancreatic cancer patients. I wrote a script which goes through that data and makes a big data frame with all 60k+ Ensemble mappings for all 180+ patients, so that I can analyze various hypotheses regarding the clinical data.
I have the clinical metadata in a separate table, which I obtained using the TCGABiolinks package.
The problem is that all the columns in the sequencing data table are "HT-seq FPKM UUIDs".
Now all I need to do is map the HT-seq FPKM UUID to a Case UUID (patient)
I need an API query for this, or a call to some TCGABiolinks function.
Thanks in advance for your help!
Ok I have followed the advice here with some modifications. I obtained a json encoded case and file manifest which take the form given below.
File manifest (first few records)
[{
"file_name": "232f085b-6201-4e4d-8473-e592b8d8e16d.FPKM.txt.gz",
"data_format": "TXT",
"access": "open",
"data_category": "Transcriptome Profiling",
"file_size": 514027,
"cases": [
{
"project": {
"project_id": "TCGA-PAAD"
},
"case_id": "620e0648-ec20-4a12-a6cb-5546fe829c77"
}
],
"annotations": [
{
"annotation_id": "050203a0-12ab-5025-973d-e070d94f722b"
}
]
},{
"file_name": "b0159d01-f1eb-490d-875b-cfdabed6f529.FPKM.txt.gz",
"data_format": "TXT",
"access": "open",
"data_category": "Transcriptome Profiling",
"file_size": 515800,
"cases": [
{
"project": {
"project_id": "TCGA-PAAD"
},
"case_id": "16b38977-aea1-4c75-89ec-4fb551f652dd"
}
]
},{
"file_name": "f2389819-b8fc-460e-821c-01dba313cce1.FPKM.txt.gz",
"data_format": "TXT",
"access": "open",
"data_category": "Transcriptome Profiling",
"file_size": 510184,
"cases": [
{
"project": {
"project_id": "TCGA-PAAD"
},
"case_id": "23908554-b98e-4ff8-98e7-dee3e2c5feaf"
}
]
},{
Cases manifest (first few records)
[{
"diagnoses": [
{
"days_to_death": null
}
],
"case_id": "33833131-1482-42d5-9cf5-01cade540234",
"submitter_id": "TCGA-2J-AAB4"
},{
"diagnoses": [
{
"days_to_death": 738.0
}
],
"case_id": "67e9abc1-4b6f-4054-bdc4-29906c55c682",
"submitter_id": "TCGA-3A-A9IC"
},{
"diagnoses": [
{
"days_to_death": null
}
],
"case_id": "a53c919a-4e08-46f1-af3f-30b16b597c33",
"submitter_id": "TCGA-IB-AAUU"
},{
"diagnoses": [
{
"days_to_death": 278.0
}
],
"case_id": "ab449860-46e5-485e-abd5-31c5abef2c58",
"submitter_id": "TCGA-L1-A7W4"
},{
Here is the code that was run:
rm(list = ls())
output_logfile <- file("log_output.txt", open="wt")
message_logfile <- file("log_message.txt", open="wt")
sink(file=output_logfile, type = "output")
sink(file=message_logfile, type = "message")
library(GSAR)
library(org.Hs.eg.db)
library(GSVAdata)
library(GSEABase)
library(GSVA)
library(dplyr)
data(c2BroadSets)
#### The following should work with data from the GDC, where we have a shopping cart
#### of zipped single patient sequencing data
#### Example workflow for a GDC-hosted study (TCGA-PAAD):
#### Go to the GDC homepage of the study: https://portal.gdc.cancer.gov/projects/TCGA-PAAD
#### Click on "Cases"
#### download the json manifest for cases
#### move it to the project directory
#### Click on "Files"
#### download the json manifest for files
#### move it to the project directory
#### download the actual data by selecting appropriate filters at left and clicking "add all files to cart"
#### unzip the downloaded archive and move the directory to the project folder
#### This Creates a Matrix with all our data
file.loc <- "PAAD-FPKM" # the name of the data dir
file.manifest <- "files.json" # the name of the files manifest
cases.manifest <- "cases.json" # the name of the cases manifest
dsep <- "/"
files <- fromJSON(file=file.manifest)
cases <- fromJSON(file=cases.manifest)
dlfiles <- list.files(file.loc, recursive = TRUE)
#### remove the manifest, since we are using a separately downloaded json object
unlink(paste0(file.loc, dsep, "MANIFEST.txt"))
#for (file in 1:length(dlfiles)){
for (file in 1:5){
if(!exists("rna_table")){
## get the patient barcode
case_id = strsplit(dlfiles[file],"/")
## create a table with the expression profile where the count column is named with the patient barcode
rna_table<-read.delim(paste0(file.loc,dsep,dlfiles[file]), sep="\t", col.names = c("ensemble_id",case_id[[1]][2]))
} else {
case_id = strsplit(dlfiles[file],"/")
new_data<-read.delim(paste0(file.loc,dsep,dlfiles[file]), sep="\t", col.names = c("ensemble_id",case_id[[1]][2]))
rna_table <- full_join(rna_table, new_data, by = NULL)
}
}
rna_data <- as.matrix(rna_table)
rownames(rna_data) <- rna_data[,1]
rna_data <- rna_data[,-1]
#### This creates a Matrix of all the death dates
death_data <- matrix(nrow=ncol(rna_data),ncol=2)
colnames(death_data) <- c("filename", "days_to_death")
#### Do a sanity check
list.files(file.loc, recursive = TRUE)[1:10] # here are the files we read to get the read counts
str(death_data[1:5,]) # what our clinical data looks like
str(rna_data[1:5,1:5]) # what our sequencing data looks like
for (case in 1:ncol(rna_data)){
case_id <- files[[grep(colnames(rna_data)[case],files)]]$cases[[1]]$case_id
#days_to_death <- cases[grep(case_id, cases)][[1]]$diagnoses[[1]]$days_to_death
death_data[case,1] <- colnames(rna_data)[case]
#death_data[case,2] <- days_to_death
death_data[case,2] <- case_id
}
Now the output (non errors) is fairly straightforward. Notice that I am running this on data from only the first 5 patients in the data folder in the interest of time:
> sink(file=message_logfile, type = "message")
> library(GSAR)
> library(org.Hs.eg.db)
> library(GSVAdata)
> library(GSEABase)
> library(GSVA)
> library(dplyr)
> data(c2BroadSets)
> #### The following should work with data from the GDC, where we have a shopping cart
> #### of zipped single patient sequencing data
> #### Example .... [TRUNCATED]
> file.manifest <- "files.json" # the name of the files manifest
> cases.manifest <- "cases.json" # the name of the cases manifest
> dsep <- "/"
> files <- fromJSON(file=file.manifest)
> cases <- fromJSON(file=cases.manifest)
> dlfiles <- list.files(file.loc, recursive = TRUE)
> #### remove the manifest, since we are using a separately downloaded json object
> unlink(paste0(file.loc, dsep, "MANIFEST.txt"))
> #for (file in 1:length(dlfiles)){
> for (file in 1:5){
+ if(!exists("rna_table")){
+ ## get the patient barcode
+ case_id = strsplit(dlfil .... [TRUNCATED]
> rna_data <- as.matrix(rna_table)
> rownames(rna_data) <- rna_data[,1]
> rna_data <- rna_data[,-1]
> #### This creates a Matrix of all the death dates
> death_data <- matrix(nrow=ncol(rna_data),ncol=2)
> colnames(death_data) <- c("filename", "days_to_death")
> #### Do a sanity check
> list.files(file.loc, recursive = TRUE)[1:10] # here are the files we read to get the read counts
[1] "005c0660-3700-40ea-b037-b456319d369a/bb15d7d0-8705-49af-89e4-fc13c01de642.FPKM.txt.gz"
[2] "030cf06f-890c-4193-9c7d-254980c73a48/3d771128-9e90-49c2-8ee5-23d994ee6398.FPKM.txt.gz"
[3] "03a162ee-0be2-484d-ad86-17bba311a3f8/4172e3f8-3578-4f33-9168-6f8c2b8d0783.FPKM.txt.gz"
[4] "051918c1-9bb2-4146-bf85-4e4a55c5759e/5aed2227-1f31-4159-9eed-430bc45c61dc.FPKM.txt.gz"
[5] "0882ecec-b533-4912-adc1-8ffd6eaa47c1/c19f102d-47a0-48c6-9443-63730d9ea6d1.FPKM.txt.gz"
[6] "0ae4ff1f-e2d3-46e0-95a2-0ea80a4ebb63/574df2fc-a608-49c5-8e83-f26d03ef8bb3.FPKM.txt.gz"
[7] "0c2840a2-3a49-4f22-ae21-1cfbb0034212/fef65b57-c58d-4050-8de4-f09f5cd616ce.FPKM.txt.gz"
[8] "0dfe7aef-a105-4a32-89ca-49a30a1b59ed/65a45bca-b5d4-4763-a51f-f7b9ad9efcb9.FPKM.txt.gz"
[9] "0e7871dc-a721-4dae-8938-28a73ec3f968/232f085b-6201-4e4d-8473-e592b8d8e16d.FPKM.txt.gz"
[10] "101e042e-efa2-4c6c-b629-55ecbde859d2/3de80dcb-4ff2-4125-b8e6-9e06ec1cd833.FPKM.txt.gz"
> str(death_data[1:5,]) # what our clinical data looks like
logi [1:5, 1:2] NA NA NA NA NA NA ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:2] "filename" "days_to_death"
> str(rna_data[1:5,1:5]) # what our sequencing data looks like
chr [1:5, 1:5] "3.009793e-03" "2.945653e+00" "0.000000e+00" "3.861741e+00" ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:5] "ENSG00000270112.3" "ENSG00000167578.15" "ENSG00000273842.1" "ENSG00000078237.5" ...
..$ : chr [1:5] "bb15d7d0.8705.49af.89e4.fc13c01de642.FPKM.txt.gz" "X3d771128.9e90.49c2.8ee5.23d994ee6398.FPKM.txt.gz" "X4172e3f8.3578.4f33.9168.6f8c2b8d0783.FPKM.txt.gz" "X5aed2227.1f31.4159.9eed.430bc45c61dc.FPKM.txt.gz" ...
> for (case in 1:ncol(rna_data)){
+ case_id <- files[[grep(colnames(rna_data)[case],files)]]$cases[[1]]$case_id
+ #days_to_death <- cases[grep(cas .... [TRUNCATED]
However, Somehow I am only able to locate the first data file in my files manifest. The last loop in the included source causes the following error to be recorded in my logged messages:
Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Error in files[[grep(colnames(rna_data)[case], files)]] :
attempt to select less than one element in get1index
Hi Kevin, I've used the downloaded manifest strategy above with limited success. Can you have a look?
Hey, I see that you edited your original question with code - thanks!
The problem seems to be 2-fold:
Both of these problems are related to the fact that your filenames were assigned as colnames, which in R means that you cannot have a hyphen in the colname, neither can the colname begin with a number. You'll have to save the filenames earlier / use an earlier listing of the filenames.
Hope that this makes sense?
Kevin
Hi Kevin, Those suggestions solved the problem. I ended up just renaming the columns in the RNA seq matrix generically, then building a lookup table for the file names. Now I have several questions about the analysis of this data and I created a second post to address these. Would you be willing to take a look?
Linking patient survival to gene-set analysis
Hi! Okay, I will take a look but not until later today (Saturday) or Sunday. I will give someone else a chance to take a look too.