Entering edit mode
7 months ago
Pedro
•
0
Hello!
I need to calculate the OS of two types of cancer: CESC and HNSC. I will search the relation between the OS and gene expression.
But I didn't find the OS column on TCGA data. Can anyone help me ?
Please be specific. Which TCGA subset, what did you try? Show code.
I have already tried this code. listSamples with HNSC cancer and smokers. I need at least one supporting material to find out the survival of each of the HNSC and CESC cancer individuals
library(DESeq2) library(TCGAbiolinks) library(survminer) library(survival) library(SummarizedExperiment) library(tidyverse)
listSamples <- c("TCGA-BA-5152", "TCGA-CN-6013", "TCGA-CN-A49A", "TCGA-CQ-7069", "TCGA-P3-A5QF", "TCGA-P3-A6T6", "TCGA-QK-A6IH", "TCGA-CR-6472", "TCGA-DQ-7594", "TCGA-HD-8224", "TCGA-HD-8314", "TCGA-P3-A5QE", "TCGA-CQ-6220", "TCGA-CQ-7064", "TCGA-F7-A624", "TCGA-HD-A4C1", "TCGA-P3-A6T2", "TCGA-CQ-7068", "TCGA-CV-6953", "TCGA-CV-7407", "TCGA-CV-A463", "TCGA-KU-A66T", "TCGA-MT-A7BN", "TCGA-UF-A71E", "TCGA-UF-A7JO", "TCGA-UF-A7JT", "TCGA-BA-5558", "TCGA-QK-A64Z", "TCGA-CV-A6JM", "TCGA-UF-A7JV", "TCGA-CV-7247", "TCGA-CV-7437", "TCGA-D6-6826", "TCGA-D6-A6EQ", "TCGA-F7-A622", "TCGA-TN-A7HJ", "TCGA-BB-A5HU", "TCGA-BB-A5HZ", "TCGA-CN-6018", "TCGA-CQ-7071", "TCGA-CQ-A4CD", "TCGA-CR-6484", "TCGA-CR-7380", "TCGA-CV-6942", "TCGA-CV-6955", "TCGA-CV-7252", "TCGA-CV-7416", "TCGA-CV-7425", "TCGA-CV-A45V", "TCGA-HD-A633", "TCGA-MT-A67F", "TCGA-P3-A6T3", "TCGA-RS-A6TO", "TCGA-BA-5557", "TCGA-BA-A6DB", "TCGA-BB-4224", "TCGA-BB-7863", "TCGA-BB-7872", "TCGA-C9-A47Z", "TCGA-C9-A480", "TCGA-CN-4725", "TCGA-CN-4733", "TCGA-CN-4737", "TCGA-CN-6996", "TCGA-CN-A640", "TCGA-CQ-5327", "TCGA-CQ-5329", "TCGA-CQ-6229", "TCGA-CQ-7065", "TCGA-CQ-A4CE", "TCGA-CQ-A4CH", "TCGA-CR-6488", "TCGA-CR-7372", "TCGA-CR-7382", "TCGA-CR-7393", "TCGA-CV-5973", "TCGA-CV-5979", "TCGA-CV-6003", "TCGA-CV-6939", "TCGA-CV-6959", "TCGA-CV-7104", "TCGA-CV-7238", "TCGA-CV-7243", "TCGA-CV-7255", "TCGA-CV-7438", "TCGA-CV-A45P", "TCGA-CV-A465", "TCGA-CV-A6JT", "TCGA-CV-A6K0", "TCGA-D6-6515", "TCGA-D6-A6EM", "TCGA-DQ-5624", "TCGA-HD-7831", "TCGA-HD-A6HZ", "TCGA-IQ-A61J", "TCGA-IQ-A61L", "TCGA-IQ-A6SG", "TCGA-MT-A67A", "TCGA-P3-A5QA", "TCGA-QK-A652", "TCGA-T2-A6WX", "TCGA-UP-A6WW", "TCGA-BA-5153", "TCGA-BA-5559", "TCGA-BB-4223", "TCGA-CN-A499", "TCGA-CN-A63Y", "TCGA-CN-A6V1", "TCGA-CN-A6V7", "TCGA-CR-5243", "TCGA-CR-5249", "TCGA-CR-6470", "TCGA-CR-6480", "TCGA-CR-6481", "TCGA-CR-7404", "TCGA-DQ-7596", "TCGA-MT-A67G", "TCGA-QK-A6IF", "TCGA-QK-A6V9" )
clinical_HNSC <- GDCquery_clinic("TCGA-HNSC") any(colnames(clinical_HNSC) %in% c("vital_status", "days_to_last_follow_up", "days_to_death")) which(colnames(clinical_HNSC) %in% c("vital_status", "days_to_last_follow_up", "days_to_death")) clinical_HNSC[,c(9,43,48)]
table (clinical_HNSC$vital_status)
clinical_HNSC$deceased <- ifelse(clinical_HNSC$vital_status =="Alive",FALSE,TRUE)
clinical_HNSC$overall_survival <- ifelse(clinical_HNSC$vital_status == "Alive", clinical_HNSC$days_to_last_follow_up, clinical_HNSC$days_to_death) write.csv(clinical_HNSC, "tabela.csv", row.names = FALSE)
query_HNSC_all <- GDCquery( project = "TCGA-HNSC", data.category = "Transcriptome Profiling", experimental.strategy = "RNA-Seq", workflow.type = "STAR - Counts", data.type = "Gene Expression Quantification", barcode = listSamples, sample.type = c("Primary Tumor","Solid Tissue Normal"), access="open" )
output_HNSC <- getResults(query_HNSC_all) tumor <- output_HNSC$cases
query_HNSC_all <- GDCquery( project = "TCGA-HNSC", data.category = "Transcriptome Profiling", experimental.strategy = "RNA-Seq", workflow.type = "STAR - Counts", data.type = "Gene Expression Quantification", sample.type = "Primary Tumor", access="open", barcode=tumor )
GDCdownload (query_HNSC_all) tcga_hnsc_data <- GDCprepare(query_HNSC_all,summarizedExperiment = TRUE) hnsc_matrix <- assay(tcga_hnsc_data, "unstranded") hnsc_matrix[1:10,1:10]
gene_metadata <- as.data.frame(rowData(tcga_hnsc_data)) coldata <- as.data.frame (colData(tcga_hnsc_data))
dds <- DESeqDataSetFromMatrix(countData=hnsc_matrix, colData = coldata, design = ~1)
keep <- rowSums(counts(dds)) >=10 dds <- dds[keep,]
vsd <- vst(dds,blind=FALSE) hnsc_matrix_vst <- assay (vsd)
hnsc_tp53 <- hnsc_matrix_vst %>% as.data.frame() %>% rownames_to_column(var = 'gene_id') %>% gather(key = 'case_id', value = 'counts', -gene_id) %>% left_join(., gene_metadata, by = "gene_id") %>% filter(gene_name == "TP53")
median_value <- median(hnsc_tp53$counts)
hnsc_tp53$strata <- ifelse(hnsc_tp53$counts >= median_value, "HIGH", "LOW")
hnsc_tp53$case_id <- gsub('-01.*', '', hnsc_tp53$case_id) hnsc_tp53 <- merge(hnsc_tp53, clinical_HNSC, by.x = 'case_id', by.y = 'submitter_id')
fit <- survfit(Surv(overall_survival, deceased) ~ strata, data = hnsc_tp53) fit ggsurvplot(fit, data = hnsc_tp53, pval = T, risk.table = T)
fit2 <- survdiff(Surv(overall_survival, deceased) ~ strata, data = hnsc_tp53)