I am using BALLGOWN for DEG analysis of RNA-seq data obtained from ENA. During analysis what is the correct way to give pheno_data. Do it have to be in any particular order?
I have set it as:
"ids","cell","rep"
"SRR4011890","hESC","rep1"
"SRR4011891","hESC","rep2"
"SRR4011896","hESC-CM","rep1"
"SRR4011897","hESC-CM","rep2"
"SRR4011898","hESC","rep3"
"SRR4011899","hESC","rep4"
"SRR4011904","hESC-CM","rep3"
"SRR4011905","hESC-CM","rep4"
After this I am using the following code:
library(ballgown)
library(genefilter)
library(dplyr)
library(devtools)
pheno_data = read.csv("compare.csv")
bg = ballgown (dataDir = "/home/bioinfo/Documents/Work/Data/PRJNA338181/expression", samplePattern = "SRR", pData = pheno_data)
bg_table = texpr (bg, 'all')
bg_table
bg_gene_name = unique (bg_table[,9:10])
save (bg, file = 'bg.rda')
result_transcripts = stattest(bg, feature = "transcript", covariate = "cell", getFC = TRUE, meas = "FPKM")
result_genes = stattest(bg, feature = "gene", covariate = "cell", getFC = TRUE, meas = "FPKM")
result_genes = merge (result_genes, bg_gene_name, by.x = c("id"), by.y = c ("gene_id"))
write.table (result_transcripts, "compare_transcript_resuls.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table (result_genes, "compare_genes_result.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
hist(result_genes$pval, main="Histogram of p-values before filter", xlab="p-val")
hist(result_genes$qval, main="Histogram of q-values before filter", xlab="q-val")
bg_filt = subset (bg, "rowVars(texpr(bg))>1", genomesubset = TRUE)
bg_filt_table = texpr (bg_filt, 'all') bg_filt_gene_names = unique (bg_filt_table [,9:10])
result_transcripts = stattest(bg_filt, feature = "transcript", covariate = "cell", getFC = TRUE, meas = "FPKM")
result_genes = stattest(bg_filt, feature = "gene", covariate = "cell", getFC = TRUE, meas = "FPKM")
result_genes = merge (result_genes, bg_filt_gene_names, by.x = c("id"), by.y = c ("gene_id"))
write.table (result_transcripts, "compare_transcript_result_filt.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table (result_genes, "compare_genes_result_filt.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
sig_p_transcripts = subset (result_transcripts, result_transcripts$pval<0.05)
sig_p_genes = subset (result_genes, result_genes$pval<0.05)
write.table (sig_p_transcripts, "compare_transcript_sigp_005.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
write.table (sig_p_genes, "compare_gene_sigp_005.tsv", sep = "\t", quote = FALSE, row.names = FALSE)
Can anyone help with the R script that I am using?
I am asking this question because on changing the names of data sets and ordering the phenodata content by cell type gives a different result.
Can you please elaborate?