Hello, I have a question when you are doing an RNA-Seq analysis of a bacteria, and I get the data after aligning it with bwa and counting the raw counts with featureCounts. I have normalized it with DESEQ2, which I understand is a normalization by library size, should I do another normalization by gene size or is it not necessary?
Also I used a code like this to get the foldchange at 1.5, 1.8 and 2 with a p-value > 0.05 it is ok doing it this way?
# Cargar las librerías necesarias
library(DESeq2)
library(openxlsx)
library(ggplot2)
# DESeq.ds already created and normalized
# Definir el diseño experimental
design(DESeq.ds) <- ~ condition
# Realizar el análisis de expresión diferencial
DESeq.ds <- DESeq(DESeq.ds)
# Extraer los resultados para la comparación Wt06 vs RpoS06
results_Wt06_vs_RpoS06 <- results(DESeq.ds, contrast = c("condition", "Wt06", "RpoS06"))
# Extraer los resultados para la comparación Wt25 vs RpoS25
results_Wt25_vs_RpoS25 <- results(DESeq.ds, contrast = c("condition", "Wt25", "RpoS25"))
# Convertir los resultados a data frame y añadir nombres de genes
results_df_Wt06_vs_RpoS06 <- as.data.frame(results_Wt06_vs_RpoS06)
results_df_Wt06_vs_RpoS06$Gene <- rownames(results_df_Wt06_vs_RpoS06)
results_df_Wt06_vs_RpoS06 <- results_df_Wt06_vs_RpoS06[, c("Gene", colnames(results_df_Wt06_vs_RpoS06)[-ncol(results_df_Wt06_vs_RpoS06)])]
results_df_Wt25_vs_RpoS25 <- as.data.frame(results_Wt25_vs_RpoS25)
results_df_Wt25_vs_RpoS25$Gene <- rownames(results_df_Wt25_vs_RpoS25)
results_df_Wt25_vs_RpoS25 <- results_df_Wt25_vs_RpoS25[, c("Gene", colnames(results_df_Wt25_vs_RpoS25)[-ncol(results_df_Wt25_vs_RpoS25)])]
# Guardar los resultados sin filtrar en un archivo Excel
write.xlsx(results_df_Wt06_vs_RpoS06, file = "Wt06_vs_RpoS06_unfiltered.xlsx", rownames = FALSE)
write.xlsx(results_df_Wt25_vs_RpoS25, file = "Wt25_vs_RpoS25_unfiltered.xlsx", rownames = FALSE)
# Función para filtrar, contar y devolver los resultados basados en el valor p
filter_and_count <- function(results_df, fold_change_threshold) {
filtered_results <- results_df[which(results_df$pvalue < 0.05 & abs(results_df$log2FoldChange) > fold_change_threshold), ]
num_overexpressed <- sum(filtered_results$log2FoldChange > 0)
num_underexpressed <- sum(filtered_results$log2FoldChange < 0)
# Mostrar el número de genes sobreexpresados e infraexpresados
cat("Para el umbral de fold change >", fold_change_threshold, ":\n")
cat("Número de genes sobreexpresados:", num_overexpressed, "\n")
cat("Número de genes infraexpresados:", num_underexpressed, "\n")
return(list(filtered_results = filtered_results, num_overexpressed = num_overexpressed, num_underexpressed = num_underexpressed))
}
# Aplicar los filtros y contar los genes
fc_thresholds <- c(1.5, 1.8, 2)
comparisons <- c("Wt06_vs_RpoS06", "Wt25_vs_RpoS25")
results_list <- list()
for (comparison in comparisons) {
results_df <- get(paste0("results_df_", comparison))
for (fc in fc_thresholds) {
result <- filter_and_count(results_df, fc)
results_list[[paste(comparison, "FC >", fc)]] <- result
output_file <- paste0(comparison, "_filtered_pvalue_0.05_fc_", fc, ".xlsx")
write.xlsx(result$filtered_results, file = output_file, rownames = FALSE)
}
}
See my answer to a similar quesiton here: Why employ normalization methods, and how can they be utilized in DEG analysis?