I don't know why my rarefaction curve for most samples does not reach saturation. Please how can I improve te curve? The curve does not look like most rarefaction curves I see in publications, Please help and I am relatively new to microbiome study.
Below is the code I used: mind in this code the OTU are actually ASVs
# Metadata #
samp_dat_bac = read.delim("C:/JASON WALLACE LAB/Manuscript restart1/raw files/metadata.tsv")
samp_dat_bac
rownames(samp_dat_bac) <- samp_dat_bac$Samples #row names must match OTU table headers
SAMP.bac <- phyloseq::sample_data(samp_dat_bac)
SAMP.bac
# OTU table #
otu_bac <- read.csv("C:/JASON WALLACE LAB/Manuscript restart1/raw files/asv-table.csv")
otu_bac
rownames(otu_bac) <- otu_bac$OTU
otu_bac <- otu_bac[,-1]
otu_bac
OTU.bac <- phyloseq::otu_table(otu_bac, taxa_are_rows = TRUE)
OTU.bac
any(is.na(otu_bac)) # no NA in the OTU table
# Taxonomy #
#taxonomy.bac <- read.csv("C:/JASON WALLACE LAB/Manuscript restart1/raw files/taxonomy.csv")
taxonomy.bac = read.delim("C:/JASON WALLACE LAB/Manuscript restart1/raw files/taxonomy.tsv")
taxonomy.bac$Label
head(taxonomy.bac)
#replace the uncultured in the Genus column, with uncultured and phylum name
for (i in 1:nrow(taxonomy.bac)) {
if (taxonomy.bac$Genus[i] == "uncultured") {
taxonomy.bac$Genus[i] <- paste0("uncultured ", taxonomy.bac$Phylum[i], sep = " ")
}
}
head(taxonomy.bac)
rownames(taxonomy.bac) <- taxonomy.bac$OTU
TAX.bac <- phyloseq::tax_table(as.matrix(taxonomy.bac))
TAX.bac
head(TAX.bac)
all.equal(rownames(samp_dat_bac), colnames(otu_bac))
# Fasta #
FASTA.bac <- readDNAStringSet("C:/JASON WALLACE LAB/Manuscript restart1/raw files/dna-sequences.fasta", format="fasta", seek.first.rec=TRUE, use.names=TRUE)
FASTA.bac
# Phylogentic tree #
tree <- phyloseq::read_tree("C:/JASON WALLACE LAB/Manuscript restart1/raw files/rooted_tree.nwk")
###### Create Initial Phyloseq object #####
# Merge reads into Phyloseq object #
bac.unedited <- phyloseq::phyloseq(OTU.bac, TAX.bac, FASTA.bac, SAMP.bac, tree)
bac.unedited
#i decided to change the name to ASVs for easy identification
names(FASTA.bac) <- taxa_names(bac.unedited)
bac.unedited <- merge_phyloseq(bac.unedited, FASTA.bac)
taxa_names(bac.unedited) <- paste0("ASV", seq(ntaxa(bac.unedited)))
bac.unedited@tax_table
#check if the taxa_names of phyloseq and rownames for otu table are the same
identical(taxa_names(bac.unedited),rownames(bac.unedited@otu_table))
bac.unedited #41255 taxa
###### Taxonomy filtering #####
# remove OTUs that are mitochondria, chloroplast, or Unassigned at the kingdom level
bac_no_chloro <- bac.unedited %>%
phyloseq::subset_taxa(Order != "Chloroplast") %>%
phyloseq::subset_taxa(Family != "Mitochondria") %>%
phyloseq::subset_taxa(Kingdom == "Bacteria") %>%
phyloseq::subset_taxa(Phylum != "Unclassified ")
bac_no_chloro
#First, create a list of the samples that you want to remove
Samples_toRemove <- c("Sample328" , "Sample15",
"Sample273" , "Sample314")
#To see what samples get removed, run the following; note, I have a column called "SampleID"
subset_samples(bac_no_chloro, Samples %in% Samples_toRemove)
#This will return a ps object that contains the samples you want to remove
#To remove those from your phyloseq object
bac_no_chloro <- subset_samples(bac_no_chloro, !(Samples %in% Samples_toRemove))
bac_no_chloro
# Filter any Genus that is in less than 2 samples and has less than 14 reads
bac_no_chloro <- tax_glom(bac_no_chloro, taxrank = "Genus", NArm = FALSE)
filter_compare_reads <- function(bac_no_chloro){
prevdf = apply(X = otu_table(bac_no_chloro),
MARGIN = ifelse(taxa_are_rows(bac_no_chloro), yes = 1, no = 0),
FUN = function(x){sum(x > 0)})
# Add taxonomy and total read counts to this data.frame
prevdf = data.frame(Prevalence = prevdf,
TotalAbundance = taxa_sums(bac_no_chloro),
tax_table(bac_no_chloro))
prevdf1 = subset(prevdf, Phylum %in% get_taxa_unique(bac_no_chloro, "Phylum"))
# Total taxa sums
TotalReads <- sum(prevdf1$TotalAbundance)
readThreshold <- 5
prevalenceThreshold = 2 #We have such a wide variety of locations and tissues that we don't want to lose alot 0.01 * nsamples(psc)
prevalenceThreshold
# taxa must be in at least two samples and make up .0001% of total reads
keepTaxa = rownames(prevdf1)[(prevdf1$Prevalence >= prevalenceThreshold & prevdf1$TotalAbundance >= readThreshold)]
phy_filt = prune_taxa(keepTaxa, bac_no_chloro)
print(sum(sample_sums(phy_filt)))
return(phy_filt)
}
filter_compare_reads
sum(sample_sums(bac_no_chloro)) #5839840
phy_asv <- filter_compare_reads(bac_no_chloro) #4294585
phy_asv
### How many total reads do we have?
sample_sums(phy_asv)
phy_filtered_asv <- prune_samples(sample_sums(phy_asv) >= 500, phy_asv)
phy_filtered_asv
sample_sums(phy_filtered_asv)
I actually clustered using ASV of DADA2. I have done some filtering also like filtering below 10,000 reads. Nonetheless, I still got this plot.
I can't see where you plot this curve in the code you uploaded. (moved it to the main post for clarity)