Entering edit mode
2.3 years ago
O.rka
▴
740
I want to calculate the mean, 25th, 50th, and 75th percentiles of quality values for each region. FastQC kind of does this but it bins the positions into ranges plus it's very clunky as I'm just looking for a single table output. I don't want to actually trim the sequences, just calculate the stats per position.
#Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile
1 33.23305037285801 33.0 33.0 34.0 32.0 34.0
2 33.11358515751122 33.0 33.0 34.0 32.0 34.0
3 33.41188336801493 34.0 33.0 34.0 32.0 35.0
4 33.451638150002225 34.0 33.0 34.0 32.0 35.0
5 33.23187315792592 34.0 33.0 34.0 32.0 35.0
6 35.61086578407559 37.0 34.0 37.0 33.0 37.0
7 35.60032407544543 37.0 34.0 37.0 33.0 37.0
8 33.54016276899836 34.0 33.0 34.0 32.0 35.0
9 35.295348346391386 37.0 34.0 37.0 33.0 37.0
10-14 36.5042000677587 37.4 36.8 37.4 35.2 37.4
15-19 37.70399836527496 38.0 38.0 38.0 38.0 38.0
20-24 37.94843261711519 38.2 38.2 38.2 38.2 38.2
25-29 37.79203177994343 38.2 38.2 38.2 37.2 38.2
30-34 38.7642066029562 39.0 39.0 39.0 38.6 39.0
35-39 38.04057269990669 38.4 38.2 38.4 38.0 38.6
40-44 38.29040631155675 38.6 38.6 38.6 38.0 38.8
45-49 37.678851462181015 38.0 38.0 38.2 38.0 38.2
50-54 37.8440563952369 38.0 38.0 38.2 38.0 39.0
55-59 37.410261833706066 38.2 38.0 38.2 36.6 38.8
60-64 38.184713469541904 39.0 38.6 39.0 37.2 39.0
65-69 38.30075224196152 39.0 38.6 39.0 37.8 39.0
70-74 38.196683572401845 39.0 38.6 39.0 37.4 39.0
75-79 38.31633614241917 39.0 39.0 39.0 37.4 39.0
80-84 37.75569515617826 38.2 38.2 39.0 36.8 39.0
85-89 38.02949160606644 38.6 38.0 38.8 37.0 39.0
90-94 37.90482609524726 39.0 38.2 39.0 37.2 39.0
95-99 37.48980555473274 38.6 38.0 38.8 36.0 39.0
100-104 37.545679503176885 38.8 38.0 39.0 36.6 39.0
105-109 37.70546424895215 38.8 38.0 39.0 37.0 39.0
110-114 37.91915905152624 39.0 39.0 39.0 37.0 39.0
115-119 37.797473303810776 39.0 38.4 39.0 37.0 39.0
120-124 37.77431301744694 39.0 38.6 39.0 37.0 39.0
125-129 37.246687182496785 38.4 38.2 39.0 35.8 39.0
130-134 37.28997342414728 38.0 38.0 38.8 35.8 38.8
135-139 37.24109704305455 38.4 38.0 39.0 35.0 39.0
140-144 37.62798337319865 38.2 38.0 39.0 36.8 39.0
145-149 37.768444484145206 39.0 38.4 39.0 37.0 39.0
150-154 37.40824022682504 38.6 38.0 39.0 36.2 39.0
155-159 36.842611801863185 38.2 37.6 38.4 34.8 38.4
160-164 37.2724359069299 38.0 37.8 38.8 36.4 38.8
165-169 36.769489421496175 38.0 37.4 38.2 35.0 39.0
170-174 36.99442763333284 38.0 37.6 38.8 36.2 39.0
175-179 36.844243379641284 38.0 37.0 38.2 36.2 38.6
180-184 36.74144648173107 38.0 37.0 38.0 36.4 38.0
185-189 36.416224923725174 38.0 37.0 38.0 34.4 38.0
190-194 35.93546455442172 37.0 37.0 38.0 33.6 38.0
195-199 35.87151065811105 37.0 37.0 37.0 33.0 37.6
200-204 35.521310668108235 37.0 37.0 37.0 32.2 37.0
205-209 35.660209163346615 37.0 37.0 37.0 33.8 37.0
210-214 34.99553060990239 37.0 37.0 37.0 31.2 37.0
215-219 35.589499761178345 37.0 37.0 37.0 33.0 37.0
220-224 35.396030098935114 37.0 37.0 37.0 32.6 37.0
225-229 35.09452538359572 37.0 36.8 37.0 31.4 37.0
230-234 34.99679381914721 37.0 36.2 37.0 30.8 37.0
235-239 34.360394388986805 37.0 35.4 37.0 26.6 37.0
240-244 34.69519177009435 37.0 35.2 37.0 29.2 37.0
245-249 34.53139893955775 37.0 36.0 37.0 28.4 37.0
250-251 32.04626579647951 36.5 29.5 37.0 19.0 37.0
Here is a similar output from FastQC but there are a bunch of other files, I have to parse this out of a larger text file, and its binned.
Edit: I ended up writing one last night
#!/usr/bin/env python
from __future__ import print_function, division
import sys, os, argparse, glob
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
import pyfastx
pd.options.display.max_colwidth = 100
__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2022.8.9"
def statistics(fp):
# Build table
quality_table = list()
failed_reads = list()
for read in tqdm(pyfastx.Fastq(fp), desc="Reading fastq filepath: {}".format(fp), unit=" read"):
try:
quality = np.asarray(read.quali).astype(int)
quality_table.append(quality)
except UnicodeDecodeError as e:
failed_reads.append(read.name)
quality_table = np.stack(quality_table)
df_minmeanmax = pd.DataFrame([
pd.Series(np.min(quality_table, axis=0), name="min"),
pd.Series(np.mean(quality_table, axis=0), name="mean"),
pd.Series(np.max(quality_table, axis=0), name="max"),
]).T
df_quantiles = pd.DataFrame(np.quantile(quality_table, q=[0.25,0.5,0.75], axis=0).T, columns = ["q=0.25", "q=0.5", "q=0.75"])
df_output = pd.concat([df_minmeanmax, df_quantiles], axis=1)
df_output.index = df_output.index.values + 1
df_output.index.name = "position"
return df_output
def main(args=None):
# Path info
script_directory = os.path.dirname(os.path.abspath( __file__ ))
script_filename = __program__
# Path info
description = """
Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
usage = "{} file.fq > <output_table>".format(__program__)
epilog = "Copyright 2021 Josh L. Espinoza (jespinoz@jcvi.org)"
# Parser
parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
# Pipeline
parser.add_argument("fastq_filepath", type=str, nargs="+", help = "path/to/fastq[.gz] file(s)")
parser.add_argument("-o","--output", default="stdout", type=str, help = "Output filepath [Default: stdout]")
parser.add_argument("-f","--field", default="q=0.25", type=str, help = "Field when batch [min, mean, max, q=0.25, q=0.5, q=0.75] [Default: q=0.25]")
parser.add_argument("-b","--basename", action="store_true", help = "Output basename for multiple fastq files")
# Options
opts = parser.parse_args()
opts.script_directory = script_directory
opts.script_filename = script_filename
# Build table
if len(opts.fastq_filepath) == 1:
df_output = statistics(opts.fastq_filepath[0])
else:
field_table = dict()
for fp in opts.fastq_filepath:
name = fp
if opts.basename:
name = fp.split("/")[-1]
field_table[name] = statistics(fp)[opts.field]
df_output = pd.DataFrame(field_table)
if opts.output == "stdout":
opts.output = sys.stdout
df_output.to_csv(opts.output, sep="\t")
if __name__ == "__main__":
main()
fastQC
will not group data if you use--nogroup
option.Thanks, that's good to know. I ended up writing one last night and updated the question.