How to calculate position-specific fastq stats?
0
0
Entering edit mode
2.3 years ago
O.rka ▴ 740

I want to calculate the mean, 25th, 50th, and 75th percentiles of quality values for each region. FastQC kind of does this but it bins the positions into ranges plus it's very clunky as I'm just looking for a single table output. I don't want to actually trim the sequences, just calculate the stats per position.

#Base   Mean    Median  Lower Quartile  Upper Quartile  10th Percentile 90th Percentile
1   33.23305037285801   33.0    33.0    34.0    32.0    34.0
2   33.11358515751122   33.0    33.0    34.0    32.0    34.0
3   33.41188336801493   34.0    33.0    34.0    32.0    35.0
4   33.451638150002225  34.0    33.0    34.0    32.0    35.0
5   33.23187315792592   34.0    33.0    34.0    32.0    35.0
6   35.61086578407559   37.0    34.0    37.0    33.0    37.0
7   35.60032407544543   37.0    34.0    37.0    33.0    37.0
8   33.54016276899836   34.0    33.0    34.0    32.0    35.0
9   35.295348346391386  37.0    34.0    37.0    33.0    37.0
10-14   36.5042000677587    37.4    36.8    37.4    35.2    37.4
15-19   37.70399836527496   38.0    38.0    38.0    38.0    38.0
20-24   37.94843261711519   38.2    38.2    38.2    38.2    38.2
25-29   37.79203177994343   38.2    38.2    38.2    37.2    38.2
30-34   38.7642066029562    39.0    39.0    39.0    38.6    39.0
35-39   38.04057269990669   38.4    38.2    38.4    38.0    38.6
40-44   38.29040631155675   38.6    38.6    38.6    38.0    38.8
45-49   37.678851462181015  38.0    38.0    38.2    38.0    38.2
50-54   37.8440563952369    38.0    38.0    38.2    38.0    39.0
55-59   37.410261833706066  38.2    38.0    38.2    36.6    38.8
60-64   38.184713469541904  39.0    38.6    39.0    37.2    39.0
65-69   38.30075224196152   39.0    38.6    39.0    37.8    39.0
70-74   38.196683572401845  39.0    38.6    39.0    37.4    39.0
75-79   38.31633614241917   39.0    39.0    39.0    37.4    39.0
80-84   37.75569515617826   38.2    38.2    39.0    36.8    39.0
85-89   38.02949160606644   38.6    38.0    38.8    37.0    39.0
90-94   37.90482609524726   39.0    38.2    39.0    37.2    39.0
95-99   37.48980555473274   38.6    38.0    38.8    36.0    39.0
100-104 37.545679503176885  38.8    38.0    39.0    36.6    39.0
105-109 37.70546424895215   38.8    38.0    39.0    37.0    39.0
110-114 37.91915905152624   39.0    39.0    39.0    37.0    39.0
115-119 37.797473303810776  39.0    38.4    39.0    37.0    39.0
120-124 37.77431301744694   39.0    38.6    39.0    37.0    39.0
125-129 37.246687182496785  38.4    38.2    39.0    35.8    39.0
130-134 37.28997342414728   38.0    38.0    38.8    35.8    38.8
135-139 37.24109704305455   38.4    38.0    39.0    35.0    39.0
140-144 37.62798337319865   38.2    38.0    39.0    36.8    39.0
145-149 37.768444484145206  39.0    38.4    39.0    37.0    39.0
150-154 37.40824022682504   38.6    38.0    39.0    36.2    39.0
155-159 36.842611801863185  38.2    37.6    38.4    34.8    38.4
160-164 37.2724359069299    38.0    37.8    38.8    36.4    38.8
165-169 36.769489421496175  38.0    37.4    38.2    35.0    39.0
170-174 36.99442763333284   38.0    37.6    38.8    36.2    39.0
175-179 36.844243379641284  38.0    37.0    38.2    36.2    38.6
180-184 36.74144648173107   38.0    37.0    38.0    36.4    38.0
185-189 36.416224923725174  38.0    37.0    38.0    34.4    38.0
190-194 35.93546455442172   37.0    37.0    38.0    33.6    38.0
195-199 35.87151065811105   37.0    37.0    37.0    33.0    37.6
200-204 35.521310668108235  37.0    37.0    37.0    32.2    37.0
205-209 35.660209163346615  37.0    37.0    37.0    33.8    37.0
210-214 34.99553060990239   37.0    37.0    37.0    31.2    37.0
215-219 35.589499761178345  37.0    37.0    37.0    33.0    37.0
220-224 35.396030098935114  37.0    37.0    37.0    32.6    37.0
225-229 35.09452538359572   37.0    36.8    37.0    31.4    37.0
230-234 34.99679381914721   37.0    36.2    37.0    30.8    37.0
235-239 34.360394388986805  37.0    35.4    37.0    26.6    37.0
240-244 34.69519177009435   37.0    35.2    37.0    29.2    37.0
245-249 34.53139893955775   37.0    36.0    37.0    28.4    37.0
250-251 32.04626579647951   36.5    29.5    37.0    19.0    37.0

Here is a similar output from FastQC but there are a bunch of other files, I have to parse this out of a larger text file, and its binned.

Edit: I ended up writing one last night

#!/usr/bin/env python
from __future__ import print_function, division
import sys, os, argparse, glob
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm
import pyfastx


pd.options.display.max_colwidth = 100
__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2022.8.9"

def statistics(fp):

    # Build table
    quality_table = list()
    failed_reads = list()

    for read in tqdm(pyfastx.Fastq(fp), desc="Reading fastq filepath: {}".format(fp), unit=" read"):
        try:
            quality = np.asarray(read.quali).astype(int)
            quality_table.append(quality)
        except UnicodeDecodeError as e:
            failed_reads.append(read.name)

    quality_table = np.stack(quality_table)

    df_minmeanmax = pd.DataFrame([
        pd.Series(np.min(quality_table, axis=0), name="min"),
        pd.Series(np.mean(quality_table, axis=0), name="mean"),
        pd.Series(np.max(quality_table, axis=0), name="max"),
    ]).T
    df_quantiles = pd.DataFrame(np.quantile(quality_table, q=[0.25,0.5,0.75], axis=0).T, columns = ["q=0.25", "q=0.5", "q=0.75"])
    df_output = pd.concat([df_minmeanmax, df_quantiles], axis=1)
    df_output.index = df_output.index.values + 1
    df_output.index.name = "position"

    return df_output

def main(args=None):
    # Path info
    script_directory  =  os.path.dirname(os.path.abspath( __file__ ))
    script_filename = __program__
    # Path info
    description = """
    Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
    usage = "{} file.fq > <output_table>".format(__program__)
    epilog = "Copyright 2021 Josh L. Espinoza (jespinoz@jcvi.org)"

    # Parser
    parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
    # Pipeline
    parser.add_argument("fastq_filepath",  type=str, nargs="+", help = "path/to/fastq[.gz] file(s)")
    parser.add_argument("-o","--output", default="stdout", type=str, help = "Output filepath [Default: stdout]")
    parser.add_argument("-f","--field", default="q=0.25",  type=str, help = "Field when batch [min, mean, max, q=0.25, q=0.5, q=0.75] [Default: q=0.25]")
    parser.add_argument("-b","--basename",  action="store_true", help = "Output basename for multiple fastq files")

    # Options
    opts = parser.parse_args()
    opts.script_directory  = script_directory
    opts.script_filename = script_filename

    # Build table
    if len(opts.fastq_filepath) == 1:
        df_output = statistics(opts.fastq_filepath[0])
    else:
        field_table = dict()
        for fp in opts.fastq_filepath:
            name = fp
            if opts.basename:
                name = fp.split("/")[-1]
            field_table[name] = statistics(fp)[opts.field]
        df_output = pd.DataFrame(field_table)

    if opts.output == "stdout":
        opts.output = sys.stdout
    df_output.to_csv(opts.output, sep="\t")

if __name__ == "__main__":
    main()
rnaseq fastq metagenomics genomics • 667 views
ADD COMMENT
0
Entering edit mode

fastQC will not group data if you use --nogroup option.

ADD REPLY
0
Entering edit mode

Thanks, that's good to know. I ended up writing one last night and updated the question.

ADD REPLY

Login before adding your answer.

Traffic: 2282 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6