If anyone still needs help with this, you can use a SAF file as an option with featureCounts
. Here's a script from my VEBA suite https://github.com/jolespin/veba/blob/main/src/scripts/fasta_to_saf.py
Can easily adapt to not require soothsayer_utils below.
from __future__ import print_function, division
import sys, os, argparse
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser
from soothsayer_utils import get_file_object, pv
pd.options.display.max_colwidth = 100
__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2021.04.04"
def fasta_to_saf(path, compression="infer"):
"""
# GeneID Chr Start End Strand
# http://bioinf.wehi.edu.au/featureCounts/
# Useful:
import re
record_id = "lcl|NC_018632.1_cds_WP_039228897.1_1 [gene=dnaA] [locus_tag=MASE_RS00005] [protein=chromosomal replication initiator protein DnaA] [protein_id=WP_039228897.1] [location=410..2065] [gbkey=CDS]"
re.search("\[locus_tag=(\w+)\]", record_id).group(1)
# 'MASE_RS00005'
"""
saf_data = list()
if path == "stdin":
f = sys.stdin
else:
f = get_file_object(path, mode="read", compression=compression, verbose=False)
for id_record, seq in pv(SimpleFastaParser(f), "Reading sequences [{}]".format(path)):
id_record = id_record.split(" ")[0]
fields = [
id_record,
id_record,
1,
len(seq),
"+",
]
saf_data.append(fields)
if f is not sys.stdin:
f.close()
return pd.DataFrame(saf_data, columns=["GeneID", "Chr", "Start", "End", "Strand"])
def main(args=None):
script_directory = os.path.dirname(os.path.abspath( __file__ ))
script_filename = __program__
description = """
Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
usage = "{} -i <fasta_file> --compression infer > <saf_file>".format(__program__)
epilog = "Copyright 2021 Josh L. Espinoza (jespinoz@jcvi.org)"
parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-i","--fasta_file", default="stdin", type=str, help = "path/to/fasta.fa[.gz] [Default: stdin")
parser.add_argument("-c","--compression", type=str, default="infer", help = "Compression type {None, gzip, bz2, infer} [Default: infer")
opts = parser.parse_args()
opts.script_directory = script_directory
opts.script_filename = script_filename
df_saf = fasta_to_saf(opts.fasta_file, compression=opts.compression)
df_saf.to_csv(sys.stdout, sep="\t", index=None)
if __name__ == "__main__":
main()
Typically, one uses a GTF file to represent annotation. Is your contig annotated? If not, you can create a 'fake' GTF file with just one feature that goes from the first nt to the last nt of the contig. You won't need the sequence for this but the name, length of the contig. The specs for GTF file are here: http://mblab.wustl.edu/GTF22.html