What Pierre Lindenbaum posted works, but I couldn't be bothered to install another software to my space.
The stuff I wrote in Python was very slow for a ~GB file because of the shoddy way I wrote it - it loaded the ENTIRE file, then
did regex
capture. The bottleneck was loading. After some tinkering, I've made it faster (~2 minutes, compared to ~2 hours or so)
by using subprocess
module (essentially running shell commands via python). I'm sharing the script here in case anyone needs it.
"""
Created on Mon Jul 4 10:50:46 2022
To use from the command line :
python "the_script_name.py" -i input_fasta.fasta -o output.bed
@author: ivanp
"""
import re
import argparse
import subprocess
import pandas as pd
import numpy as np
NUCLEOTIDE_PATTERN_GLOBAL = "[ATCG]+"
def findFastaNames(x,INPUT_FILE):
"""function that finds fasta names
x is the position of line in INPUT_FILE
"""
line_command = f"head -{x} {INPUT_FILE} | tail -n +{x}"
fasta_line = subprocess.check_output(line_command,shell=True).decode("utf-8")
fasta_line = fasta_line.replace(">","").split()[0]
return(fasta_line)
def getFastaPosition_Name(INPUT_FILE):
"""
Finds fasta key position and name and returns it
position - array of line position
name - array of names
example:
If INPUT_FILE consists of
">FastaBlasta mortimer and stuff" (line 0)
">FastaVlasta some more info" (line 12)
">FastaKasta some stuff also" (line 20)
This returns the following:
[0,12,20] and [FastaBlasta, FastaVlasta, FastaKasta]
"""
GREP_COMMAND = f'grep ">" {INPUT_FILE} -n'
GREP_OUTPUT = subprocess.check_output(GREP_COMMAND,shell=True).decode("utf-8")
GREP_OUTPUT = GREP_OUTPUT.split()
GREP_OUTPUT.pop(-1)
positions = [x[:x.find(":")] for x in GREP_OUTPUT]
positions = [int(x) for x in positions]
fastaKeyLines = np.array(positions,dtype=np.int64)
fastaKeyNames = np.array([findFastaNames(xi,INPUT_FILE) for xi in fastaKeyLines])
fastaKeyLines =np.append(fastaKeyLines,0)
return(fastaKeyLines,fastaKeyNames)
def findNucleotides(fasta_string):
"""
finds all nucleotide regions in a given fasta string
"""
spans = pd.DataFrame([what.span() for what in re.finditer(NUCLEOTIDE_PATTERN_GLOBAL,fasta_string)],
columns = ["start","end"])
return(spans)
def getNoGapDataFrame(index,fasta_start_array,INPUT_FILE):
"""
Returns noGapDataFrame from index position (integer)
and an array consisting of where FASTA files are found
and the FASTA file
The function first retrieves nucleotide sequence
with 'head' and 'tail'
And then uses "findNucleotides" to find uninterrupted
strings of ATCG and report it as a dataframe
"""
curr_line = fasta_start_array[index]
next_line = fasta_start_array[index+1]
if next_line == 0:
NUCLEOTIDE_COMMAND = f"tail {INPUT_FILE} -n +{curr_line+1}"
else:
NUCLEOTIDE_COMMAND = f"head -{next_line-1} {INPUT_FILE} | tail -n +{curr_line+1}"
nucleotideString = subprocess.check_output(NUCLEOTIDE_COMMAND,shell=True).decode("utf-8")
nucleotideString = nucleotideString.replace("\n","")
span_dataframe = findNucleotides(nucleotideString)
return(span_dataframe)
def formatNoGapDF(noGapDf,name):
"""
Formats NoGap for Craig Lowe's program
In a Nutshell:
[FASTA KEY] [START] [END] [INDEXED CONTIG ON FASTA KEY]
"""
noGapDf.insert(0,"key",name)
noGapDf["contig"] = noGapDf.index + 1
noGapDf["contig"] = noGapDf.key + "." + noGapDf.contig.astype(str)
return()
def main():
"main function"
parser = argparse.ArgumentParser()
parser.add_argument("-i", help="Input file", required=True)
parser.add_argument("-o", help="Output file")
args = parser.parse_args()
INPUT_FILE = args.i
OUTPUT_FILE = args.o
if OUTPUT_FILE is None:
OUTPUT_FILE = "noGaps.bed"
fasta_positions, fasta_names = getFastaPosition_Name(INPUT_FILE)
dataframes = [getNoGapDataFrame(index,fasta_positions,INPUT_FILE) for index in range(len(fasta_positions)-1)]
[formatNoGapDF(dataframe,fasta_names[index]) for index,dataframe in enumerate(dataframes)]
dataframes = pd.concat(dataframes,ignore_index=True)
dataframes.to_csv(OUTPUT_FILE,sep="\t",header=None,index=False)
if __name__=="__main__":
main()