What Pierre Lindenbaum posted works, but I couldn't be bothered to install another software to my space.
The stuff I wrote in Python was very slow for a ~GB file because of the shoddy way I wrote it - it loaded the ENTIRE file, then
did regex
capture. The bottleneck was loading. After some tinkering, I've made it faster (~2 minutes, compared to ~2 hours or so)
by using subprocess
module (essentially running shell commands via python). I'm sharing the script here in case anyone needs it.
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 4 10:50:46 2022
To use from the command line :
python "the_script_name.py" -i input_fasta.fasta -o output.bed
@author: ivanp
"""
import re
import argparse
import subprocess
import pandas as pd
import numpy as np
NUCLEOTIDE_PATTERN_GLOBAL = "[ATCG]+"
#%% FUNCTIONS
def findFastaNames(x,INPUT_FILE):
"""function that finds fasta names
x is the position of line in INPUT_FILE
"""
line_command = f"head -{x} {INPUT_FILE} | tail -n +{x}"
fasta_line = subprocess.check_output(line_command,shell=True).decode("utf-8")
fasta_line = fasta_line.replace(">","").split()[0]
return(fasta_line)
def getFastaPosition_Name(INPUT_FILE):
"""
Finds fasta key position and name and returns it
position - array of line position
name - array of names
example:
If INPUT_FILE consists of
">FastaBlasta mortimer and stuff" (line 0)
">FastaVlasta some more info" (line 12)
">FastaKasta some stuff also" (line 20)
This returns the following:
[0,12,20] and [FastaBlasta, FastaVlasta, FastaKasta]
"""
#this command outputs all lines that start with ">"
#and counts their position
GREP_COMMAND = f'grep ">" {INPUT_FILE} -n'
GREP_OUTPUT = subprocess.check_output(GREP_COMMAND,shell=True).decode("utf-8")
#this pattern finds which lines start with ">"
#its to be used on the output of GREP_COMMAND
#the following code finds extracts line positions of
#fasta names - the lines starting with ">"
#and extracts them into to arrays
GREP_OUTPUT = GREP_OUTPUT.split()
GREP_OUTPUT.pop(-1)
positions = [x[:x.find(":")] for x in GREP_OUTPUT]
positions = [int(x) for x in positions]
fastaKeyLines = np.array(positions,dtype=np.int64)
fastaKeyNames = np.array([findFastaNames(xi,INPUT_FILE) for xi in fastaKeyLines])
fastaKeyLines =np.append(fastaKeyLines,0) #this allows termination
return(fastaKeyLines,fastaKeyNames)
def findNucleotides(fasta_string):
"""
finds all nucleotide regions in a given fasta string
"""
spans = pd.DataFrame([what.span() for what in re.finditer(NUCLEOTIDE_PATTERN_GLOBAL,fasta_string)],
columns = ["start","end"])
return(spans)
def getNoGapDataFrame(index,fasta_start_array,INPUT_FILE):
"""
Returns noGapDataFrame from index position (integer)
and an array consisting of where FASTA files are found
and the FASTA file
The function first retrieves nucleotide sequence
with 'head' and 'tail'
And then uses "findNucleotides" to find uninterrupted
strings of ATCG and report it as a dataframe
"""
#the following block must be contained within a function
curr_line = fasta_start_array[index]
next_line = fasta_start_array[index+1]
#this selects lines from X+1 to Y-1
#defaults to from X+1 when next_line is 0
if next_line == 0:
NUCLEOTIDE_COMMAND = f"tail {INPUT_FILE} -n +{curr_line+1}"
else:
NUCLEOTIDE_COMMAND = f"head -{next_line-1} {INPUT_FILE} | tail -n +{curr_line+1}"
nucleotideString = subprocess.check_output(NUCLEOTIDE_COMMAND,shell=True).decode("utf-8")
#remove newline from nucleotides
nucleotideString = nucleotideString.replace("\n","")
#get the primitive no gap DataFrame
span_dataframe = findNucleotides(nucleotideString)
return(span_dataframe)
def formatNoGapDF(noGapDf,name):
"""
Formats NoGap for Craig Lowe's program
In a Nutshell:
[FASTA KEY] [START] [END] [INDEXED CONTIG ON FASTA KEY]
"""
noGapDf.insert(0,"key",name)
noGapDf["contig"] = noGapDf.index + 1
noGapDf["contig"] = noGapDf.key + "." + noGapDf.contig.astype(str)
return()
#%%MAIN FILE
def main():
"main function"
#parse main arguments
parser = argparse.ArgumentParser()
parser.add_argument("-i", help="Input file", required=True)
parser.add_argument("-o", help="Output file")
args = parser.parse_args()
INPUT_FILE = args.i
OUTPUT_FILE = args.o
if OUTPUT_FILE is None:
OUTPUT_FILE = "noGaps.bed"
#see functions above for specification
fasta_positions, fasta_names = getFastaPosition_Name(INPUT_FILE)
dataframes = [getNoGapDataFrame(index,fasta_positions,INPUT_FILE) for index in range(len(fasta_positions)-1)]
#quickly format it according to Craig Lowe's specification
[formatNoGapDF(dataframe,fasta_names[index]) for index,dataframe in enumerate(dataframes)]
dataframes = pd.concat(dataframes,ignore_index=True)
dataframes.to_csv(OUTPUT_FILE,sep="\t",header=None,index=False)
if __name__=="__main__":
main()