I created a Python script for doing this, although it appears to be quite slow. The main bottleneck appears to be the `show-aligns` calls, but I could be wrong about that.
The main function to use is delta_to_fasta
. It requires Pandas and Biopython as dependencies. If it is running really slowly, you can turn on verbose=True
 to monitor progress.
Update/edit: For really large query sequences (e.g. 25 GB, though not 500 MB), writing to the file sometimes happens only in chunks, so the output file can remain empty until after verbose mode reports that roughly 20-25 pairs of ref+query sequences should already have been written. I don't know what causes this buffering. In any case, because the script streams its output via generators, you can watch partial output as the writing progresses, and you still get some output if the run is canceled partway through — which is helpful given how incredibly slow this can be.
import subprocess
import re
import pandas as pd
from io import StringIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import write
def delta_file_to_dataframe(delta_filename, skiprows=4):
    """
    Parse a MUMmer .delta file into a DataFrame of alignment coordinates.

    Runs ``show-coords -c -l -r -T`` on the delta file and loads its
    tab-separated output. In show-coords output, '1'/'R' refer to the
    reference sequences and '2'/'Q' to the query sequences, matching the
    order the files were given to the ``nucmer``/``promer`` command.

    Parameters
    ----------
    delta_filename : str
        Path to the .delta file produced by nucmer/promer.
    skiprows : int, optional
        Number of header lines to skip in the show-coords output (default 4).

    Returns
    -------
    pandas.DataFrame
        One row per alignment with coordinate, length, identity, coverage,
        and sequence-id columns.

    Raises
    ------
    subprocess.CalledProcessError
        If show-coords exits with a non-zero status.
    """
    # check=True surfaces a missing binary / bad delta file as an error
    # instead of silently yielding an empty DataFrame downstream.
    result = subprocess.run(['show-coords', '-c', '-l', '-r', '-T', delta_filename],
                            stdout=subprocess.PIPE, check=True)
    coords_text = result.stdout.decode('utf-8')
    column_names = ['S1', 'E1', 'S2', 'E2', 'LEN 1', 'LEN 2', '% IDY',
                    'LEN R', 'LEN Q', 'COV R', 'COV Q', 'Ref Id', 'Query Id']
    # read_csv with sep='\t' is the supported spelling of the legacy read_table.
    return pd.read_csv(StringIO(coords_text), sep='\t', skiprows=skiprows,
                       index_col=False, header=None, names=column_names)
def get_alignments_from_ids(delta_filename, ref_id, query_id):
    """
    Extract every pairwise alignment between one reference and one query
    sequence from a .delta file.

    Runs ``show-aligns`` and parses its textual output into a DataFrame
    holding the alignment coordinates/directions plus the full (gapped)
    aligned sequences.

    Parameters
    ----------
    delta_filename : str
        Path to the .delta file.
    ref_id, query_id : str
        Sequence identifiers as they appear in the coords output.

    Returns
    -------
    pandas.DataFrame
        Columns: ref_direction, ref_start, ref_end, query_direction,
        query_start, query_end, ref_sequence, query_sequence. Coordinate
        values are strings captured from the show-aligns header lines.

    Raises
    ------
    subprocess.CalledProcessError
        If show-aligns exits with a non-zero status.
    """
    aligns_text = subprocess.run(['show-aligns', delta_filename, ref_id, query_id],
                                 stdout=subprocess.PIPE, check=True).stdout.decode('utf-8')
    # Raw strings: the patterns use regex escapes (\[, \|, \-) that raise
    # invalid-escape-sequence warnings in plain string literals.
    begin_alignment_regex = (r'-- BEGIN alignment \[ (?P<ref_direction>[+\-])1 '
                             r'(?P<ref_start>[0-9]+) - (?P<ref_end>[0-9]+) \| '
                             r'(?P<query_direction>[+\-])1 (?P<query_start>[0-9]+)'
                             r' - (?P<query_end>[0-9]+) \]\n\n')
    end_alignment_regex = (r'\n\n--\s+END alignment \[ [+\-]1 [0-9]+ - [0-9]+ '
                           r'\| [+\-]1 [0-9]+ - [0-9]+ \]')
    parse_regex = '(?s)' + begin_alignment_regex + '(?P<alignment_string>.*?)' + end_alignment_regex
    parsed = [match.groupdict() for match in re.finditer(parse_regex, aligns_text)]
    coordinate_columns = ['ref_direction', 'ref_start', 'ref_end',
                          'query_direction', 'query_start', 'query_end']
    if not parsed:
        # No alignments for this pair: return an empty frame with the expected
        # schema instead of crashing with a KeyError on 'alignment_string'.
        return pd.DataFrame(columns=coordinate_columns + ['ref_sequence', 'query_sequence'])
    alignments = pd.DataFrame(parsed)
    alignment_strings = list(alignments.pop('alignment_string'))
    ref_sequences, query_sequences = zip(*(clean_alignment_string(alignment_string)
                                           for alignment_string in alignment_strings))
    alignments['ref_sequence'] = ref_sequences
    alignments['query_sequence'] = query_sequences
    return alignments
def extract_ref_and_query_seq(cleaned_lines):
    """
    Pull the reference and query sequence fragments out of one pair of
    show-aligns body lines.

    Each pair looks like ``<coord>   <ref_seq>`` on one line followed by
    ``<coord>   <query_seq>`` on the next; the numeric coordinates are
    discarded and only the two sequence fragments are returned.

    Parameters
    ----------
    cleaned_lines : str
        A two-line (or longer) chunk of show-aligns alignment text.

    Returns
    -------
    tuple[str, str]
        (ref_fragment, query_fragment).

    Raises
    ------
    IndexError
        If the input does not contain a coordinate-prefixed pair of lines.
    """
    # Raw string avoids the invalid-escape-sequence warning for \s in Python 3.
    return re.findall(r'[0-9]+\s+(.*)\n[0-9]+\s+(.*)', cleaned_lines)[0]
def clean_alignment_string(alignment_string):
    """
    Strip show-aligns decoration from one alignment body and reassemble
    the two full aligned sequences.

    show-aligns interleaves coordinate "overlines" and mismatch-caret
    "underlines" with the actual sequence line pairs; this drops both,
    extracts each ref/query line pair, and concatenates the fragments.

    Parameters
    ----------
    alignment_string : str
        The text between one BEGIN/END alignment marker pair.

    Returns
    -------
    tuple[str, str]
        (ref_sequence, query_sequence), upper-cased, gaps ('.') retained.
    """
    # Raw strings: \| and \^ are regex escapes that warn in plain literals.
    overline_regex = r'[\n]?[\s0-9\|]+\n'
    underline_regex = r'\n[\s\^]+[\n]?'
    line_pairs = re.findall('(?s)' + overline_regex + '(.*?)' + underline_regex,
                            alignment_string)
    fragments = [extract_ref_and_query_seq(line_pair) for line_pair in line_pairs]
    ref_fragments, query_fragments = zip(*fragments)
    ref_sequence = ''.join(ref_fragments).upper()
    query_sequence = ''.join(query_fragments).upper()
    return ref_sequence, query_sequence
def get_seq_record(group, delta_filename, ref_sequences=True, query_sequences=True, ungap=False, verbose=False):
    """
    Generator yielding Bio.SeqRecord objects for every alignment of one
    (reference id, query id) pair.

    Parameters
    ----------
    group : tuple
        One item from ``DataFrame.groupby(['Ref Id', 'Query Id'])``:
        ``((ref_id, query_id), sub_dataframe)``.
    delta_filename : str
        Path to the .delta file (passed through to show-aligns).
    ref_sequences, query_sequences : bool
        Which side(s) of each alignment to yield; at least one must be True.
    ungap : bool
        If True, strip the '.' gap characters show-aligns inserts.
    verbose : bool
        If True, print progress for each alignment as it is emitted.

    Yields
    ------
    Bio.SeqRecord.SeqRecord
        Records whose id encodes ``seqid:start-end(direction)`` and whose
        description records the percent identity of the match.
    """
    assert ref_sequences or query_sequences, "The generator has nothing to do if neither reference nor query sequences are yielded"
    (ref_id, query_id), match_information = group
    percent_identities = list(match_information['% IDY'])
    alignments = get_alignments_from_ids(delta_filename, ref_id, query_id)
    # itertuples gives one namedtuple per row, avoiding repeated per-column
    # scalar lookups into the DataFrame inside the loop.
    for row_number, row in enumerate(alignments.itertuples(index=False)):
        if verbose:
            print("\tWriting match #{}".format(row_number + 1))
        pct_identity = percent_identities[row_number]
        if ref_sequences:
            ref_text = row.ref_sequence
            if ungap:
                # Seq.ungap() was removed in Biopython 1.80+; removing the
                # '.' gap characters from the plain string is equivalent.
                ref_text = ref_text.replace('.', '')
            ref_seq_id = '{}:{}-{}({})'.format(ref_id, row.ref_start,
                                               row.ref_end, row.ref_direction)
            ref_description = '{}% identity match to query'.format(pct_identity)
            yield SeqRecord(Seq(ref_text), id=ref_seq_id, description=ref_description)
        if query_sequences:
            query_text = row.query_sequence
            if ungap:
                query_text = query_text.replace('.', '')
            query_seq_id = '{}:{}-{}({})'.format(query_id, row.query_start,
                                                 row.query_end, row.query_direction)
            query_description = '{}% identity match to reference'.format(pct_identity)
            yield SeqRecord(Seq(query_text), id=query_seq_id, description=query_description)
def delta_to_fasta(delta_filename, output_filename, ref_sequences=True, query_sequences=True, ungap=True, output_format="fasta", verbose=False, skiprows=4):
    """
    Convert a MUMmer .delta file into a FASTA (or similar) file of the
    aligned reference and/or query subsequences.

    The created SeqRecord objects only have id, description, and sequence
    fields, so besides "fasta" the BioPython output formats supported here
    include e.g. "tab" but not "nexus" or "fastq". You could extend
    `get_seq_record` to populate extra SeqRecord attributes if another
    format is needed.

    Records are produced lazily end to end, so partial output appears in
    ``output_filename`` while the conversion is still running.

    Parameters
    ----------
    delta_filename : str
        Path to the input .delta file.
    output_filename : str
        Path of the sequence file to write.
    ref_sequences, query_sequences : bool
        Which side(s) of each alignment to write.
    ungap : bool
        If True (default), strip '.' gap characters from the output.
    output_format : str
        Any Bio.SeqIO format writable from id/description/sequence alone.
    verbose : bool
        If True, print per-group and per-match progress.
    skiprows : int
        Header lines to skip in the show-coords output.
    """
    dataframe = delta_file_to_dataframe(delta_filename, skiprows=skiprows)
    grouped = dataframe.groupby(by=['Ref Id', 'Query Id'])

    def iterate_all_records():
        # Stream each (ref, query) group's records in turn, so Bio.SeqIO.write
        # consumes a single lazy iterator and show-aligns runs one pair at a time.
        for group_counter, group in enumerate(grouped, 1):
            if verbose:
                print('Starting group #{}'.format(group_counter))
            yield from get_seq_record(group, delta_filename,
                                      ref_sequences=ref_sequences,
                                      query_sequences=query_sequences,
                                      ungap=ungap, verbose=verbose)

    write(iterate_all_records(), output_filename, output_format)
Would the 'show-aligns' command in the MUMmer package help? I used it for relatively small regions. By the way, from the MUMmer output (e.g. 'show-snps') you can get SNP/indel positions with flanking regions; I used that for grabbing sequences and piping them into Primer3 to design primers.
Hi Vitis — thanks for the suggestion. Yes, it works for small insertions, but I'm interested in multi-kb insertions, and those are split into different "aligns" by MUMmer...