Could do something like this in python.
The below implementation:
- uses Hamming distance to rank order the changes in terms of distance.
- can weight transitions and transversions according to user input or will both be weighted as 1 if no weights are specified.
- will only include each pair of changes once, not twice.
- does not consider codons in the context of a nucleotide string, and as a result does not support deletion or insertion events, only substitutions
- returns a list of changes rank ordered by distance (smallest to largest differences).
from itertools import combinations
from collections import defaultdict
amino_acid_to_codons = { 'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
'Asn': ['AAT', 'AAC'],
'Asp': ['GAT', 'GAC'],
'Cys': ['TGT', 'TGC'],
'Gln': ['CAA', 'CAG'],
'Glu': ['GAA', 'GAG'],
'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
'His': ['CAT', 'CAC'],
'Ile': ['ATT', 'ATC', 'ATA'],
'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
'Lys': ['AAA', 'AAG'],
'Met': ['ATG'],
'Phe': ['TTT', 'TTC'],
'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
'Trp': ['TGG'],
'Tyr': ['TAT', 'TAC'],
'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
'Stop': ['TAA', 'TAG', 'TGA'] }
def is_transition(nuc1, nuc2):
transitions = {'A': 'G', 'G': 'A', 'C': 'T', 'T': 'C'}
return transitions.get(nuc1) == nuc2
def hamming_distance_weighted(codon1, codon2, transition_weight=1, transversion_weight=1):
distance = 0
for c1, c2 in zip(codon1, codon2):
if c1 != c2:
if is_transition(c1, c2):
distance += transition_weight
else:
distance += transversion_weight
return distance
def compute_and_rank_distances(transition_weight=1, transversion_weight=1):
distances = []
processed_pairs = set()
for aa1, codons1 in amino_acid_to_codons.items():
for aa2, codons2 in amino_acid_to_codons.items():
for codon1 in codons1:
for codon2 in codons2:
if codon1 != codon2 and (codon2, codon1) not in processed_pairs:
distance = hamming_distance_weighted(codon1, codon2, transition_weight, transversion_weight)
distances.append((aa1, codon1, aa2, codon2, distance))
processed_pairs.add((codon1, codon2))
distances.sort(key=lambda x: x[4])
return distances
transition_weight = 1
transversion_weight = 2
ranked_distances = compute_and_rank_distances(transition_weight, transversion_weight)
numChangesToDisplay=50
for aa1, codon1, aa2, codon2, distance in ranked_distances[:numChangesToDisplay]:
print(f"{codon1} ({aa1}) -> {codon2} ({aa2}) with distance {distance}")
see Amino Acid Change To Genomic Location
rs number of an article PMID: 15816807
Amino Acid Change To Genomic Location: using 'backlocate' ; ?