Could do something like this in python.
The below implementation:
- uses Hamming distance to rank order the changes in terms of distance.
- can weight transitions and transversions according to user input or will both be weighted as 1 if no weights are specified.
- will only include each pair of changes once, not twice.
- does not consider codons in the context of a nucleotide string, and as a result does not support deletion or insertion events, only substitutions
- returns a list of changes rank ordered by distance (smallest to largest differences).
from itertools import combinations
from collections import defaultdict
# Step 1: Define codons and amino acids
amino_acid_to_codons = { 'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
'Asn': ['AAT', 'AAC'],
'Asp': ['GAT', 'GAC'],
'Cys': ['TGT', 'TGC'],
'Gln': ['CAA', 'CAG'],
'Glu': ['GAA', 'GAG'],
'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
'His': ['CAT', 'CAC'],
'Ile': ['ATT', 'ATC', 'ATA'],
'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
'Lys': ['AAA', 'AAG'],
'Met': ['ATG'],
'Phe': ['TTT', 'TTC'],
'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
'Trp': ['TGG'],
'Tyr': ['TAT', 'TAC'],
'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
'Stop': ['TAA', 'TAG', 'TGA'] }
def is_transition(nuc1, nuc2):
transitions = {'A': 'G', 'G': 'A', 'C': 'T', 'T': 'C'}
return transitions.get(nuc1) == nuc2
def hamming_distance_weighted(codon1, codon2, transition_weight=1, transversion_weight=1):
# Calculate the Hamming distance between two codons with optional weighting for transitions and transversions.
# codon1 (str): The first and second codons.
# transition_weight (int): User-specified weight for transition changes.
# transversion_weight (int): User-specified weight for transversion changes.
# Returns: The weighted Hamming distance as an (int).
distance = 0
for c1, c2 in zip(codon1, codon2):
if c1 != c2:
if is_transition(c1, c2):
distance += transition_weight
else:
distance += transversion_weight
return distance
def compute_and_rank_distances(transition_weight=1, transversion_weight=1):
# Compute and rank the distances between all pairs of codons for all amino acids, using specified weights for
# transitions and transversions. Args:
# transition_weight and transversion_weights (int): The weight for transitions and transversions (int).
# Returns a list of tuples containing amino acid, codon1, amino acid, codon2, and distance, rank ordered by weight
# Hamming distance.
distances = []
processed_pairs = set() # Important: using a set prevents GAT -> GAC and GAC -> GAT from both appearing in the output.
for aa1, codons1 in amino_acid_to_codons.items():
for aa2, codons2 in amino_acid_to_codons.items():
for codon1 in codons1:
for codon2 in codons2:
if codon1 != codon2 and (codon2, codon1) not in processed_pairs:
distance = hamming_distance_weighted(codon1, codon2, transition_weight, transversion_weight)
distances.append((aa1, codon1, aa2, codon2, distance))
processed_pairs.add((codon1, codon2))
distances.sort(key=lambda x: x[4]) # Rank order the changes
return distances
# Example usage
transition_weight = 1 # Set your desired weights here
transversion_weight = 2 # If no weights are set, transitions and transversions will both be weighted as 1.
ranked_distances = compute_and_rank_distances(transition_weight, transversion_weight)
# Display the simplest changes
numChangesToDisplay=50
for aa1, codon1, aa2, codon2, distance in ranked_distances[:numChangesToDisplay]:
print(f"{codon1} ({aa1}) -> {codon2} ({aa2}) with distance {distance}")
see Amino Acid Change To Genomic Location
rs number of an article PMID: 15816807
Amino Acid Change To Genomic Location: using 'backlocate' ; ?