Hi everyone,
I would like to write a python script to extract amino acid substitutions and indels from several protein alignments, and use it as an input for variant effect prediction with PROVEAN. So far I have modified an initial script (C: How to extract polymorphic positions from several alignments?) to extract only amino acid substitutions, but I haven't been able to extract correctly the indels according to PROVEAN input format (http://provean.jcvi.org/help.php#protein_variation_input_format). I have been able to extract indels position by position, but this doesn't work for PROVEAN. Any help would be very welcome!!
Here is the script:
for sequence in sequences:
print('Alignment: ' + sequence.split('/')[-1].split('_')[0])
alignment = AlignIO.read(sequence, 'fasta')
alignment.sort()
align_array_trim = prune_aln(alignment) #prune_aln is a function that I've created.
align_array = np.array([list(rec) for rec in alignment], np.character)
single = {}
if ((align_array_trim.shape[1] / alignment.get_alignment_length()) > 0.7):
#Only alignments over 70% of initial length after gaps are removed
seq_ids=[]
for record in alignment:
seq_ids.appendrecord.id)
refseq = list(alignment)[-1]
file1 = refseq.id.split('|')[1] + '.fasta'
filepath1=os.path.join('',file1)
f = open(filepath1,'w')
f.write('>' + refseq.id + '\n' + str(refseq.seq) + '\n')
f.close()
for column in range(align_array.shape[1]):
c = str(align_array[:,column])
counter = Counter(align_array[:,column])
if (len(counter) == 2):
# single polymorphism
main_letter = counter.most_common(1)[0][0]
letter = counter.most_common(2)[1][0]
for pos in np.where(align_array[:,column] == letter)[0]:
change = main_letter.decode('UTF-8') + str(column + 1) + letter.decode('UTF-8')
single.setdefault(change, [])
single[change].append(seq_ids[int(pos)].split('|')[0])
file2 = refseq.id.split('|')[1] + '_variants.txt'
filepath2=os.path.join('',file2)
f2 = open(filepath2,'w')
for change in single:
if not ('-' in change): #Here I'm avoiding indels because I don't know how to include them
number_spp = len(single.get(change))
if (number_spp == 1) and (single.get(change)[0] == '<Species_of_interest>'): #I only want changes that are otherwise conserved and just change in my focus species.
f2.write(str(re.findall('\d+', change)[0]) + ',' + change[0] + ',' + change[-1] + '\n')
f2.close()
#Input: a protein sequence and amino acid variants
#Provean input format per alignment: <position>,<reference_amino_acids>,<variant_amino_acids>