I want to calculate the backbone (c+ca+n+o) RMSD between protein loops after superimposing 3 residues from each end of the loop (stems), the data is provided by a file called RF_models.csv
which contains lines like {4FR9.pdb,A,28,35,1OFC.pdb,X,776,783}
where the first is the pdb file, then the chain then the one stem residue before that start of the loop and one after the loop ends, then the second four values are the information of the second loop. In other words, in this specific case my loops are 29A-34A of protein 4FR9 and 777X-782X from protein 1OFC. I wrote the following script to calculate the backbone RMSD after superimposing 3 residues from each end of the loop, however, the RMSD values I obtained weren't very accurate, for example, some proteins achieved RMSD value = 3.656 where it's RMSD was much lower =0.456 when I calculated it in Pymol Also by visual checking I can tell that the values of Pymol is more accurate than then ones I obtained, so I wonder if my script is right or do I have something wrong in my script? this RMSD calculation problem is very important, yet I wasn't able to find any blog or forum to help me so any help is highly appreciated in advance, the following is my python code:
#!/usr/bin/python
import Bio.PDB
import numpy
import os
import sys, getopt
import csv
def compute_cluster():
reader=csv.reader(open("RF_CASP10_models.csv")) #for Calculating RMSD of RF model
for row in reader:
ref_atoms = []
alt_atoms = []
target=row[0]
template=row[4]
chain1=row[1]
chain2=row[5]
in1=int(row[2])
in2=int(row[6])
en1=int(row[3])
en2=int(row[7])
structure = Bio.PDB.PDBParser().get_structure(target, target)
ref_model = structure[0]
structure = Bio.PDB.PDBParser().get_structure(template, template)
alt_model = structure[0]
rmsd=100
try :
for count in range((in1-2),in1):
ref_atoms.append(ref_model[chain1][count]['N'])
ref_atoms.append(ref_model[chain1][count]['CA'])
ref_atoms.append(ref_model[chain1][count]['C'])
ref_atoms.append(ref_model[chain1][count]['O'])
for count in range((in2-2),in2):
alt_atoms.append(alt_model[chain2][count]['N'])
alt_atoms.append(alt_model[chain2][count]['CA'])
alt_atoms.append(alt_model[chain2][count]['C'])
alt_atoms.append(alt_model[chain2][count]['O'])
for count in range(en1,(en1+2)):
ref_atoms.append(ref_model[chain1][count]['N'])
ref_atoms.append(ref_model[chain1][count]['CA'])
ref_atoms.append(ref_model[chain1][count]['C'])
ref_atoms.append(ref_model[chain1][count]['O'])
for count in range(en2,(en2+2)):
alt_atoms.append(alt_model[chain2][count]['N'])
alt_atoms.append(alt_model[chain2][count]['CA'])
alt_atoms.append(alt_model[chain2][count]['C'])
alt_atoms.append(alt_model[chain2][count]['O'])
except :
with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+"NA"+'\n')
continue
super_imposer = Bio.PDB.Superimposer()
super_imposer.set_atoms(ref_atoms, alt_atoms)
super_imposer.apply(alt_model.get_atoms())
id1=[]
id2=[]
icod1=[]
icod2=[]
resall1=[]
resall2=[]
for residue in ref_model[chain1].get_residues():
het, resseq, icode=residue.get_id()
if (resseq>(in1+1) and resseq<(en1-1)):
id1.append(resseq)
icod1.append(icode)
resall1.append(str(resseq)+icode)
for residue in alt_model[chain2].get_residues():
het, resseq, icode=residue.get_id()
if (resseq>(in2+1) and resseq<(en2-1)):
id2.append(resseq)
icod2.append(icode)
resall2.append(str(resseq)+icode)
sum=0
count=0
out=False
for (i, item) in enumerate(id2):
for atomo in ['N','CA','C','O']:
count=count+1
try:
diff_vector = ref_model[chain1][' ',id1[i],icod1[i]][atomo].get_coord() - alt_model[chain2][' ',id2[i],icod2[i]][atomo].get_coord()
except :
out=True
break
sum=sum+ (numpy.sum(diff_vector * diff_vector))
if (out):
break
sum=sum/count
sum=numpy.sqrt(sum)
if(out):
with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+"NA"+'\n')
else :
with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+str(sum)+'\n')
return 0
def main():
compute_cluster()
if __name__ == "__main__":
main()
OP asked for a reason why PyMOL and Biopython give different results, not on a third method to calculate values. This answer is slightly irrelevant for the question at hand..