Question

Protein Loop Rmsd Calculations

2

Entering edit mode

11.0 years ago

mario.messih ▴ 30

I want to calculate the backbone (c+ca+n+o) RMSD between protein loops after superimposing 3 residues from each end of the loop (stems), the data is provided by a file called RF_models.csv which contains lines like {4FR9.pdb,A,28,35,1OFC.pdb,X,776,783} where the first is the pdb file, then the chain then the one stem residue before that start of the loop and one after the loop ends, then the second four values are the information of the second loop. In other words, in this specific case my loops are 29A-34A of protein 4FR9 and 777X-782X from protein 1OFC. I wrote the following script to calculate the backbone RMSD after superimposing 3 residues from each end of the loop, however, the RMSD values I obtained weren't very accurate, for example, some proteins achieved RMSD value = 3.656 where it's RMSD was much lower =0.456 when I calculated it in Pymol Also by visual checking I can tell that the values of Pymol is more accurate than then ones I obtained, so I wonder if my script is right or do I have something wrong in my script? this RMSD calculation problem is very important, yet I wasn't able to find any blog or forum to help me so any help is highly appreciated in advance, the following is my python code:

#!/usr/bin/python
import Bio.PDB
import numpy
import os
import sys, getopt
import csv
def compute_cluster():
             reader=csv.reader(open("RF_CASP10_models.csv")) #for Calculating RMSD of RF model
             for row in reader:
                  ref_atoms = []
              alt_atoms = []
              target=row[0]
              template=row[4]
              chain1=row[1]
              chain2=row[5]
              in1=int(row[2])
              in2=int(row[6])
              en1=int(row[3])
              en2=int(row[7])
              structure = Bio.PDB.PDBParser().get_structure(target, target)
              ref_model = structure[0]
              structure = Bio.PDB.PDBParser().get_structure(template, template)
              alt_model = structure[0]
              rmsd=100
                  try :
                  for count in range((in1-2),in1):
                      ref_atoms.append(ref_model[chain1][count]['N'])
                      ref_atoms.append(ref_model[chain1][count]['CA'])          
                      ref_atoms.append(ref_model[chain1][count]['C'])          
                      ref_atoms.append(ref_model[chain1][count]['O'])
                  for count in range((in2-2),in2):          
                          alt_atoms.append(alt_model[chain2][count]['N'])          
                          alt_atoms.append(alt_model[chain2][count]['CA'])          
                      alt_atoms.append(alt_model[chain2][count]['C'])          
                      alt_atoms.append(alt_model[chain2][count]['O'])          
                  for count in range(en1,(en1+2)):
                      ref_atoms.append(ref_model[chain1][count]['N'])
                      ref_atoms.append(ref_model[chain1][count]['CA'])          
                      ref_atoms.append(ref_model[chain1][count]['C'])          
                      ref_atoms.append(ref_model[chain1][count]['O'])
                  for count in range(en2,(en2+2)):          
                          alt_atoms.append(alt_model[chain2][count]['N'])          
                      alt_atoms.append(alt_model[chain2][count]['CA'])          
                      alt_atoms.append(alt_model[chain2][count]['C'])          
                      alt_atoms.append(alt_model[chain2][count]['O'])          
          except :
                         with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
                               myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+"NA"+'\n')
                         continue

              super_imposer = Bio.PDB.Superimposer()
              super_imposer.set_atoms(ref_atoms, alt_atoms)
              super_imposer.apply(alt_model.get_atoms())
                  id1=[]
              id2=[]
              icod1=[]
              icod2=[]
              resall1=[]
              resall2=[]
              for residue in ref_model[chain1].get_residues():
                  het, resseq, icode=residue.get_id() 
                  if (resseq>(in1+1) and resseq<(en1-1)):
                     id1.append(resseq)
                     icod1.append(icode)
                     resall1.append(str(resseq)+icode)

              for residue in alt_model[chain2].get_residues():
                      het, resseq, icode=residue.get_id() 
                  if (resseq>(in2+1) and resseq<(en2-1)):
                     id2.append(resseq)
                     icod2.append(icode)
                     resall2.append(str(resseq)+icode)

              sum=0
              count=0
                  out=False
              for (i, item) in enumerate(id2):    
                      for atomo in ['N','CA','C','O']:
                        count=count+1
                          try:
                          diff_vector  = ref_model[chain1][' ',id1[i],icod1[i]][atomo].get_coord() - alt_model[chain2][' ',id2[i],icod2[i]][atomo].get_coord()
                          except :  
                              out=True
                              break 
                          sum=sum+ (numpy.sum(diff_vector * diff_vector))
                      if (out):
                         break 

              sum=sum/count
              sum=numpy.sqrt(sum)
                  if(out):
                      with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
                            myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+"NA"+'\n')
                  else : 
                      with open("new_RF_CASP10_models_RMSD.csv", "a") as myfile:
                            myfile.write(target+","+chain1+","+str(in1)+","+str(en1)+","+template+","+ chain2+","+str(in2)+","+str(en2)+","+str(sum)+'\n')

               return 0

  def main():
           compute_cluster()
  if __name__ == "__main__":
     main()

biopython • 5.9k views

ADD COMMENT • link updated 2.1 years ago by Ram 45k • written 11.0 years ago by mario.messih ▴ 30

score 1 · Answer 1 · 2014-04-05

Problem 1: Pymol performs an optimization of the fit and removes outlier atoms to reduce the RMSD value. Try calculating the RMSD with the cycles option set to 0 and compare that value to the one obtained with BioPython.
Problem 2: You can obtain the RMSD of the loops just be accessing rmsd attribute of superimposer. Look at this example. No need for manual calculations.

score 0 · Answer 2 · 2014-04-01

Ideally, you should be using MDAnalysis for this sort of analysis. You can easily subset the loop sections from each PDB.

Here's an example from their website that calculates the RMS between two frames, and you can extend to multiple frames by performing pair-wise calculation:

import MDAnalysis
import MDAnalysis.analysis.rms
from MDAnalysis.tests.datafiles import PSF,DCD,CRD
u = MDAnalysis.Universe(PSF,DCD)
ref = MDAnalysis.Universe(PSF,DCD)     # reference closed AdK (1AKE) (with the default ref_frame=0)
#
# Or, for your case
#     u = MDAnalysis.Universe(pdb1_fname)
#     ref = MDAnalysis.Universe(pdb2_fname)     # reference closed AdK (1AKE) (with the default ref_frame=0)
#
# Actual call below
R = MDAnalysis.analysis.rms.RMSD(u, ref,
       select="backbone",             # superimpose on whole backbone of the whole protein
       groupselections=["backbone and (resid 1-29 or resid 60-121 or resid 160-214)",   # CORE
                        "backbone and resid 122-159",                                   # LID
                        "backbone and resid 30-59"],                                    # NMP
       filename="rmsd_all_CORE_LID_NMP.dat")
R.run()
R.save()
#
# Some plotting code below
#
import matplotlib.pyplot as plt
rmsd = R.rmsd.T   # transpose makes it easier for plotting
time = rmsd[1]
fig = plt.figure(figsize=(4,4))
ax = fig.add_subplot(111)
ax.plot(time, rmsd[2], 'k-',  label="all")
ax.plot(time, rmsd[3], 'k--', label="CORE")
ax.plot(time, rmsd[4], 'r--', label="LID")
ax.plot(time, rmsd[5], 'b--', label="NMP")
ax.legend(loc="best")
ax.set_xlabel("time (ps)")
ax.set_ylabel(r"RMSD ($\AA$)")
fig.savefig("rmsd_all_CORE_LID_NMP_ref1AKE.pdf")

And for simplicity, you can compute the RMSD between frames of a trajectory file from a MD simulation

u = Universe(PSF,DCD)
bb = u.selectAtoms('backbone')
A = bb.coordinates()  # coordinates of first frame
u.trajectory[-1]      # forward to last frame
B = bb.coordinates()  # coordinates of last frame
rmsd(A,B)
[Output] 6.8342494129169804

It requires numpy, scipy, and biopython as dependencies.

Best of luck!