|
import urllib |
|
import os, os.path |
|
from optparse import OptionParser |
|
|
|
def main(superfamily):
    """Download the domain PDB files for every member of *superfamily*.

    The member domains are looked up in the CathDomainList via
    ``get_domain_list`` and then handed to ``get_domain_structures``, which
    retrieves one PDB file per domain from CATH.

    superfamily -- dotted CATH identifier string, passed through unchanged
                   to both helpers.
    """
    get_domain_structures(get_domain_list(superfamily), superfamily)
|
|
|
def get_domain_list(superfamily):
    """Return the domain identifiers that belong to *superfamily*.

    Reads ``data/cath/CathDomainList`` (relative to the current working
    directory). For every non-comment line, the first whitespace-separated
    column is taken as the domain identifier and columns 2-5 as the C, A, T
    and H numbers. A domain is a member when all four numbers equal the four
    dot-separated components of *superfamily*.

    superfamily -- dotted identifier such as ``"1.10.8.10"``; only the first
                   four components are used.

    Returns the list of matching domain identifiers in file order and prints
    a one-line summary with the number of matches.

    Raises ValueError if *superfamily* has fewer than four components or a
    component is not an integer.
    """
    sf_tokens = superfamily.split('.')
    if len(sf_tokens) < 4:
        raise ValueError(
            "superfamily must have four dot-separated components (C.A.T.H): "
            + repr(superfamily))
    # Convert once up front instead of once per data line.
    wanted = [int(token) for token in sf_tokens[:4]]

    domain_list = []
    # 'with' guarantees the handle is closed; iterating the handle streams
    # the file instead of loading it all into memory.
    with open(os.path.join("data", "cath", "CathDomainList"), 'r') as fh:
        for line in fh:
            if line.startswith('#'):  # exclude comment lines
                continue
            tokens = line.split()
            if len(tokens) < 5:
                # Blank or truncated line: it cannot carry C, A, T and H.
                continue
            # if C, A, T and H match, the domain is a member of the right
            # superfamily; all() stops at the first mismatching level.
            if all(int(have) == want
                   for have, want in zip(tokens[1:5], wanted)):
                domain_list.append(tokens[0])

    print("There are " + str(len(domain_list))
          + " domains in superfamily " + superfamily)
    return domain_list
|
|
|
def get_domain_structures(domain_list, superfamily):
    """Download one PDB file per domain into ``data/pdb/<superfamily>/``.

    For each identifier in *domain_list* the content served at
    ``http://www.cathdb.info/api/data/pdb/<domain>`` is fetched and written,
    byte for byte, to ``data/pdb/<superfamily>/<domain>.pdb`` (relative to
    the current working directory). The output directory, including missing
    parents, is created on demand.

    domain_list -- iterable of domain identifier strings.
    superfamily -- name of the sub-directory the files are stored under.

    Returns the URL of the last domain fetched, or None when *domain_list*
    is empty (in which case nothing is downloaded or created).
    NOTE(review): the original indentation was lost; this assumes the
    ``return url`` followed the loop rather than sitting inside it.
    """
    # urlopen moved between Python 2 and 3; resolve it locally so the block
    # works on either without touching the file-level imports.
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib import urlopen  # Python 2

    url = None
    if not domain_list:
        return url

    # The target directory does not depend on the domain: create it once.
    out_dir = os.path.join('data', 'pdb', superfamily)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for domain in domain_list:
        # can also get chain and full pdb entries by modifying the URL
        url = 'http://www.cathdb.info/api/data/pdb/' + domain
        response = urlopen(url)
        try:
            pdb = response.read()
        finally:
            response.close()
        # read() yields bytes on Python 3, so write in binary mode.
        with open(os.path.join(out_dir, domain + '.pdb'), 'wb') as out:
            out.write(pdb)
    return url
|
|
|
if __name__ == '__main__':
    # Command-line entry point: one option per level of the CATH hierarchy
    # (Class, Architecture, Topology, Homologous superfamily).
    parser = OptionParser(usage="Usage: %prog [options]", version="%prog 0.1")
    parser.add_option("-c", dest="c", help="Class", metavar="CLASS")
    parser.add_option("-a", dest="a", help="Architecture", metavar="ARCHITECTURE")
    parser.add_option("-t", dest="t", help="Topology", metavar="TOPOLOGY")
    # would have used -h, but it's reserved for help
    parser.add_option("-s", dest="s", help="Homologous Superfamily", metavar="SUPERFAMILY")
    (options, args) = parser.parse_args()

    # An omitted option is left as None by optparse; report it as a usage
    # error instead of crashing on the string concatenation below.
    levels = [options.c, options.a, options.t, options.s]
    if not all(levels):
        parser.error("all of -c, -a, -t and -s are required")

    sf = '.'.join(levels)
    main(sf)
Hi, Andy from CATH here :-)
Do you mean the original whole-protein PDBs from which the domains were extracted, or the chopped PDBs we produce that just contain individual domains?