# ============================================================================ # # This script performs RMSD clustering of the multiple "fits" starting from # the best scoring model using the list_pdb_score file. # # ======================= Maya Topf, 19 Dec 2008 ============================ from modeller import * from modeller.scripts import complete_pdb from random import * import shutil import sys, os, os.path import string log.verbose() env = environ() def Usage(): if len(sys.argv) != 4: print 'Usage:', sys.argv[0] + ' list_pdbs_filename cutoff_rmsd score_column' sys.exit(-1) Usage() list_pdb_file = sys.argv[1] rms_min_value = float(sys.argv[2]) col_num = int(sys.argv[3]) # the tyoe of score based on which we want to sort the models # ---------------------------- class MyPDB: def __init__(self, name): self.NAME = name def __init__(self, code): self.NAME = code def __init__(self, cc): self.CC = [] def __init__(self, fit): self.FIT = [] def __init__(self, rmsd): self.RMSD = [] def __init__(self, lrms): self.LRMS = [] def __init__(self, flag): self.CLASS = [] # load the domain pdbs def load_input(models): #sorting the input based on the energy os.system("sort -nk " + str(col_num) + " " + list_pdb_file + " | awk \'{print $1, $" + str(col_num) + "}\' > " + list_pdb_file + "_sort") in_file = open(list_pdb_file + '_sort' ,"r") num=0 while True: in_line = in_file.readline() if len(in_line) == 0: break in_line = in_line[:-1] [model_name, score] = string.split(in_line) newmodel = MyPDB(num) newmodel.NAME = model_name newmodel.CODE = model_name.split('_')[0] newmodel.CC = float(score) newmodel.RMSD = [] newmodel.LRMS = float(0) newmodel.CLASS = int(0) num+=1 models.append(newmodel) def rmsd_calc(ipdb): for fit in ipdb.NAME: if (fit!=''): mdl = model(env, file=ipdb.NAME) aln = alignment(env) aln.append_model(mdl=mdl, align_codes=ipdb.NAME, atom_files=ipdb.NAME) mdl2 = model(env, file=fit) aln.append_model(mdl=mdl2, align_codes=fit, atom_files=fit) atmsel = selection(mdl).only_atom_types('CA') r = atmsel.superpose(mdl2, aln,rms_cutoff=3.5,fit=False) ipdb.RMSD.append(float(r.rms)) print "rms= %.2f" % r.rms # print the results for a specific domain (in the entire map) in pdb_name.results def print_results(models,class_num): outp = open("classes.txt", "w") outp.write("pdb name\tscore\tlrms\tclass\n") for i in range(1,class_num+1): # print the fits of each class ordered by the highest score for ipdb in models: if (ipdb.CLASS == i): outp.write("%s\t%.3f\t%.1f\t%d\n" %(ipdb.NAME,ipdb.CC,ipdb.LRMS,ipdb.CLASS)) outp.close() ############################################################################# models = [] load_input(models) ini_num = 0 end_num = len(models) fit_class = 0 for ipdb in models: # print 'pdb name ' + ipdb.NAME # if ipdb.NAME!='': print "model num %d: %s\n" %(models.index(ipdb)+1, ipdb.CODE) #cluster_fits ini_num1 = models.index(ipdb) print 'next index ' + str(ini_num1) if ipdb.CLASS == 0: fit_class+=1 for ipdb1 in models[ini_num1 : end_num]: if ipdb1.CLASS == 0: mdl = model(env, file=ipdb.NAME) aln = alignment(env) aln.append_model(mdl=mdl, align_codes=ipdb.NAME, atom_files=ipdb.NAME) mdl2 = model(env, file=ipdb1.NAME) aln.append_model(mdl=mdl2, align_codes=ipdb1.NAME, atom_files=ipdb1.NAME) atmsel = selection(mdl).only_atom_types('CA') r = atmsel.superpose(mdl2, aln,rms_cutoff=3.5,fit=False) ipdb1.LRMS = float(r.rms) print "rms of %s from best local fit (%s)= %.2f" %(ipdb1.NAME, ipdb.NAME, r.rms) if ipdb1.LRMS < rms_min_value: ipdb1.CLASS = fit_class print 'class= ' + str(ipdb1.CLASS) else: continue else: continue # calc RMSD of each fit # rmsd_calc(ipdb) #print the results print_results(models,fit_class) os.system("rm -f *_init_1.pdb *.MRC tmp")