#  ============================================================================
#
#  This script performs RMSD clustering of the multiple "fits" starting from
#  the best scoring model using the list_pdb_score file.
#
#  =======================  Maya Topf, 19 Dec 2008 ============================

from modeller import *
from modeller.scripts import complete_pdb
from random import *
import shutil
import sys, os, os.path
import string

log.verbose()
env = environ()

def Usage():
    if len(sys.argv) != 4:
          print 'Usage:', sys.argv[0] + ' list_pdbs_filename cutoff_rmsd score_column'
          sys.exit(-1)

Usage()
list_pdb_file = sys.argv[1]
rms_min_value = float(sys.argv[2])
col_num = int(sys.argv[3]) # the tyoe of score based on which we want to sort the models

# ----------------------------

class MyPDB:
    def __init__(self, name):
        self.NAME = name
    def __init__(self, code):
        self.NAME = code
    def __init__(self, cc):
        self.CC = []
    def __init__(self, fit):
        self.FIT = []
    def __init__(self, rmsd):
        self.RMSD = []
    def __init__(self, lrms):
        self.LRMS = []
    def __init__(self, flag):
        self.CLASS = []

# load the domain pdbs
def load_input(models):

#sorting the input based on the energy
    os.system("sort -nk " + str(col_num) + " " + list_pdb_file + " | awk \'{print $1, $" + str(col_num) + "}\' > " + list_pdb_file + "_sort")
    in_file = open(list_pdb_file + '_sort' ,"r")
    num=0
    while True:
        in_line = in_file.readline()
        if len(in_line) == 0:
           break
        in_line = in_line[:-1]
        [model_name, score] = string.split(in_line)
        newmodel = MyPDB(num)
        newmodel.NAME = model_name 
        newmodel.CODE = model_name.split('_')[0] 
        newmodel.CC = float(score) 
        newmodel.RMSD = []
        newmodel.LRMS = float(0) 
        newmodel.CLASS = int(0) 
        num+=1
        models.append(newmodel)

def rmsd_calc(ipdb):

     for fit in ipdb.NAME:
       if (fit!=''):
         mdl = model(env, file=ipdb.NAME)
         aln = alignment(env)
         aln.append_model(mdl=mdl, align_codes=ipdb.NAME, atom_files=ipdb.NAME)
         mdl2  = model(env, file=fit)
         aln.append_model(mdl=mdl2, align_codes=fit, atom_files=fit)
         atmsel = selection(mdl).only_atom_types('CA')
         r = atmsel.superpose(mdl2, aln,rms_cutoff=3.5,fit=False)
         ipdb.RMSD.append(float(r.rms))
         print "rms= %.2f" % r.rms

# print the results for a specific domain (in the entire map) in pdb_name.results 
def print_results(models,class_num):

     outp = open("classes.txt", "w")
     outp.write("pdb name\tscore\tlrms\tclass\n")

     for i in range(1,class_num+1):
       
# print the fits of each class ordered by the highest score 
       for ipdb in models:
         if (ipdb.CLASS == i): 
           outp.write("%s\t%.3f\t%.1f\t%d\n" %(ipdb.NAME,ipdb.CC,ipdb.LRMS,ipdb.CLASS))
       
     outp.close()

#############################################################################      
models = []

load_input(models)

ini_num = 0
end_num = len(models)
fit_class = 0

for ipdb in models:
#  print 'pdb name ' + ipdb.NAME 
#  if ipdb.NAME!='':
    print "model num %d: %s\n" %(models.index(ipdb)+1, ipdb.CODE)

#cluster_fits
    ini_num1 = models.index(ipdb)
    print 'next index ' + str(ini_num1)
    if ipdb.CLASS == 0:
        fit_class+=1
        for ipdb1 in models[ini_num1 : end_num]:

           if ipdb1.CLASS == 0:

             mdl = model(env, file=ipdb.NAME)
             aln = alignment(env)
             aln.append_model(mdl=mdl, align_codes=ipdb.NAME, atom_files=ipdb.NAME)
             mdl2  = model(env, file=ipdb1.NAME)
             aln.append_model(mdl=mdl2, align_codes=ipdb1.NAME, atom_files=ipdb1.NAME)
             atmsel = selection(mdl).only_atom_types('CA')
             r = atmsel.superpose(mdl2, aln,rms_cutoff=3.5,fit=False)
             ipdb1.LRMS = float(r.rms)
             print "rms of %s from best local fit (%s)= %.2f" %(ipdb1.NAME, ipdb.NAME, r.rms)
             if ipdb1.LRMS < rms_min_value:
               ipdb1.CLASS = fit_class
             print 'class= ' + str(ipdb1.CLASS)


           else: continue

    else: continue

# calc RMSD of each fit
#    rmsd_calc(ipdb)

#print the results 
print_results(models,fit_class)

os.system("rm -f *_init_1.pdb *.MRC tmp")