# This one: for all pairs of solutions in the same run, compare delta perfect_cc with delta of scoring
# 070808 count number of correct decisions for |delta_cc|>0.05
"""
DIR MAD/aep/run_020208_scoring
Solution 2 AutoSol_run_1_ Z_SCORE: 431.8692 CC_TO_PERFECT: 0.6591
 NAME CC RFACTOR SKEW FOM NCS_OVERLAP
 RAW 0.6905 0.314 0.4314 0.78 1.077
 ZSCORE 100.3708 165.0859 96.964 15.6 53.8485
Solution 4 AutoSol_run_1_ Z_SCORE: 19.5832 CC_TO_PERFECT: 0.0365
 NAME CC RFACTOR SKEW FOM NCS_OVERLAP
 RAW 0.267 0.5769 0.0037 0.78 0.0
 ZSCORE 0.772 2.3374 0.8738 15.6 0.0
Solution 6 AutoSol_run_1_ Z_SCORE: 434.4561 CC_TO_PERFECT: 0.6591
 NAME CC RFACTOR SKEW FOM NCS_OVERLAP
 RAW 0.6923 0.3106 0.4314 0.78 1.078
 ZSCORE 100.7941 167.1953 96.9677 15.6 53.8991
"""
import sys,os
extra=""
replace_dm_cc=True# set to true to look at DM maps
if 'no_replace' in sys.argv:
  print " Not replacing dm_cc..."
  replace_dm_cc=False
  extra+="_no_replace_dm_cc"
#
match_ncs_copies=True
if 'no_match_ncs' in sys.argv:
  print " Not matching NCS copies..."
  match_ncs_copies=False
  extra+="_no_match_ncs_copies"

ncs_max=100 # set this to restrict ncs copies
solvent_min=0.  # set these to restrict solvent content
solvent_max=1.
perfect_cc_min=0.00
perfect_cc_max=1.
expt_type_only=None #'mad'
resolution_min=0.
resolution_max=200.

tol_top=0.02  # set this to 0 to only consider top solution

build_cc_cut=0.20
tolerance=0.05  # worth talking about
use_dm_cc=True

use_list=[]
info_dict={}
fom_cut=0.00  # minimum FOM because below that we do not density modify...
if 'no_fom_cut' in sys.argv:
  fom_cut=-1.0
  extra+="_no_fom_cut"
  
for line in open('no_ncs_med_frac.log').readlines():
  spl=line.split()
  if spl[0]!='DIR': continue
  ncs=int(spl[4])
  frac=float(spl[5])
  resolution=float(spl[6])
  expt_type=spl[7]
  
  run=spl[1]
  if ncs > ncs_max: continue
  if frac < solvent_min: continue
  if frac > solvent_max: continue
  use_list.append(run)
  if not run in info_dict.keys(): info_dict[run]={}
  info_dict[run]['ncs']=ncs
  info_dict[run]['frac']=frac
  info_dict[run]['expt_type']=expt_type
  info_dict[run]['resolution']=resolution
print "RUNS TO USE: ",use_list


max_solns=0
if use_dm_cc:
 # 071108: read in values of density-modified CC separately
 dm_cc_dict={}
 for line in open('all_dm_cc.log').readlines(): 
  if not line: continue
  spl=line.split() 
  dataset=spl[0]
  dataset_spl=dataset.split('/')[:-2]
  if len(dataset_spl)==2: dataset_spl=["jcsg"]+dataset_spl
  dataset="/".join(dataset_spl)
  solution=spl[1]
  cc=float(spl[2])
  if not dataset in dm_cc_dict.keys():
     dm_cc_dict[dataset]={}
  dm_cc_dict[dataset][solution]=cc 
  if len(dm_cc_dict[dataset].keys())>max_solns: 
     max_solns=len(dm_cc_dict[dataset].keys())

number_used=0
number_of_solutions=0
for dataset in dm_cc_dict.keys():
  if len(dm_cc_dict[dataset].keys())>1:
    number_used+=1
    number_of_solutions+=len(dm_cc_dict[dataset].keys())
print "Number with >1 solutions: ",number_used," number of such solutions: ",number_of_solutions
     

# count datasets and max solutions per dataset
print "Number of datasets with dm_cc:",len(dm_cc_dict.keys())
print "Max solutions per dataset: ",max_solns

verbose=True
file="all_res_dm.dat"
print "Analyzing data in ",file
all_data_raw={}
all_data_z_score={}
all_data_overall_z={}
all_data_perfect_cc={}
score_type_list=[]
run_data={}
run_name=None
solution_name=None
solution_data={}

solution_name=None
types=None
raw_scores=None
z_scores=None

for line in open(file).readlines():
  print "LINE: ",line
  if not line: continue
  if not line.split(): continue
  if line.split()[0]=="DIR" and len(line.split())>1 and not line.split()[1] in use_list:
     print "SKIPPING: ",line.split()[1]
     run_name=None
     continue
  if line.split()[0]=="DIR" and len(line.split())>1:
     run_name=line.split()[1]
     #print "New run: ",run_name
     run_data_raw={}
     all_data_raw[run_name]=run_data_raw
     run_data_z_score={}
     all_data_z_score[run_name]=run_data_z_score
     run_data_overall_z={}
     all_data_overall_z[run_name]=run_data_overall_z
     run_data_perfect_cc={}
     all_data_perfect_cc[run_name]=run_data_perfect_cc

  elif not run_name:
     continue
  elif  line.split()[0]=="Solution" and len(line.split())>1:
     solution_name=line.split()[1]
     try:
       solution_z_score=float(line.split()[4])
       solution_perfect_cc=float(line.split()[6])
     except:
       solution_z_score=None
       solution_perfect_cc=None

     #print "new solution: ",solution_name 
  elif  line.split()[0]=="NAME" and len(line.split())>1:
     types=line.split()[1:]
     if not score_type_list or len(score_type_list)<len(types):
          score_type_list=types
          print "SCORE TYPE LIST: ",score_type_list
  elif  line.split()[0]=="RAW" and len(line.split())>1:
     raw_scores=line.split()[1:]
  elif  line.split()[0]=="ZSCORE" and len(line.split())>1:
     z_scores=line.split()[1:]

  if solution_name and types and raw_scores and z_scores and\
       solution_z_score and solution_perfect_cc : # new solution
     solution_data_raw={}
     solution_data_z_score={}
     ok=True
     for type,raw_score,z_score in zip(types,raw_scores,z_scores):
       solution_data_raw[type]=raw_score
       solution_data_z_score[type]=z_score
       print solution_name,type,raw_score
       if type == 'FOM' and float(raw_score) < fom_cut: 
         print "SKIPPING: ",solution_name,type,raw_score
         ok=False
     if use_dm_cc:
      if run_name in dm_cc_dict.keys() and \
        solution_name in dm_cc_dict[run_name].keys():
       solution_data_raw['dm_cc']=dm_cc_dict[run_name][solution_name]
       solution_data_z_score['dm_cc']=dm_cc_dict[run_name][solution_name]
       if not 'dm_cc' in score_type_list: score_type_list.append('dm_cc')
      else:
       solution_data_raw['dm_cc']=None
       solution_data_z_score['dm_cc']=None
     if ok:
       run_data_raw[solution_name]=solution_data_raw
       run_data_z_score[solution_name]=solution_data_z_score
       run_data_overall_z[solution_name]=solution_z_score
       run_data_perfect_cc[solution_name]=solution_perfect_cc
     else:
       print "SKIPPING LOW FOM: ",solution_name,solution_data_raw
     solution_name=None
     types=None
     raw_scores=None
     z_scores=None

# analyze the data now:

run_list=all_data_raw.keys()
print "Total of ",len(run_list),"runs to analyze"

# write out all pairs of solutions from same run differing by 0.05 or less
#  in perfect_cc, with dm_cc of the two as x, y
# 080308 ...with same number of NCS copies used...
pair_file=open('pair_file'+extra+'.list','w')
for run in run_list:
  if verbose:print "\nRUN ",run
  solution_data_raw=all_data_raw[run]
  solution_data_perfect_cc=all_data_perfect_cc[run]
  solution_list=solution_data_raw.keys()
  if len(solution_list)<2: continue  # skip ones with just one...
  # Eliminate run if BUILD_CC is used and no run has BUILD_CC > build_cc_cut
  if 'BUILD_CC' in score_type_list:  
    highest_build_cc=None
    for solution in solution_list:
      data_raw=solution_data_raw[solution]
      try:
         raw=float(data_raw['BUILD_CC'])
         if highest_build_cc is None or raw>highest_build_cc:
           highest_build_cc=raw 
      except: pass
    if highest_build_cc is None or highest_build_cc<build_cc_cut:
      continue

  used_solutions=[]
  for s1 in solution_list:
   data_raw_1=solution_data_raw[s1]
   used_solutions.append(s1)
   for s2 in solution_list:
    if s2 in used_solutions: continue
    data_raw_2=solution_data_raw[s2]
    if match_ncs_copies:
       if not data_raw_1['NCS_COPIES'] in ['None',None] and \
          not data_raw_2['NCS_COPIES'] in ['None',None] and \
          int(data_raw_1['NCS_COPIES']) != int(data_raw_2['NCS_COPIES']):
        print "Skipping pair ",run,s1,s2,\
        " WITH DIFFERING NCS: ",data_raw_1['NCS_COPIES'],data_raw_2['NCS_COPIES']
        continue

    if data_raw_1['dm_cc'] in ['None',None] :continue
    if data_raw_2['dm_cc'] in ['None',None] :continue
    dm_cc_1=float(data_raw_1['dm_cc'])
    dm_cc_2=float(data_raw_2['dm_cc'])
    delta_dm=dm_cc_1-dm_cc_2
    if delta_dm<0.: delta_dm=-1.*delta_dm
    perfect_cc_1=solution_data_perfect_cc[s1]
    perfect_cc_2=solution_data_perfect_cc[s2]
    delta=perfect_cc_1-perfect_cc_2
    if delta<-0.05 or delta>0.05: continue
    if delta >=0:
      print >> pair_file, run,s1,s2,dm_cc_1,dm_cc_2,delta_dm
    else:
      print >> pair_file, run,s1,s2,dm_cc_2,dm_cc_1,delta_dm




# Output files. dm_cc_file receives one line per solution in the per-run
# analysis below; in the code visible here the other two files only get the
# header line written in this section.
dm_cc_file=open('cc_vs_dm_cc.list','w')
raw_out_3_dm=open('raw_3_dm.list','w')
zscore_out_3_dm=open('zscore_3_dm.list','w')

# Header lines: fixed leading columns followed by one
# "<type>-<type>_other" column per score type, separated by single blanks.
delta_columns=[score_type+"-"+score_type+"_other"
               for score_type in score_type_list]
print >>raw_out_3_dm," ".join(
  ["run solution other perfect_cc-perfect_cc_other"]+delta_columns)
print >>zscore_out_3_dm," ".join(
  ["run solution other perfect_cc-perfect_cc_other zscore-zscore_other"]
  +delta_columns)

# Per-score-type tallies, all keyed by score type.
# Integer counters (same/correct/incorrect are not updated in the code
# visible here):
same=dict.fromkeys(score_type_list,0)
correct=dict.fromkeys(score_type_list,0)
incorrect=dict.fromkeys(score_type_list,0)
top_correct=dict.fromkeys(score_type_list,0)
top_incorrect=dict.fromkeys(score_type_list,0)
# Floating-point sums and the number of runs that went into them:
top_perfect=dict.fromkeys(score_type_list,0.)
top_obtained=dict.fromkeys(score_type_list,0.)
top_n=dict.fromkeys(score_type_list,0.)
# Most negative (obtained - best) seen so far; None until the first run:
top_worst_delta=dict.fromkeys(score_type_list,None)

runs_used=0
unique_runs=[]
solutions_used=0
unique_runs_for_correct=[]
all_solutions_for_currect=[]
for run in run_list:
  if verbose:print "\nRUN ",run
  solution_data_raw=all_data_raw[run]
  solution_data_z_score=all_data_z_score[run]
  solution_data_overall_z=all_data_overall_z[run]
  solution_data_perfect_cc=all_data_perfect_cc[run]
  solution_list=solution_data_raw.keys()


  # Eliminate run if BUILD_CC is used and no run has BUILD_CC > build_cc_cut
  # NOT eliminating here 071208, but testing
  skip_this_one=False
  if 'BUILD_CC' in score_type_list:  
    highest_build_cc=None
    for solution in solution_list:
      data_raw=solution_data_raw[solution]
      try:
         raw=float(data_raw['BUILD_CC'])
         if highest_build_cc is None or raw>highest_build_cc:
           highest_build_cc=raw 
      except: pass
    if highest_build_cc is None or highest_build_cc<build_cc_cut:
      print "Skipping",run," with build_cc of ",highest_build_cc
      skip_this_one=True
  if len(solution_list)<1:   # nothing: skip entirely
    continue

  if len(solution_list)<2: 
     skip_this_one=True
     print "Skipping run ",run," with ",len(solution_list)," solutions"
   
 
  for score_type in score_type_list:
    highest_score=None
    highest_perfect=None
    perfect_of_highest_score=None
    for solution in solution_list:
 
      data_raw=solution_data_raw[solution]
      try:
         raw=float(data_raw[score_type])
      except: 
         raw=0.0
      if score_type=='FULL_DM_R':
         raw=1.-raw
      perfect_cc=solution_data_perfect_cc[solution]
      if replace_dm_cc:  # replace perfect_cc with dm_cc
         if 1:#try:
           dm_cc=float(data_raw['dm_cc'])
           ncs=info_dict[run]['ncs']
           frac=info_dict[run]['frac']
           expt_type=info_dict[run]['expt_type']
           resolution=info_dict[run]['resolution']
           if score_type=='dm_cc' and perfect_cc>=perfect_cc_min and \
            perfect_cc<=perfect_cc_max  and expt_type_only in [None,expt_type] \
              and resolution>=resolution_min and resolution<=resolution_max:

             # plot them all here
             print >> dm_cc_file , run,solution,perfect_cc,dm_cc,\
                  ncs,frac,resolution,expt_type
             if not run in unique_runs:
               runs_used+=1
               unique_runs.append(run)
             solutions_used+=1
           perfect_cc=dm_cc
         if 0:#except: 
            continue # skip it
      if highest_perfect is None or \
        perfect_cc>highest_perfect:
           highest_perfect=perfect_cc
           print "NEW HIGHEST PERFECT: ",highest_perfect,run,solution
      if highest_score is None or \
        raw>highest_score:
           highest_score=raw
           perfect_of_highest_score=perfect_cc
           print "NEW PERFECT OF HIGHEST SCORE: ",perfect_cc,score_type,raw,run,solution

    if highest_perfect is None or perfect_of_highest_score is None:
      raise AssertionError,"no highest_perfect? "+str(run)+" "+str(highest_perfect)+" "+str(perfect_of_highest_score)
      continue
    else:
      pass #raise AssertionError,"yes highest_perfect? "+str(run)

    if skip_this_one: continue
   
    if perfect_of_highest_score>=highest_perfect-tol_top: 
       top_correct[score_type]+=1
       print "CORRECT: ",score_type,perfect_of_highest_score,highest_perfect
    else:
       print "WRONG  : ",score_type,perfect_of_highest_score,highest_perfect
       top_incorrect[score_type]+=1
    if score_type == score_type_list[0]:
     if not run in unique_runs_for_correct:
      unique_runs_for_correct.append(run)
     for solution in solution_list:
      all_solutions_for_currect.append(solution)

    if highest_perfect is not None and perfect_of_highest_score is not None:
      top_perfect[score_type]+=highest_perfect
      top_obtained[score_type]+=perfect_of_highest_score
      top_n[score_type]+=1.
      dd= perfect_of_highest_score - highest_perfect
      if top_worst_delta[score_type] is None or dd <top_worst_delta[score_type]:
        top_worst_delta[score_type]=dd 
        print "NEW WORST DELTA FOR ",score_type,":",dd,"for ",run,solution
        
print "Final runs used: ",runs_used," solutions used: ",solutions_used 
print "RUNS used in correct/incorrect: ",len(unique_runs_for_correct),\
     " solutions used: ",len(all_solutions_for_currect)
f=open('unique_runs.dat','w')
for run in unique_runs:
  print >>f, run
f.close()
   
print "PERCENT TOP IS BEST"
for score_type in score_type_list:
  total=top_correct[score_type]+top_incorrect[score_type]
  if total<0.01: total=0.01
  percent=100.*float(top_correct[score_type])/total
  print score_type,top_correct[score_type],total,percent

print 
print "EFFECTS OF CHOICES"
print "DELTA = CC_of_best_possible_solution - CC_of_soln_with_best_score>" 
print " TYPE <DELTA>  (worst DELTA)   N"
for score_type in score_type_list:
  top_perfect[score_type]=top_perfect[score_type]/top_n[score_type]
  top_obtained[score_type]=top_obtained[score_type]/top_n[score_type]
  diff=top_perfect[score_type]-top_obtained[score_type]
  print score_type,":",diff,  top_worst_delta[score_type],top_n[score_type]

