#!/farm/bmm/apps2/bin/python

import math, string
from math import exp
from math import log
import random
import numpy as np
from array import *

output_file = open("coaltered_modules.txt", 'w')

class scoring_function (): # this class contains all of the coding for the mathematical manipulations in this process

	def __init__(self):
		pass

	#def binomial(self, n, k):
    		#result = 1
    		#for i in range(1, k+1):
        		#result = result*(n-i+1)/ i
    		#return result

	def log_fac(self, m, n):
    		summation = 0
		#print 'm', m, 'n', n
    		for i in range(int(m), int(n+1)):
			if i > 0:
				summation += log(i)
    		return summation
 
	def log_binomial(self, n,k):
		if k > (n-k):
			#print '!', (self.log_fac(n-k+1,n)-self.log_fac(2,k)) 
			return (self.log_fac(n-k+1,n)-self.log_fac(2,k))
    		else:
			#print '!!', (self.log_fac(k+1,n)-self.log_fac(2,n-k))
        		return (self.log_fac(k+1,n)-self.log_fac(2,n-k))

	def P(self, a, b, c, d, k, n):
		return (exp(self.log_binomial(a + b, k)))*(exp(self.log_binomial(c + d, a + c - k)))/(exp(self.log_binomial(n, a + c)))

	def S(self, P):
		if P > 0:
			return -log(P)
		if P < 0:
			#print 'error, used modulus'
			return 0
		if P == 0:
			# print 'Error, P = 0'
			return 0
		

class adjacency_matrix ():

	def get_seeds(self, input_file): # we need to define a group called 'connect' two sets of genes, our two modules
		file = open(input_file, 'r')
		lines = file.readlines()
		connect = []
		temp = []
		ID_list = lines[0].split()
		for x in ID_list:
			temp.append(x)
		connect.append(temp)
		ID_list = lines[1].split()
		temp = []
		for x in ID_list:
			temp.append(x)
		connect.append(temp)
		return connect
		

	def get_data_from_matrix(self, input_file, x): # This is an alternative to the above 'get_seeds', if the startign data is a adjacency matrix. This sorts genes into their modules, using an adjacency matrix
		file = open(input_file, 'r')
		lines = file.readlines()
		connect = []
		# processing the adjecency matrix
		for i in range(x, len(lines)):
			l = lines[i].split()
			l.remove(l[0])
			j = 0;
	
			# check/create the isloated group array
			sign = 0
			if len(connect) == 0:
				temp = []
				sign = 1
	
			for x in l:
				j=j+1	
				if x == '1':
					if i < j:
						# check if the interacting partner is, and j is not, in array
						if sign == 1:
							if i not in temp:
								temp.append(i)
								temp.append(j)
							else:
								temp.append(j)
							connect.append(temp)
							sign = 0
						else:
							for array in connect:
								flag = 0
								for g in array: 
									if g == i:
										flag += 1
									elif g == j:
										flag += 2
							if flag == 1:
								array.append(j)
							if flag == 0:
								temp = [i]
								temp.append(j)
								connect.append(temp)
					elif i == j:
						if sign == 1:
							if i not in temp:
									temp.append(i)
									connect.append(temp)
									sign = 0
									
						else:
							flag = 0
							for array in connect:
								if i in array:
									flag = 1
							if flag == 0:
								temp = [i]
								connect.append(temp)
								
				else:
					continue
		print 'There are', len(connect), 'module(s)'
		# print "%d module(s): %s" %(len(connect), connect)			
		return connect # connect is a master array, containing multiple arrays. Each of these arrays represents one module.
		

	def expand(self, input_file, input_expression_data, seeds, connect, module_data): # This calculates the co-alteration scores for different configurations of the seed genes and their interaction partners
		exp = expression_analysis(input_expression_data)
		app = scoring_function()
		coalteration_values = {}
		n = exp.n(input_expression_data)
		#a = abc[0]
		#b = abc[1]
		#c = abc[2]
		#d = n - (a + b + c)		
		#result = 0
		#app = scoring_function()
		#for k in range (0, int(a)):
		#	result = result + app.P(a, b, c, d, k, n)
		#PG1G2 = 1.0 - result # probability score
		#key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary
		#for MG in seeds:
	#		temp = ()
	#		for s in MG:
	#			tupe = (s,)
	#			temp = temp + tupe	
	#		key = key + (temp,)
	#	coalteration_values[key] = app.S(PG1G2) # This is the co-alteration score being added to a dictionary

		# expanding through network, finding more co-alteration values
		File = open(input_file, 'r') # reading adjacency matrix file
		lines = File.readlines()
		rows = []
		for i in range(1, len(lines)):
			l = lines[i].split()
			l.remove(l[0])
			rows.append(l)
		for module_group in seeds:
			usedgenes = []# to ensure the same combination is not computed multiple times
			for seed in module_group:
				usedgenes.append(seed)
			for seed in module_group:
				j = 0 # j will be the new gene being added for co-alteration score calculation
				for x in rows[seed - 1]: # below, we find out which genes the seed gene is directly interacting with (j)
					j += 1 # j is a direct interaction partner of one of the seed genes (or upon iteration, the interaction partner of one of the genes added to the group)
					if x == '1': # '1' means there is direct interaction
						flag = 0
						for usedgene in usedgenes: # to ensure the same combination is not computed multiple times
							if j == usedgene:
								flag += 1
						if flag == 0:
							usedgenes.append(j) # to ensure the same combination is not computed multiple times
							module_group.append(j) # this adds to the seeds set, so that we can compute the co-alteration score for a new combination of genes.
							abc = exp.abc(seeds, connect, module_data)
							a = abc[0]
							b = abc[1]
							c = abc[2]
							d = n - (a + b + c)
							result = 0 
							for k in range (0, int(a)):
								result = result + app.P(a, b, c, d, k, n)
							PG1G2 = 1.0 - result # probability score
							key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary
							for MG in seeds:
								temp = ()
								for s in MG:
									tupe = (s,)
									temp = temp + tupe	
								key = key + (temp,)
							coalteration_values[key] = app.S(PG1G2) # This is the co-alteration score being added to a dictionary
							module_group.remove(j) # here, we return back to the original seeds set, ready to add a new gene upon iteration
		if coalteration_values == {}: # the the event that there are no more genes to add
			key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary
			for MG in seeds:
				temp = ()
				for s in MG:
					tupe = (s,)
					temp = temp + tupe	
				key = key + (temp,)
			coalteration_values[key] = 0

		return coalteration_values
	
			
class expression_analysis(): # Only works for comparing TWO modules

	def __init__ (self, input_expression_data): # reads expression data file
		file = open(input_expression_data, 'r')
		lines = file.readlines()
		self.lines = lines

	def n (self, input_expression_data):
		for i in range(1, len(self.lines)):
			l = self.lines[i].split("\t")
			l.remove(l[0])
			# l.remove(l[-1]) # This is needed if there is a 'tab' space at the end of the patient expression data line
		n = float(len(l))
		return n

	def get_module_data(self, input_expression_data, connect): # This makes an array parallel to 'connect', with arrays of expression data from the samples instead of gene numbers
		module_data = connect
		# print "connect", module_data
		j = 0
		for i in range(1, len(self.lines)):
			l = self.lines[i].split("\t")
			l.remove(l[0])
			# l.remove(l[-1]) # This is needed if there is a 'tab' space at the end of the patient expression data line
			l = [float(unit) for unit in l]
			j += 1
			for dataset in module_data:
				for sample in dataset:
					if j == sample:
						dataset.append(l)
						dataset.remove(sample)		
		return module_data
		
	def abc(self, seeds, connect, module_data): # returns the number of samples with mutations in both modules, and each module separately, in the genes in 'seeds'
		mutations = [] # patients with mutation in first module. After flag = 2, this is for patients with mutations in the first, but not the second, module.
		comutations = [] # patients with mutations in both modules
		mutations2 = [] # patients  with mutations in second module, but not first module
		flag = 0
		for module_group in seeds: # Looks at a module set from seeds
			flag += 1
			for seed in module_group: # looks at individual genes in that module set
				##for array in connect:
				for array in range(0, len(connect)):
					##for g in array:	
					for g in range (0, len(connect[array])):					
						##if g == seed: # we see which module this gene belongs to, by checking 'connect'
						 if connect[array][g] == seed:
							sample = module_data[array][g]
							#for dataset in module_data:
								#if module_data.index(dataset) == connect.index(array): # We find the corresponding expression data 
									##for sample in dataset:
										##if dataset.index(sample) == array.index(g):
							std = np.std(sample) # standard deviation. Criteria for alteration is > +- std.
							mean = np.mean(sample) # mean of sample
							for unit in range (0, len(sample)): # looking at individual patient's expression data
								if sample[unit] > (mean + std) or sample[unit] < (mean - std): #(float(sum (sample)))/(float(len(sample))): # check for alteration. enrer criteria for 'alteraion' in this line.
									if flag == 1: # for the first module, the index of any patient with upregulation is added to the array 'mutations'. No upregulation, or patient number already in array, then it is not added.
										sign = 0
										for patient in mutations:
											if patient == unit:
												sign += 1
										if sign ==0:
											mutations.append(unit)
									if flag == 2: # when we look as the second module, we check to see if patients with upregulated expression in this modules gene(s) are also in 'mutations'. if so, they are removed from mutations and added to 'comutations'. 
										sign = 0
										for patient in mutations:
											if unit == patient:
												sign += 1
														
										if sign > 0:
											signal = 0
											for coalteration in comutations:
												if coalteration == unit:
													signal += 1
											if signal == 0:
												comutations.append(unit)
												mutations.remove(unit)
										if sign == 0: # if there is upregulation, but no comutation, patient number is added to 'mutations2'
											signal = 0
											for patient in mutations2:
												if patient == unit:
													signal += 1
																
											for coalteration in comutations:
												if coalteration == unit:
													signal += 1
											if signal == 0:
												mutations2.append(unit)
		
									if flag > 2:
										print "Error - too many modules?"
		a = float(len(comutations))
		b = float(len(mutations))
		c = float(len(mutations2))
		return (a, b, c) 
	

def main():

	input_graph = raw_input("which adjacency matrix file to read: ")
	input_expression_data1 = raw_input ("First set of expression data to read: ")
	input_genes = raw_input ("Input seeds gene list: ")
	#input_expression_data2 = raw_input ("Second set of expression data to read: ")
	adj = adjacency_matrix ()
	exp = expression_analysis(input_expression_data1)
	app = scoring_function()

	gene_set = adj.get_data_from_matrix(input_graph, 1) # connect is an array containing arrays (the 'modules') containing gene numbers for distinct modules
	module_data_sets = exp.get_module_data(input_expression_data1, gene_set)# this retrieves expression data and orders it as per 'connect'
	chance =  adj.get_data_from_matrix(input_graph, 1) # restores connect, under a new name, 'chance'
	chance2 = adj.get_data_from_matrix(input_graph, 1) # this is here so that after modules are removed from 'chance' we will still be able to know their original sequence, in order to make the module_data correlate.

	# IF YOU WANT TO RANDOMLY SELECT MODULES AND SEEDS
	#import random
	#connect = random.sample(chance, 2)
	#module_data = []
	#print '2 random modules', connect
	#for array in connect:
	#	i = chance.index(array)
	#	ii = connect.index(array)
	#	module_data.insert(ii, module_data_sets[i])
	#seeds = []

	#for array in connect:
	#	g = random.sample(array, 1)
	#	seeds.append(g)
	print 'Modules', chance
	for module in chance: # we need to methodically choose two modules from the interactome
		connect = []
		connect.append(module) # here, the first module is chosen
		chance.remove(module) # to make sure the same module combinations are not repeated
		for anothermodule in chance:
			connect.append(anothermodule) # here, the second module is chosen
			# output_file.write("Looking at modules %s and %s\n" % (module, anothermodule))
			module_data = []
			ii = -1
			for array in connect: # making sure that the module data fits connect
				ii += 1 #connect.index(array)
				i = chance2.index(array)
	 			ii = connect.index(array)
			 	module_data.insert(ii, module_data_sets[i])# Now we need to methodically choose seed genes
				#print 'CONNECT', connect, 'MODULE DATA', module_data
				print 'Looking at modules', connect[0], 'and', connect[1]
			for gene in connect[0]: # look at first module
				for g in connect[1]:# look at second module
					seeds = []
					seeds.append([gene]) #  seed gene from first module chosen
					seeds.append([g]) # seed gene from secodn module chosen
					print 'The seed genes are', seeds# these are the randomly generated starting seeds, step 1 from Fig.2 of Gu et al.
					originalseeds = seeds # this is to make sure that we have access to the original seeds, though the array 'seeds' will change
		
	
					coalteration_values1 = adj.expand(input_graph, input_expression_data1, seeds, connect, module_data) # this is step 2 from Fig.2 of Gu et al.
					v=list(coalteration_values1.values())
					k=list(coalteration_values1.keys())
					key = k[v.index(max(v))]# this returns the gene set, as a tuple, with the maximum co-alteration score (step 3)

					#key = max(coalteration_values1, key=coalteration_values1.get) 
					seeds = [] # below, we convert the tuple key back into an array called 'seeds'. This is the new set of genes, which is passed to step 4
					for series in key:
						module_group = []
						for s in series:
							module_group.append(s)
						seeds.append(module_group)
					flag = 0
					runs = -1
					print 'Step 3 genes:', seeds
					while flag == 0: # creates loop
						marker = 0 # this will decide whether the while loop continues or ends
						flare = 0 # if flare goes above zero, at least one combination during step 5 in which a gene has been removed from the set yields aa higher score, so this combination must be kept and recycled back to step 3.
						runs += 1 # Number of times we go through the 'loop'
						
						n = exp.n(input_expression_data1)# below we re-caclulate the step 3 seed gene score for later comparison, to mak sure the co-alteration score continues to increase if the programme continues
						abc = exp.abc(seeds, connect, module_data)
						a = abc[0]
						b = abc[1]
						c = abc[2]
						d = n - (a + b + c)		
						result = 0
						for k in range (0, int(a)):
							result = result + app.P(a, b, c, d, k, n)
						PG1G2 = 1.0 - result # probability score
						original_score = app.S(PG1G2)
						print 'Step 3 score: ', original_score, seeds

						coalteration_values2 = adj.expand(input_graph, input_expression_data1, seeds, connect, module_data) # step 4

						v=list(coalteration_values2.values())
						k=list(coalteration_values2.keys())
						keys = k[v.index(max(v))] # this returns the gene set, as a tuple, with the maximum co-alteration score
						key_value = coalteration_values2[keys] # get value to compare to later, in step 5


						if key_value <= original_score: # exit from step 4 if none of the combinations increase the co-alteration score
							flag += 1
							final_score = original_score
							print 'No change', seeds, 'score:', original_score

						if key_value > original_score:							
							newseeds = [] # convert key for maximum value from tuple, into an array called 'newseeds' for step 5
							for series in keys:# convert tuple 'keys' to array 'newseeds'
								MG = []
								for s in series:
									MG.append(s)
								newseeds.append(MG)

							for MG in newseeds: # the newly added gene from the step 4 process (expand) is not removed along with other non-original seeds in step 5, in order to ensure we cannot go backwards to the same set of genes we had in step 3.
								for s in MG:
									sign = 0
									for module_group in seeds:
										for seed in module_group:
											if s == seed:
												sign += 1
									if sign == 0:
										newly_added = s
										print 'Newly added gene:', newly_added, 'score: ', key_value
							if newseeds == seeds:
								flag += 1
								final_score = original_score
								print 'Error. No change', newseeds, 'score:', original_score
							else:
								higher_sets = {}
								for MG in newseeds: # step 5 of Fig.2 in Gu et al., we remove each new gene in turn, and see if co-alteration score change
									for s in MG:
										sign = 0
										if s == newly_added:
											sign += 1
										else:
											for module_group in originalseeds:
												for seed in module_group:
													if s == seed:
														sign += 1
							
							
										if sign == 0:
											index = MG.index(s) # find index of s so it can be re-inserted in the right place later					
											MG.remove(s)# re-computation of co-alteration score, minus new gene
											n = exp.n(input_expression_data1)
											abc = exp.abc(newseeds, connect, module_data)
											a = abc[0]
											b = abc[1]
											c = abc[2]
											d = n - (a + b + c)
											result = 0 
											app = scoring_function() 
											for k in range (0, int(a)):	
												result = result + app.P(a, b, c, d, k, n)
											PG1G2 = 1.0 - result
											score = app.S(PG1G2)
											if score == key_value: # co-alteration score is no longer increasing, loop will break (Step 6)
												MG.insert(index, s)
												print 'Removing', s, 'has no effect', newseeds, 'score:', score
											if score > key_value: # co-alteration score has increased in this instance
												flare += 1
												key = ()
												for group in newseeds:
													temp = ()
													for item in group:
														tupe = (item,)
														temp = temp + tupe	
													key = key + (temp,)
												higher_sets[key] = score	
												MG.insert(index, s)
												print 'Removing', s,'heightens score', newseeds, 'score:', score
											
											if score < key_value:
												marker += 1				
												MG.insert(index, s) # s is re-inserted into the correct position in its module group array 
												print 'Removing', s,'lowers score', newseeds, 'score:', score
							if flare > 0: # a gene removal generated a higher co-alteration score, therefore, we must find out which removal gave the highest score (if there was more than one) and feed that back to step 3 by re-naming this group 'seeds'. 
								v=list(higher_sets.values())
								k=list(higher_sets.keys())
								key = k[v.index(max(v))] # this returns the gene set, as a tuple, with the maximum co-alteration score
								seeds = [] # convert key for maximum value from tuple, into an array called 'newseeds' for step 5
								for series in key:
									module_group = []
									for seed in series:
										module_group.append(seed)
									seeds.append(module_group)
								print 'Step 3 genes:', seeds, 'score: ', max(v) 
							if marker == 0 and flare == 0: # there has been no increase or depreciation, therefore the loop ends and we get our final set
								seeds = newseeds
								final_score = key_value
								flag += 1
							if marker > 0 and flare == 0: # there has been a deprecation when a gene was removed, so we can continue to grow. the loop continues.
								seeds = newseeds
								print 'Step 3 genes:', seeds, 'score: ', key_value
					originalseeds = sorted(originalseeds)
					seeds = sorted(seeds) # makes readout easier to understand	
					output_file.write("%s\tstarting seeds: %s\tfinal seeds: %s\n" % (final_score, originalseeds, seeds))
					print 'END:', seeds, 'score:', final_score, 'Runs:', runs, 'marker', marker, 'flag', flag

			connect.remove(anothermodule) # now ready to select a new module-pair at which to look


if __name__ == '__main__':		
	main()