#!/farm/bmm/apps2/bin/python import math, string from math import exp from math import log import random import numpy as np from array import * output_file = open("coaltered_modules.txt", 'w') class scoring_function (): # this class contains all of the coding for the mathematical manipulations in this process def __init__(self): pass #def binomial(self, n, k): #result = 1 #for i in range(1, k+1): #result = result*(n-i+1)/ i #return result def log_fac(self, m, n): summation = 0 #print 'm', m, 'n', n for i in range(int(m), int(n+1)): if i > 0: summation += log(i) return summation def log_binomial(self, n,k): if k > (n-k): #print '!', (self.log_fac(n-k+1,n)-self.log_fac(2,k)) return (self.log_fac(n-k+1,n)-self.log_fac(2,k)) else: #print '!!', (self.log_fac(k+1,n)-self.log_fac(2,n-k)) return (self.log_fac(k+1,n)-self.log_fac(2,n-k)) def P(self, a, b, c, d, k, n): return (exp(self.log_binomial(a + b, k)))*(exp(self.log_binomial(c + d, a + c - k)))/(exp(self.log_binomial(n, a + c))) def S(self, P): if P > 0: return -log(P) if P < 0: #print 'error, used modulus' return 0 if P == 0: # print 'Error, P = 0' return 0 class adjacency_matrix (): def get_seeds(self, input_file): # we need to define a group called 'connect' two sets of genes, our two modules file = open(input_file, 'r') lines = file.readlines() connect = [] temp = [] ID_list = lines[0].split() for x in ID_list: temp.append(x) connect.append(temp) ID_list = lines[1].split() temp = [] for x in ID_list: temp.append(x) connect.append(temp) return connect def get_data_from_matrix(self, input_file, x): # This is an alternative to the above 'get_seeds', if the startign data is a adjacency matrix. This sorts genes into their modules, using an adjacency matrix file = open(input_file, 'r') lines = file.readlines() connect = [] # processing the adjecency matrix for i in range(x, len(lines)): l = lines[i].split() l.remove(l[0]) j = 0; # check/create the isloated group array sign = 0 if len(connect) == 0: temp = [] sign = 1 for x in l: j=j+1 if x == '1': if i < j: # check if the interacting partner is, and j is not, in array if sign == 1: if i not in temp: temp.append(i) temp.append(j) else: temp.append(j) connect.append(temp) sign = 0 else: for array in connect: flag = 0 for g in array: if g == i: flag += 1 elif g == j: flag += 2 if flag == 1: array.append(j) if flag == 0: temp = [i] temp.append(j) connect.append(temp) elif i == j: if sign == 1: if i not in temp: temp.append(i) connect.append(temp) sign = 0 else: flag = 0 for array in connect: if i in array: flag = 1 if flag == 0: temp = [i] connect.append(temp) else: continue print 'There are', len(connect), 'module(s)' # print "%d module(s): %s" %(len(connect), connect) return connect # connect is a master array, containing multiple arrays. Each of these arrays represents one module. def expand(self, input_file, input_expression_data, seeds, connect, module_data): # This calculates the co-alteration scores for different configurations of the seed genes and their interaction partners exp = expression_analysis(input_expression_data) app = scoring_function() coalteration_values = {} n = exp.n(input_expression_data) #a = abc[0] #b = abc[1] #c = abc[2] #d = n - (a + b + c) #result = 0 #app = scoring_function() #for k in range (0, int(a)): # result = result + app.P(a, b, c, d, k, n) #PG1G2 = 1.0 - result # probability score #key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary #for MG in seeds: # temp = () # for s in MG: # tupe = (s,) # temp = temp + tupe # key = key + (temp,) # coalteration_values[key] = app.S(PG1G2) # This is the co-alteration score being added to a dictionary # expanding through network, finding more co-alteration values File = open(input_file, 'r') # reading adjacency matrix file lines = File.readlines() rows = [] for i in range(1, len(lines)): l = lines[i].split() l.remove(l[0]) rows.append(l) for module_group in seeds: usedgenes = []# to ensure the same combination is not computed multiple times for seed in module_group: usedgenes.append(seed) for seed in module_group: j = 0 # j will be the new gene being added for co-alteration score calculation for x in rows[seed - 1]: # below, we find out which genes the seed gene is directly interacting with (j) j += 1 # j is a direct interaction partner of one of the seed genes (or upon iteration, the interaction partner of one of the genes added to the group) if x == '1': # '1' means there is direct interaction flag = 0 for usedgene in usedgenes: # to ensure the same combination is not computed multiple times if j == usedgene: flag += 1 if flag == 0: usedgenes.append(j) # to ensure the same combination is not computed multiple times module_group.append(j) # this adds to the seeds set, so that we can compute the co-alteration score for a new combination of genes. abc = exp.abc(seeds, connect, module_data) a = abc[0] b = abc[1] c = abc[2] d = n - (a + b + c) result = 0 for k in range (0, int(a)): result = result + app.P(a, b, c, d, k, n) PG1G2 = 1.0 - result # probability score key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary for MG in seeds: temp = () for s in MG: tupe = (s,) temp = temp + tupe key = key + (temp,) coalteration_values[key] = app.S(PG1G2) # This is the co-alteration score being added to a dictionary module_group.remove(j) # here, we return back to the original seeds set, ready to add a new gene upon iteration if coalteration_values == {}: # the the event that there are no more genes to add key = () # below, the new seeds array is converted into a tuple, in order to be used as a key in the dictionary for MG in seeds: temp = () for s in MG: tupe = (s,) temp = temp + tupe key = key + (temp,) coalteration_values[key] = 0 return coalteration_values class expression_analysis(): # Only works for comparing TWO modules def __init__ (self, input_expression_data): # reads expression data file file = open(input_expression_data, 'r') lines = file.readlines() self.lines = lines def n (self, input_expression_data): for i in range(1, len(self.lines)): l = self.lines[i].split("\t") l.remove(l[0]) # l.remove(l[-1]) # This is needed if there is a 'tab' space at the end of the patient expression data line n = float(len(l)) return n def get_module_data(self, input_expression_data, connect): # This makes an array parallel to 'connect', with arrays of expression data from the samples instead of gene numbers module_data = connect # print "connect", module_data j = 0 for i in range(1, len(self.lines)): l = self.lines[i].split("\t") l.remove(l[0]) # l.remove(l[-1]) # This is needed if there is a 'tab' space at the end of the patient expression data line l = [float(unit) for unit in l] j += 1 for dataset in module_data: for sample in dataset: if j == sample: dataset.append(l) dataset.remove(sample) return module_data def abc(self, seeds, connect, module_data): # returns the number of samples with mutations in both modules, and each module separately, in the genes in 'seeds' mutations = [] # patients with mutation in first module. After flag = 2, this is for patients with mutations in the first, but not the second, module. comutations = [] # patients with mutations in both modules mutations2 = [] # patients with mutations in second module, but not first module flag = 0 for module_group in seeds: # Looks at a module set from seeds flag += 1 for seed in module_group: # looks at individual genes in that module set ##for array in connect: for array in range(0, len(connect)): ##for g in array: for g in range (0, len(connect[array])): ##if g == seed: # we see which module this gene belongs to, by checking 'connect' if connect[array][g] == seed: sample = module_data[array][g] #for dataset in module_data: #if module_data.index(dataset) == connect.index(array): # We find the corresponding expression data ##for sample in dataset: ##if dataset.index(sample) == array.index(g): std = np.std(sample) # standard deviation. Criteria for alteration is > +- std. mean = np.mean(sample) # mean of sample for unit in range (0, len(sample)): # looking at individual patient's expression data if sample[unit] > (mean + std) or sample[unit] < (mean - std): #(float(sum (sample)))/(float(len(sample))): # check for alteration. enrer criteria for 'alteraion' in this line. if flag == 1: # for the first module, the index of any patient with upregulation is added to the array 'mutations'. No upregulation, or patient number already in array, then it is not added. sign = 0 for patient in mutations: if patient == unit: sign += 1 if sign ==0: mutations.append(unit) if flag == 2: # when we look as the second module, we check to see if patients with upregulated expression in this modules gene(s) are also in 'mutations'. if so, they are removed from mutations and added to 'comutations'. sign = 0 for patient in mutations: if unit == patient: sign += 1 if sign > 0: signal = 0 for coalteration in comutations: if coalteration == unit: signal += 1 if signal == 0: comutations.append(unit) mutations.remove(unit) if sign == 0: # if there is upregulation, but no comutation, patient number is added to 'mutations2' signal = 0 for patient in mutations2: if patient == unit: signal += 1 for coalteration in comutations: if coalteration == unit: signal += 1 if signal == 0: mutations2.append(unit) if flag > 2: print "Error - too many modules?" a = float(len(comutations)) b = float(len(mutations)) c = float(len(mutations2)) return (a, b, c) def main(): input_graph = raw_input("which adjacency matrix file to read: ") input_expression_data1 = raw_input ("First set of expression data to read: ") input_genes = raw_input ("Input seeds gene list: ") #input_expression_data2 = raw_input ("Second set of expression data to read: ") adj = adjacency_matrix () exp = expression_analysis(input_expression_data1) app = scoring_function() gene_set = adj.get_data_from_matrix(input_graph, 1) # connect is an array containing arrays (the 'modules') containing gene numbers for distinct modules module_data_sets = exp.get_module_data(input_expression_data1, gene_set)# this retrieves expression data and orders it as per 'connect' chance = adj.get_data_from_matrix(input_graph, 1) # restores connect, under a new name, 'chance' chance2 = adj.get_data_from_matrix(input_graph, 1) # this is here so that after modules are removed from 'chance' we will still be able to know their original sequence, in order to make the module_data correlate. # IF YOU WANT TO RANDOMLY SELECT MODULES AND SEEDS #import random #connect = random.sample(chance, 2) #module_data = [] #print '2 random modules', connect #for array in connect: # i = chance.index(array) # ii = connect.index(array) # module_data.insert(ii, module_data_sets[i]) #seeds = [] #for array in connect: # g = random.sample(array, 1) # seeds.append(g) print 'Modules', chance for module in chance: # we need to methodically choose two modules from the interactome connect = [] connect.append(module) # here, the first module is chosen chance.remove(module) # to make sure the same module combinations are not repeated for anothermodule in chance: connect.append(anothermodule) # here, the second module is chosen # output_file.write("Looking at modules %s and %s\n" % (module, anothermodule)) module_data = [] ii = -1 for array in connect: # making sure that the module data fits connect ii += 1 #connect.index(array) i = chance2.index(array) ii = connect.index(array) module_data.insert(ii, module_data_sets[i])# Now we need to methodically choose seed genes #print 'CONNECT', connect, 'MODULE DATA', module_data print 'Looking at modules', connect[0], 'and', connect[1] for gene in connect[0]: # look at first module for g in connect[1]:# look at second module seeds = [] seeds.append([gene]) # seed gene from first module chosen seeds.append([g]) # seed gene from secodn module chosen print 'The seed genes are', seeds# these are the randomly generated starting seeds, step 1 from Fig.2 of Gu et al. originalseeds = seeds # this is to make sure that we have access to the original seeds, though the array 'seeds' will change coalteration_values1 = adj.expand(input_graph, input_expression_data1, seeds, connect, module_data) # this is step 2 from Fig.2 of Gu et al. v=list(coalteration_values1.values()) k=list(coalteration_values1.keys()) key = k[v.index(max(v))]# this returns the gene set, as a tuple, with the maximum co-alteration score (step 3) #key = max(coalteration_values1, key=coalteration_values1.get) seeds = [] # below, we convert the tuple key back into an array called 'seeds'. This is the new set of genes, which is passed to step 4 for series in key: module_group = [] for s in series: module_group.append(s) seeds.append(module_group) flag = 0 runs = -1 print 'Step 3 genes:', seeds while flag == 0: # creates loop marker = 0 # this will decide whether the while loop continues or ends flare = 0 # if flare goes above zero, at least one combination during step 5 in which a gene has been removed from the set yields aa higher score, so this combination must be kept and recycled back to step 3. runs += 1 # Number of times we go through the 'loop' n = exp.n(input_expression_data1)# below we re-caclulate the step 3 seed gene score for later comparison, to mak sure the co-alteration score continues to increase if the programme continues abc = exp.abc(seeds, connect, module_data) a = abc[0] b = abc[1] c = abc[2] d = n - (a + b + c) result = 0 for k in range (0, int(a)): result = result + app.P(a, b, c, d, k, n) PG1G2 = 1.0 - result # probability score original_score = app.S(PG1G2) print 'Step 3 score: ', original_score, seeds coalteration_values2 = adj.expand(input_graph, input_expression_data1, seeds, connect, module_data) # step 4 v=list(coalteration_values2.values()) k=list(coalteration_values2.keys()) keys = k[v.index(max(v))] # this returns the gene set, as a tuple, with the maximum co-alteration score key_value = coalteration_values2[keys] # get value to compare to later, in step 5 if key_value <= original_score: # exit from step 4 if none of the combinations increase the co-alteration score flag += 1 final_score = original_score print 'No change', seeds, 'score:', original_score if key_value > original_score: newseeds = [] # convert key for maximum value from tuple, into an array called 'newseeds' for step 5 for series in keys:# convert tuple 'keys' to array 'newseeds' MG = [] for s in series: MG.append(s) newseeds.append(MG) for MG in newseeds: # the newly added gene from the step 4 process (expand) is not removed along with other non-original seeds in step 5, in order to ensure we cannot go backwards to the same set of genes we had in step 3. for s in MG: sign = 0 for module_group in seeds: for seed in module_group: if s == seed: sign += 1 if sign == 0: newly_added = s print 'Newly added gene:', newly_added, 'score: ', key_value if newseeds == seeds: flag += 1 final_score = original_score print 'Error. No change', newseeds, 'score:', original_score else: higher_sets = {} for MG in newseeds: # step 5 of Fig.2 in Gu et al., we remove each new gene in turn, and see if co-alteration score change for s in MG: sign = 0 if s == newly_added: sign += 1 else: for module_group in originalseeds: for seed in module_group: if s == seed: sign += 1 if sign == 0: index = MG.index(s) # find index of s so it can be re-inserted in the right place later MG.remove(s)# re-computation of co-alteration score, minus new gene n = exp.n(input_expression_data1) abc = exp.abc(newseeds, connect, module_data) a = abc[0] b = abc[1] c = abc[2] d = n - (a + b + c) result = 0 app = scoring_function() for k in range (0, int(a)): result = result + app.P(a, b, c, d, k, n) PG1G2 = 1.0 - result score = app.S(PG1G2) if score == key_value: # co-alteration score is no longer increasing, loop will break (Step 6) MG.insert(index, s) print 'Removing', s, 'has no effect', newseeds, 'score:', score if score > key_value: # co-alteration score has increased in this instance flare += 1 key = () for group in newseeds: temp = () for item in group: tupe = (item,) temp = temp + tupe key = key + (temp,) higher_sets[key] = score MG.insert(index, s) print 'Removing', s,'heightens score', newseeds, 'score:', score if score < key_value: marker += 1 MG.insert(index, s) # s is re-inserted into the correct position in its module group array print 'Removing', s,'lowers score', newseeds, 'score:', score if flare > 0: # a gene removal generated a higher co-alteration score, therefore, we must find out which removal gave the highest score (if there was more than one) and feed that back to step 3 by re-naming this group 'seeds'. v=list(higher_sets.values()) k=list(higher_sets.keys()) key = k[v.index(max(v))] # this returns the gene set, as a tuple, with the maximum co-alteration score seeds = [] # convert key for maximum value from tuple, into an array called 'newseeds' for step 5 for series in key: module_group = [] for seed in series: module_group.append(seed) seeds.append(module_group) print 'Step 3 genes:', seeds, 'score: ', max(v) if marker == 0 and flare == 0: # there has been no increase or depreciation, therefore the loop ends and we get our final set seeds = newseeds final_score = key_value flag += 1 if marker > 0 and flare == 0: # there has been a deprecation when a gene was removed, so we can continue to grow. the loop continues. seeds = newseeds print 'Step 3 genes:', seeds, 'score: ', key_value originalseeds = sorted(originalseeds) seeds = sorted(seeds) # makes readout easier to understand output_file.write("%s\tstarting seeds: %s\tfinal seeds: %s\n" % (final_score, originalseeds, seeds)) print 'END:', seeds, 'score:', final_score, 'Runs:', runs, 'marker', marker, 'flag', flag connect.remove(anothermodule) # now ready to select a new module-pair at which to look if __name__ == '__main__': main()