-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsubmodels_module.py
320 lines (275 loc) · 17.6 KB
/
submodels_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import numpy as np
import pickle
from functools import partial
from model_module import model
import load_format_data
import plot_model
class seq_to_x_model():
    """Mixin that picks the sequence input getter from the architecture name.

    Architectures whose name contains 'emb' use
    ``load_format_data.get_ordinal``; every other architecture uses
    ``load_format_data.get_onehot``.
    """

    def __init__(self, model_architecture):
        uses_embedding = 'emb' in model_architecture
        self.get_input_seq = (
            load_format_data.get_ordinal
            if uses_embedding
            else load_format_data.get_onehot
        )
class assay_to_x_model():
    """Mixin whose input getter is ``load_format_data.get_assays`` bound to `assays`."""

    def __init__(self, assays):
        # Bind the assay selection now; callers supply only the remaining arguments.
        assay_getter = partial(load_format_data.get_assays, assays)
        self.get_input_seq = assay_getter
class control_to_x_model():
    """Mixin that uses ``load_format_data.get_control`` as the input getter.

    The original author noted that this input may not actually be needed.
    """

    def __init__(self):
        control_getter = load_format_data.get_control
        self.get_input_seq = control_getter
class sequence_embedding_to_x_model():
    """Mixin that uses ``load_format_data.get_embedding`` as the input getter.

    Per the original note, the embedding comes from a saved seq-to-assay model.
    """

    def __init__(self):
        embedding_getter = load_format_data.get_embedding
        self.get_input_seq = embedding_getter
class x_to_yield_model(model):
    """Base class for models whose output is yield.

    Sets the output exploder, plot type, default training/testing frames
    and the cross-validation / hyper-parameter trial counts.  Subclasses
    provide ``get_input_seq``; ``save_predictions`` and
    ``apply_predicted_assay_scores`` additionally read ``self.assay_str``,
    which this class does not set itself.
    """

    def __init__(self, model_in, model_architecture, sample_fraction):
        super().__init__(model_in, 'yield', model_architecture, sample_fraction)
        self.get_output_and_explode = load_format_data.explode_yield
        self.plot_type = plot_model.x_to_yield_plot
        self.training_df = load_format_data.load_df('assay_to_dot_training_data')
        self.testing_df = load_format_data.load_df('seq_to_dot_test_data')
        self.lin_or_sig = 'linear'
        self.num_cv_splits = 10
        self.num_cv_repeats = 10
        self.num_test_repeats = 10
        self.num_hyp_trials = 50

    def change_sample_seed(self, seed):
        """Store `seed` as the sample seed and prefix the model name with it."""
        self.sample_seed = seed
        self.update_model_name('seed' + str(seed) + '_' + self.model_name)

    def save_predictions(self, input_df_description=None):
        """Pickle model predictions for a data frame under ./datasets/predicted/.

        Args:
            input_df_description: name of a frame to load from the
                'predicted/' prefix.  When falsy, the frame named
                'seq_to_assay_train_' + self.assay_str is loaded instead
                (only some of those files exist, but more can be created).
        """
        if not input_df_description:
            input_df_description = 'seq_to_assay_train_' + self.assay_str
            df = load_format_data.load_df(input_df_description)
        else:
            # Used when the inputs are predicted embeddings.
            df = load_format_data.load_df('predicted/' + input_df_description)
        one_hot = np.eye(2)
        matrix_col = ['IQ_Average_bc', 'SH_Average_bc']
        x_a = self.get_input_seq(df)
        for z in range(1):  # number of saved models to use
            self.load_model(z)
            # One pass per output column; each pass tags every input row with
            # that column's one-hot categorical variable.
            for one_hot_row, col_name in zip(one_hot, matrix_col):
                cat_var = [one_hot_row.tolist() for _ in x_a]
                x = load_format_data.mix_with_cat_var(x_a, cat_var)
                df_prediction = self._model.model.predict(x).squeeze().tolist()
                df.loc[:, col_name] = df_prediction
                # A zero-filled '_std' column is written next to each prediction.
                df.loc[:, col_name + '_std'] = [0] * len(df_prediction)
            df.to_pickle('./datasets/predicted/' + input_df_description + '_' + self.model_name + '_' + str(z) + '.pkl')

    def switch_train_test(self):
        """Move part of the testing frame into the training frame.

        ``load_format_data.get_random_split`` divides the current testing
        frame in two: the first part is concatenated onto the training
        frame, the second becomes the new testing frame.
        """
        # Local import: this module does not import pandas at top level.
        import pandas as pd

        regular_training_df = self.training_df
        extra_training_df, self.testing_df = load_format_data.get_random_split(self.testing_df)
        # DataFrame.append was removed in pandas 2.0; concat is the
        # equivalent that works on old and new versions alike.
        self.training_df = pd.concat([regular_training_df, extra_training_df])

    def limit_test_set(self, assays):
        """Keep only testing rows that have a score for every assay in `assays`.

        Args:
            assays: iterable of assay identifiers; each maps to the column
                'Sort<assay>_mean_score'.
        """
        sort_names = ['Sort' + str(i) + '_mean_score' for i in assays]
        dataset = self.testing_df
        # Drop any row with a missing value in one of the selected columns.
        dataset = dataset[~dataset[sort_names].isna().any(axis=1)]
        self.testing_df = dataset

    def apply_predicted_assay_scores(self, seq_to_assay_model_prop):
        """Test this model on saved predicted assay scores.

        Args:
            seq_to_assay_model_prop: indexable whose first three items are
                joined with '_' to name the saved seq-to-assay model.
        """
        seq_to_assay_model_name = (
            'seq_assay' + self.assay_str
            + '_' + str(seq_to_assay_model_prop[0])
            + '_' + str(seq_to_assay_model_prop[1])
            + '_' + str(seq_to_assay_model_prop[2])
        )
        self.num_test_repeats = 1
        self.testing_df = load_format_data.load_df('predicted/seq_to_dot_test_data_' + seq_to_assay_model_name)
        self.figure_file = './figures/' + self.model_name + '_' + seq_to_assay_model_name + '.png'
        self.stats_file = './model_stats/' + self.model_name + '_' + seq_to_assay_model_name + '.pkl'
        self.test_model()
        # Plotting is disabled here (the self.plot() call was commented out).
class x_to_assay_model(model):
    """Base class for models whose output is one or more assay scores.

    Sets the output exploder (bound to `assays`), plot type, default
    training/testing frames and the cross-validation / hyper-parameter
    trial counts.  Subclasses provide ``get_input_seq``.
    """

    def __init__(self, model_in, assays, model_architecture, sample_fraction):
        assay_label = ','.join(map(str, assays))
        super().__init__(model_in, 'assay' + assay_label, model_architecture, sample_fraction)
        self.assays = assays
        self.get_output_and_explode = partial(load_format_data.explode_assays, assays)
        self.plot_type = plot_model.x_to_assay_plot
        # Could be adjusted in future for sequences with predictive assays.
        self.training_df = load_format_data.load_df('seq_to_assay_train_1,8,10')
        self.testing_df = load_format_data.load_df('assay_to_dot_training_data')
        self.lin_or_sig = 'sigmoid'
        self.num_cv_splits = 3
        self.num_cv_repeats = 3
        self.num_test_repeats = 10
        self.num_hyp_trials = 50

    def save_predictions(self):
        """Pickle predicted assay scores for the test frame, one file per model.

        The resulting files are meant to be consumed by an assay-to-yield model.
        """
        # Will need adjusting if data points are missing.
        frame = load_format_data.load_df('seq_to_dot_test_data')
        one_hot = np.eye(len(self.assays))
        x_a = self.get_input_seq(frame)
        for model_idx in range(3):  # for each model
            for assay_idx, assay in enumerate(self.assays):  # for each assay
                # Every sequence gets the one-hot categorical variable of this assay.
                cat_var = [one_hot[assay_idx].tolist() for _ in x_a]
                x = load_format_data.mix_with_cat_var(x_a, cat_var)
                # The network architecture has to be built before weights are loaded.
                hyperparam = self.get_best_trial()['hyperparam']
                cat_var_len = len(cat_var[0])
                self._model.set_model(
                    hyperparam,
                    xa_len=len(x[0]) - cat_var_len,
                    cat_var_len=cat_var_len,
                    lin_or_sig=self.lin_or_sig,
                )
                # Loads the pickled sklearn model or the weights of the nn model.
                self.load_model(model_idx)
                predicted = self._model.model.predict(x).squeeze().tolist()
                frame.loc[:, 'Sort' + str(assay) + '_mean_score'] = predicted
            out_path = ''.join([
                './datasets/predicted/seq_to_dot_test_data_',
                self.model_name,
                '_',
                str(model_idx),
                '.pkl',
            ])
            frame.to_pickle(out_path)

    def save_sequence_embeddings(self, df_list=None):
        """Pickle the learned sequence embedding for each named data frame.

        Args:
            df_list: names of frames to load; when falsy, a default list of
                three frames is used.
        """
        if not df_list:
            df_list = ['assay_to_dot_training_data', 'seq_to_dot_test_data', 'seq_to_assay_train_1,8,10']
        one_hot = np.eye(len(self.assays))
        for df_name in df_list:
            frame = load_format_data.load_df(df_name)
            x_a = self.get_input_seq(frame)
            for model_idx in range(10):  # for each model
                # Only one assay's categorical variable is needed to get the embedding.
                for assay_idx in range(1):
                    cat_var = [one_hot[assay_idx].tolist() for _ in x_a]
                    x = load_format_data.mix_with_cat_var(x_a, cat_var)
                    # The network architecture has to be built before weights are loaded.
                    hyperparam = self.get_best_trial()['hyperparam']
                    cat_var_len = len(cat_var[0])
                    self._model.set_model(
                        hyperparam,
                        xa_len=len(x[0]) - cat_var_len,
                        cat_var_len=cat_var_len,
                        lin_or_sig=self.lin_or_sig,
                    )
                    # Loads the pickled sklearn model or the weights of the nn model.
                    self.load_model(model_idx)
                    seq_embedding_model = self._model.get_seq_embeding_layer_model()
                    embeddings = seq_embedding_model.predict([x])
                # Wrap each embedding in a list so it fits in a single cell.
                seq_emb_list = [[embedding] for embedding in embeddings]
                frame.loc[:, 'learned_embedding'] = seq_emb_list
                out_path = ''.join([
                    './datasets/predicted/learned_embedding_',
                    df_name,
                    '_',
                    self.model_name,
                    '_',
                    str(model_idx),
                    '.pkl',
                ])
                frame.to_pickle(out_path)
class assay_to_yield_model(x_to_yield_model, assay_to_x_model):
    """Assay-to-yield model for the given assays.

    Per the original note, callers should limit the test set to the
    usable subset (see ``limit_test_set``).
    """

    def __init__(self, assays, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'assays' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        assay_to_x_model.__init__(self, assays)
class weighted_assay_to_yield_model(x_to_yield_model, assay_to_x_model):
    """Assay-to-yield model with count-based weighting of training data.

    Per the original note, the weight is average(log2(trials)); the
    weighting itself is done by ``load_format_data.weightbycounts``.
    """

    def __init__(self, assays, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'weighted_assays' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        assay_to_x_model.__init__(self, assays)
        # Flag plus the weighting function bound to this assay selection.
        self.weightbycounts = True
        self.weightbycountsfxn = partial(load_format_data.weightbycounts, assays)
class twogate_assay_to_yield_model(x_to_yield_model, assay_to_x_model):
    """Assay-to-yield model trained and tested on the 'twogate' frames.

    `stringency` is concatenated into the model name and into both frame
    names, so it must be a string.
    """

    def __init__(self, assays, stringency, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['twogate', stringency, '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        assay_to_x_model.__init__(self, assays)
        # Replace the default frames with the stringency-specific ones.
        train_name = 'assay_to_dot_training_data_twogate_' + stringency
        self.training_df = load_format_data.load_df(train_name)
        test_name = 'seq_to_dot_test_data_twogate_' + stringency
        self.testing_df = load_format_data.load_df(test_name)
class assay_count_to_yield_model(x_to_yield_model):
    """Assay-to-yield model whose input also includes observation counts."""

    def __init__(self, assays, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'assays_counts_' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_assays_and_counts, assays)
        self.get_input_seq = input_getter
class stassay_to_yield_model(x_to_yield_model):
    """Assay-to-yield model for the given assays and a single trial."""

    def __init__(self, assays, trial, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['st', str(trial), '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_stassays, assays, trial)
        self.get_input_seq = input_getter
class ttassay_to_yield_model(x_to_yield_model):
    """Assay-to-yield model for the given assays and two trials.

    Only ``trials[0]`` and ``trials[1]`` appear in the model name; the
    whole `trials` object is passed to ``load_format_data.get_ttassays``.
    """

    def __init__(self, assays, trials, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['tt', str(trials[0]), ',', str(trials[1]), '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_ttassays, assays, trials)
        self.get_input_seq = input_getter
class seq_to_yield_model(x_to_yield_model, seq_to_x_model):
    """Sequence-to-yield model; the input getter follows the architecture name."""

    def __init__(self, model_architecture, sample_fraction):
        model_in = 'seq'
        super().__init__(model_in, model_architecture, sample_fraction)
        seq_to_x_model.__init__(self, model_architecture)
class seqandassay_to_yield_model(x_to_yield_model):
    """Yield model whose input combines the sequence with assay scores."""

    def __init__(self,assays,model_architecture,sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'seq_and_assays' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_seq_and_assays, assays)
        self.get_input_seq = input_getter
class seqandtwogateassay_to_yield_model(x_to_yield_model):
    """Sequence-plus-assay yield model using the 'twogate' frames.

    `stringency` is concatenated into the model name and into both frame
    names, so it must be a string.
    """

    def __init__(self,assays, stringency, model_architecture,sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['seq_and_twogate', stringency, '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        self.get_input_seq = partial(load_format_data.get_seq_and_assays, assays)
        # Replace the default frames with the stringency-specific ones.
        train_name = 'assay_to_dot_training_data_twogate_' + stringency
        self.training_df = load_format_data.load_df(train_name)
        test_name = 'seq_to_dot_test_data_twogate_' + stringency
        self.testing_df = load_format_data.load_df(test_name)
class seqandweightedassay_to_yield_model(x_to_yield_model):
    """Sequence-plus-assay yield model with count-based training weights."""

    def __init__(self,assays,model_architecture,sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'weighted_seq_and_assays' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        self.get_input_seq = partial(load_format_data.get_seq_and_assays, assays)
        # Flag plus the weighting function bound to this assay selection.
        self.weightbycounts = True
        self.weightbycountsfxn = partial(load_format_data.weightbycounts, assays)
class seqandstassay_to_yield_model(x_to_yield_model):
    """Sequence-plus-assay yield model for the given assays and a single trial."""

    def __init__(self, assays, trial, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['seq_and_st', str(trial), '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_seq_and_stassays, assays, trial)
        self.get_input_seq = input_getter
class seqandttassay_to_yield_model(x_to_yield_model):
    """Sequence-plus-assay yield model for the given assays and two trials.

    Per the original note the two trials are averaged; only ``trials[0]``
    and ``trials[1]`` appear in the model name.
    """

    def __init__(self, assays, trials, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = ''.join(['seq_and_tt', str(trials[0]), ',', str(trials[1]), '_assays', self.assay_str])
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_seq_and_ttassays, assays, trials)
        self.get_input_seq = input_getter
class seqandassay_count_to_yield_model(x_to_yield_model):
    """Yield model whose input is sequence, assay scores and their counts."""

    def __init__(self,assays,model_architecture,sample_fraction):
        self.assay_str = ','.join(map(str, assays))
        model_in = 'seq_and_assays_counts_' + self.assay_str
        super().__init__(model_in, model_architecture, sample_fraction)
        input_getter = partial(load_format_data.get_seq_and_assays_and_counts, assays)
        self.get_input_seq = input_getter
class final_seq_to_yield_model(seq_to_yield_model):
    """Sequence-to-yield model with the train/test division redone.

    After the normal set-up the model name gets a 'final' prefix and
    ``switch_train_test`` re-splits the data for the final comparison.
    """

    def __init__(self,model_architecture,sample_fraction):
        super().__init__(model_architecture, sample_fraction)
        final_name = 'final' + self.model_name
        self.update_model_name(final_name)
        self.switch_train_test()
class seq_to_pred_yield_model(x_to_yield_model, seq_to_x_model):
    """Sequence-to-yield model trained on yields predicted from assay scores.

    Args (constructor):
        pred_yield_model_prop: indexable; item 0 is the assay list, items
            1-3 are joined into the predicted-yield model's name (item 1
            must already be a string).
        seq_to_pred_yield_prop: indexable; item 0 is this model's
            architecture and item 1 its sample fraction.
    """

    def __init__(self, pred_yield_model_prop, seq_to_pred_yield_prop):
        architecture = seq_to_pred_yield_prop[0]
        sample_fraction = seq_to_pred_yield_prop[1]
        super().__init__('seq', architecture, sample_fraction)
        seq_to_x_model.__init__(self, architecture)
        self.assay_str = ','.join(map(str, pred_yield_model_prop[0]))
        # Name of the seq-and-assay model whose predictions are used.
        pred_yield_model_name = ''.join([
            'seq_and_assays',
            self.assay_str,
            '_yield_',
            pred_yield_model_prop[1],
            '_',
            str(pred_yield_model_prop[2]),
            '_',
            str(pred_yield_model_prop[3]),
        ])
        self.update_model_name(self.model_name + ':' + pred_yield_model_name)
        # NOTE: the training frame is hard-coded; the earlier version derived
        # it from pred_yield_model_name
        # ('predicted/seq_to_assay_train_1,8,10_' + pred_yield_model_name).
        self.training_df = load_format_data.load_df('predicted/seq_to_assay_train_1,8,10_seq_and_assay_yield_forest_1_0')
        self.num_cv_splits = 3
        self.num_cv_repeats = 3
        self.num_test_repeats = 1
        self.num_hyp_trials = 50
class seq_to_assay_model(x_to_assay_model, seq_to_x_model):
    """Sequence-to-assay model for the given assays."""

    def __init__(self, assays, model_architecture, sample_fraction):
        model_in = 'seq'
        super().__init__(model_in, assays, model_architecture, sample_fraction)
        seq_to_x_model.__init__(self, model_architecture)
class control_to_assay_model(x_to_assay_model, control_to_x_model):
    """Control model for assay scores.

    Per the original note, it predicts assay scores from the average
    assay score of the training set.
    """

    def __init__(self, assays, model_architecture, sample_fraction):
        model_in = 'control'
        super().__init__(model_in, assays, model_architecture, sample_fraction)
        control_to_x_model.__init__(self)
class control_to_yield_model(x_to_yield_model, control_to_x_model):
    """Control model for yield, using the control input getter.

    NOTE(review): the original docstring was copied from the assay control
    and spoke of assay scores; presumably this predicts yield from a
    training-set average - confirm.
    """

    def __init__(self, model_architecture, sample_fraction):
        model_in = 'control'
        super().__init__(model_in, model_architecture, sample_fraction)
        control_to_x_model.__init__(self)
class sequence_embeding_to_yield_model(x_to_yield_model, sequence_embedding_to_x_model):
    """Yield model whose input is an embedding learned by a seq-to-assay model.

    Args (constructor):
        seq_to_assay_model_prop: indexable; item 0 is the assay list and
            items 1-3 are joined with '_' to name the seq-to-assay model.
        model_architecture, sample_fraction: forwarded to the yield base class.
    """

    def __init__(self, seq_to_assay_model_prop, model_architecture, sample_fraction):
        self.assay_str = ','.join(map(str, seq_to_assay_model_prop[0]))
        seq_to_assay_model_name = ''.join([
            'seq_assay',
            self.assay_str,
            '_',
            str(seq_to_assay_model_prop[1]),
            '_',
            str(seq_to_assay_model_prop[2]),
            '_',
            str(seq_to_assay_model_prop[3]),
        ])
        model_in = 'embedding_' + seq_to_assay_model_name
        super().__init__(model_in, model_architecture, sample_fraction)
        sequence_embedding_to_x_model.__init__(self)
        self.num_test_repeats = 1
        # NOTE(review): these names start with '/', unlike the other
        # 'predicted/...' loads in this file - confirm load_df tolerates it.
        train_name = '/predicted/learned_embedding_assay_to_dot_training_data_' + seq_to_assay_model_name
        self.training_df = load_format_data.load_df(train_name)
        test_name = '/predicted/learned_embedding_seq_to_dot_test_data_' + seq_to_assay_model_name
        self.testing_df = load_format_data.load_df(test_name)
class final_sequence_embeding_to_yield_model(sequence_embeding_to_yield_model):
    """Embedding-to-yield model with the train/test division redone.

    After the normal set-up the model name gets a 'final' prefix and
    ``switch_train_test`` re-splits the data.
    """

    def __init__(self, seq_to_assay_model_prop, model_architecture, sample_fraction):
        super().__init__(seq_to_assay_model_prop, model_architecture, sample_fraction)
        final_name = 'final' + self.model_name
        self.update_model_name(final_name)
        self.switch_train_test()