-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmain_DevRep_example.py
72 lines (52 loc) · 3.03 KB
/
main_DevRep_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
####
# Example script for predicting the recombinant yield of Gp2 paratope mutants in two strains of E.coli.
# Utilizing HT assays for training devrep, and using the devrep embedding to predict yield
# assays for training devrep are currently limited to 1,8,10 as those were the most predictive in the first paper
# Below is an example of training and testing and predicting the best performing architecture of DevRep
###
import submodels_module as modelbank
#define model parameters
#assays are numbered in order as found in SI table #
#model architectures for predicting yield are: ['ridge','fnn','emb_fnn_flat','emb_fnn_maxpool','emb_fnn_maxpool_linear','emb_rnn','small_emb_rnn','small_emb_atn_rnn','small_emb_rnn_linear',
# 'emb_cnn','small_emb_cnn','small_emb_atn_cnn','small_emb_cnn_linear']
devrep_mdl_param={'assays':[1,8,10], 'model_architecture':'emb_cnn', 'sample_fraction':1}
#initialize model based upon model parameters
mdl=modelbank.seq_to_assay_model(**devrep_mdl_param)
#cross-validate model
mdl.cross_validate_model()
#test the model on the limited test set
mdl.test_model()
#return the results from cv and testing
print(mdl.model_stats)
#plot the predicted results
#figure is saved in ./figures/
mdl.plot()
#save the learned embeddings
#saved as pickled files in ./datasets/predicted/learned_embeddings_*
#saves 10 different embeddings from the 10 created models with the same hyperparameters
mdl.save_sequence_embeddings()
#load devrep embedding to yield model
#each embedding from the different models of the same architecture trains the top model independently
#model architectures for predicting yield are 'ridge','forest','svm','fnn'
top_mdl_param={'model_architecture':'svm', 'sample_fraction':1}
devrep_param_list=[devrep_mdl_param['assays'],devrep_mdl_param['model_architecture'],devrep_mdl_param['sample_fraction']]
for i in range(10):
#initialize embedding to yield model
emb_mdl=modelbank.sequence_embeding_to_yield_model(devrep_param_list+[i],**top_mdl_param)
#cross validate
emb_mdl.cross_validate_model()
#limit test set to compare to paper one results
emb_mdl.limit_test_set([1,8,10])
#test model
emb_mdl.test_model()
#return the results from cv and testing
print(emb_mdl.model_stats)
if i==0:
#plot the predicted results for a single embedding
#figure is saved in ./figures/
emb_mdl.plot()
# save predictions of the same set predicted in the first paper
df_str='learned_embedding_seq_to_assay_train_1,8,10_seq_assay'+','.join([str(x) for x in devrep_mdl_param['assays']])+'_'+devrep_mdl_param['model_architecture']+'_'+str(devrep_mdl_param['sample_fraction'])+'_'+str(i)
#final save file for example is ./datasets/predicted/learned_embedding_seq_to_assay_train_1,8,10_seq_assay1,8,10_emb_cnn_1_0_embedding_seq_assay1,8,10_emb_cnn_1_0_yield_svm_1_0.pkl
#result is a pickeled pandas DataFrame with yields predicted under IQ_Average_bc and SH_Average_bc for the two strains
emb_mdl.save_predictions(input_df_description=df_str)