config.py
import os
from datetime import datetime
import shutil
import sys
from pytz import timezone
# cell line folder: derived from the first command-line argument, which is either
# a data directory (e.g. dataset/<cellline>) or a previous run directory (.../run_*)
if len(sys.argv) >= 2 and not sys.argv[1].split("/")[-1].startswith("run"):
    cellline = sys.argv[1].split("/")[-1]
elif len(sys.argv) >= 2:
    cellline = sys.argv[1].split("/")[-2]
else:
    cellline = "test"
# data-specific suffix; for example, if the file name is K562_CN, then data_specific is "_CN"
data_specific = "_CN_centrality" if cellline != "test" else ""
# split method: either stratified or regular
split_method = 'regular'
# model types: rnn, cnn or mixed
model_type = "rnn"
seq_cnn = False
# descriptive part of the output directory name: run_ + deco_part + timestamp
deco_part = "_".join([cellline, data_specific, split_method[:3], model_type[:3]])
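# e.g. "K562" + "_CN_centrality" -> deco_part = "K562__CN_centrality_reg_rnn"
# (note the double underscore, since data_specific already starts with "_")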
# sequence length
seq_len = 20
# epochs
n_epochs = 20
patience = 100
decay_steps = 100
min_lr = 0.001
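# assumption based on the names: patience, decay_steps, and min_lr presumably
# feed an early-stopping / learning-rate-decay schedule during training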
# model hyperparameters
start_lr = 0.01
dropout = 0.4
LSTM_hidden_unit = 32
batch_size = 32
embedding_voca_size = 65
embedding_vec_dim = 8
activation_method = ["relu", "sigmoid"]
fully_connected_layer_layout = [64] * 2 + [32] * 2
cnn_levels = [8, 16, 32, 64, 128]
bio_fully_connected_layer_layout = [32, 64]
lr_decay = 0.0005
maxnorm = 20
with_pam = False
pam_dim = 4
LSTM_stacks_num = 4
rnn_time_distributed_dim = 32
word_len = 3
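# with word_len = 3 there are 4**3 = 64 possible 3-mers, so embedding_voca_size = 65
# presumably reserves one extra index (e.g. 0 for padding); this is an assumption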
test_cellline = None
# can be None, "group", or "cellline"
group_col = None
# whether to initialize the embedding weight matrix from pretrained word2vec (False = cold start)
word2vec_weight_matrix = False
# features to MIN-MAX scale
scale_feature = []  # + ["r_d_tm"] + ['aa_cut'] + ['essentiality'] + ['log_CN']
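# a minimal sketch (assumption about the downstream pipeline) of how the columns
# in scale_feature are presumably rescaled, using scikit-learn; `df` is a
# hypothetical pandas DataFrame holding the input dataset:
#
#   from sklearn.preprocessing import MinMaxScaler
#   if scale_feature:
#       df[scale_feature] = MinMaxScaler().fit_transform(df[scale_feature])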
# RNN input features; each unit input is a 200-element vector
seq2vec_features = ["sequence"]
# extra features
extra_numerical_features = [] + ["r_d_tm"] + ['aa_cut']  # + ['essentiality'] + ['log_CN']
extra_categorical_features = [] #+ ["DNase"] + ["CTCF", "RRBS", "H3K4me3"]
if with_pam:
extra_categorical_features = ["PAM"] + extra_categorical_features
# output
y = ["log2fc"]
# column(s) holding gene-name info, or False to disable grouping
group = False
# current directory
cur_dir = os.getcwd()
# Whether to check feature importance using the deep learning model
check_feature_importance = True
# Whether to do RF training
ml_train = False
# Whether or not to train the model again
training = True
test_method = 'regular'
# parameters when retraining
transfer_learning = False
fine_tune_trainable = False
retraining_datasize = 1.0/1 if transfer_learning else 1
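# 1.0/1 keeps all retraining data; presumably the numerator can be lowered
# (e.g. 0.5) to retrain on only a fraction of the dataset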
frozen_embedding_only = True
fully_connected_train_fraction = 1.0/1
retraining_model_path = os.path.join(cur_dir, "dataset/test/run_test__lea_cnn1810111631")
transfer_learning_model = os.path.join(retraining_model_path, "lstm_model.h5")
retraining_dataset = ""
# run-specific naming
rerun_name = sys.argv[1].split("/")[-1] if len(sys.argv) >= 2 and sys.argv[1].split("/")[-1].startswith("run") else None
unique_name = rerun_name if rerun_name else "run_" + deco_part + datetime.now(timezone('US/Eastern')).strftime("%y%m%d%H%M")
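# e.g. unique_name = "run_test__reg_rnn1810111631" for the default test cell line
# (timestamp format %y%m%d%H%M in US/Eastern)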
data_dir = os.path.join(cur_dir, "dataset", cellline)
run_dir = os.path.join(data_dir, unique_name)
# pretrained seq2vec 3-mer vector representations
seq2vec_mapping = os.path.join(cur_dir, "dataset", "cds_vector")
mismatch_matrix = os.path.join(cur_dir, "dataset", "off_target_matrix.csv")
pam_matrix = os.path.join(cur_dir, "dataset", "Pam_score.csv")
input_dataset = os.path.join(data_dir, "{!s}{!s}.csv".format(cellline, data_specific))
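# e.g. dataset/K562/K562_CN_centrality.csv when cellline = "K562"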
if transfer_learning and retraining_dataset != "":
    input_dataset = retraining_dataset
# training and test index
train_index = os.path.join(data_dir, "train_index" + data_specific)
test_index = os.path.join(data_dir, "test_index" + data_specific)
if transfer_learning:
    train_index += "_retrain"
    test_index += "_retrain"
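# e.g. train_index_CN_centrality_retrain / test_index_CN_centrality_retrain
# under the cell line's data directory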
# if executing with a new dataset: use the config.py in the current working directory,
# create a new directory for this run, create __init__.py files so the data and run
# directories are importable as packages, then copy the current config into the new
# run directory; otherwise (rerun), reuse the existing config file in run_dir
run_specific_config = os.path.join(run_dir, "config.py")
cur_dir_config = os.path.join(cur_dir, "config.py")
if not os.path.exists(run_dir):
    os.makedirs(run_dir)
    open(os.path.join(data_dir, "__init__.py"), 'w+').close()
    open(os.path.join(run_dir, "__init__.py"), 'w+').close()
    shutil.copyfile(cur_dir_config, run_specific_config)
# Path to save the test y values and predictions (used for the prediction plot)
test_prediction = os.path.join(run_dir, "test_prediction.csv")
# Path to save the pickled training history
training_history = os.path.join(run_dir, "training_history.pi")
# Path to save the trained model in HDF5 format
hdf5_path = os.path.join(run_dir, "lstm_model.h5")
# machine learning (RF) model saving path
ml_model_path = os.path.join(run_dir, "ml_model")
feature_importance_path = os.path.join(run_dir, 'features.csv')
# path to temporarily save models during training
temp_hdf5_path = os.path.join(run_dir, "temp_model.h5")
# path of the run-specific logfile
run_specific_log = os.path.join(run_dir, "logfile")
# logfile recording the training process
training_log = os.path.join(run_dir, "training_log")
# log directory for TensorBoard
tb_log = os.path.join(run_dir, "tb_log")
# config attributes to report for each run
output_attrs = ["split_method", "model_type", "dropout", "scale_feature", "seq2vec_features", "extra_numerical_features",
"extra_categorical_features", "cellline", "word2vec_weight_matrix", "input_dataset"]