# using Plots
using JLD2
using Printf
using LinearAlgebra
using Statistics    # provides mean and std used in normalize_inputs!
using SparseArrays  # provides spzeros used when hp.sparse is set
function normalize_inputs!(inputs, norm_mode="none")
if lowercase(norm_mode) == "standard"
# normalize training data
x_mu = mean(inputs, dims=2)
x_std = std(inputs, dims=2)
inputs[:] = (inputs .- x_mu) ./ (x_std .+ 1e-08)
norm_factors = (x_mu, x_std) # tuple of Array{Float64,2}
elseif lowercase(norm_mode) == "minmax"
# normalize training data
x_max = maximum(inputs, dims=2)
x_min = minimum(inputs, dims=2)
inputs[:] = (inputs .- x_min) ./ (x_max .- x_min .+ 1e-08)
norm_factors = (x_min, x_max) # tuple of Array{Float64,2}
    else  # handles "", "none", or any other unrecognized string
        norm_factors = ([0.0], [1.0])
    end

    # to translate to unnormalized regression coefficients: m = mhat / x_std, b = bhat - m * x_mu
    # to rescale to a new range [min', max'], precalculate a and b and apply newvalue = a * value + b,
    #   where a = (max' - min') / (max - min) and b = max' - a * max
    #   (x .- minimum(x)) ./ (maximum(x) .- minimum(x))            # values from 0 to 1
    #   2 .* (x .- minimum(x)) ./ (maximum(x) .- minimum(x)) .- 1  # values from -1 to 1
return norm_factors
end
# apply previously used training normalization to a validation or test data set
function normalize_inputs!(inputs, norm_factors, norm_mode)
if norm_mode == "standard"
x_mu = norm_factors[1]
x_std = norm_factors[2]
inputs[:] = (inputs .- x_mu) ./ (x_std .+ 1e-08)
elseif norm_mode == "minmax"
x_min = norm_factors[1]
x_max = norm_factors[2]
inputs[:] = (inputs .- x_min) ./ (x_max .- x_min .+ 1e-08)
else
error("Input norm_mode = $norm_mode must be standard or minmax")
end
end
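# Illustrative usage sketch for the two methods above (not called in this file;
# the data here is made up): fit the normalization factors on training inputs,
# then reuse them on a test set.
#   train_x = rand(3, 100)                            # 3 features x 100 examples
#   factors = normalize_inputs!(train_x, "standard")  # train_x now ~zero mean, unit std per feature
#   test_x = rand(3, 20)
#   normalize_inputs!(test_x, factors, "standard")    # apply the *training* mean and std to test data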
####################################################################
# functions to pre-allocate data updated during training loop
####################################################################
# use for test and training data
function preallocate_data!(dat, nnw, n, hp)
    # feedforward
    dat.a = [dat.inputs]    # allocates only tiny memory--it's a reference
    dat.z = [dat.inputs]    # z[1] is never used for the input layer  TODO: this placeholder permeates the code but isn't needed
    if hp.sparse
        for i = 2:nnw.output_layer
            # note: spzeros takes only (m, n); the expected-density hint in the original call is not a valid argument
            push!(dat.z, spzeros(nnw.ks[i], n))
            push!(dat.a, spzeros(nnw.ks[i], n))     # hidden layers and up... output layer set after loop
        end
    else
        for i = 2:nnw.output_layer
            push!(dat.z, zeros(nnw.ks[i], n))
            push!(dat.a, zeros(nnw.ks[i], n))       # hidden layers and up... output layer set after loop
        end
    end
# training / backprop -- pre-allocate only minibatch size (except last one, which could be smaller)
# this doesn't work for test set when not using minibatches (minibatch size on training then > entire test set)
# if istrain # e.g., only for training->no backprop data structures needed for test data
# if hp.dobatch # TODO fix this HACK
# dat.epsilon = [i[:,1:hp.mb_size_in] for i in dat.a]
# dat.grad = [i[:,1:hp.mb_size_in] for i in dat.a]
# dat.delta_z = [i[:,1:hp.mb_size_in] for i in dat.a]
# else # this should pick up sparsity
# dat.epsilon = [i for i in dat.a]
# dat.grad = [i for i in dat.a]
# dat.delta_z = [i for i in dat.a]
# end
# end
    dat.epsilon = []
    dat.grad = []
    # dat.delta_z = []
    if hp.sparse
        for i = 1:nnw.output_layer
            push!(dat.epsilon, spzeros(nnw.ks[i], n))       # spzeros takes only (m, n)
            push!(dat.grad, spzeros(nnw.ks[i], n))          # hidden layers and up... output layer set after loop
            # push!(dat.delta_z, spzeros(nnw.ks[i], n))     # commented out to match the dense branch: dat.delta_z is not allocated above
        end
    else
        for i = 1:nnw.output_layer
            push!(dat.epsilon, zeros(nnw.ks[i], n))
            push!(dat.grad, zeros(nnw.ks[i], n))            # hidden layers and up... output layer set after loop
            # push!(dat.delta_z, zeros(nnw.ks[i], n))       # hidden layers and up... output layer set after loop
        end
    end
if hp.do_batch_norm # required for full pass performance stats TODO: really? or only for batch_norm
# feedforward
dat.z_norm = deepcopy(dat.z)
# backprop
# dat.delta_z_norm = deepcopy(dat.z)
# preallocate_bn_params!(bn, mb, nnw.ks)
end
end
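# Minimal standalone sketch of the shape convention used above (hypothetical
# sizes; the real dat, nnw, and hp structs are defined elsewhere in the package):
#   ks = [784, 64, 10]            # units per layer: input, hidden, output
#   n  = 32                       # number of examples
#   a  = Any[rand(ks[1], n)]      # layer 1 holds (a reference to) the inputs
#   for i in 2:3
#       push!(a, zeros(ks[i], n)) # every later layer is units x examples
#   end
#   size(a[3]) == (10, 32)        # output-layer activations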
# method for batch views--currently the only method used
function preallocate_minibatch!(mb::Batch_view, nnw, hp)
# feedforward: minibatch views update the underlying data
# TODO put @inbounds back after testing
n_layers = nnw.output_layer
# we don't need all of these depending on minibatches and batchnorm, but it's very little memory
mb.a = Array{SubArray{}}(undef, n_layers)
mb.targets = view([0.0],1:1)
mb.z = Array{SubArray{}}(undef, n_layers)
mb.z_norm = Array{SubArray{}}(undef, n_layers)
# mb.delta_z_norm = Array{SubArray{}}(undef, n_layers)
# mb.delta_z = Array{SubArray{}}(undef, n_layers)
mb.grad = Array{SubArray{}}(undef, n_layers)
mb.epsilon = Array{SubArray{}}(undef, n_layers)
end
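# Why views: a SubArray made with view() shares memory with its parent, so
# minibatch slices write through to the full-epoch buffers allocated in
# preallocate_data!. Tiny standalone illustration (hypothetical names):
#   A = zeros(3, 10)
#   mb_cols = view(A, :, 1:4)   # no copy
#   mb_cols .= 1.0
#   sum(A) == 12.0              # the parent array saw the writes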
####################################################################
# functions to pre-allocate trained parameters
####################################################################
function preallocate_wgts!(nnw, hp, in_k, n, out_k)
# initialize and pre-allocate data structures to hold neural net training data
# theta = weight matrices for all calculated layers (e.g., not the input layer)
# bias = bias term used for every layer but input
# in_k = no. of features in input layer
# n = number of examples in input layer (and throughout the network)
# out_k = number of features in the targets--the output layer
# theta dimensions for each layer of the neural network
# Follows the convention that rows = outputs of the current layer activation
# and columns are the inputs from the layer below
# layers
nnw.output_layer = 2 + size(hp.hidden, 1) # input layer is 1, output layer is highest value
nnw.ks = [in_k, map(x -> x[2], hp.hidden)..., out_k] # no. of output units by layer
# set dimensions of the linear Wgts for each layer
push!(nnw.theta_dims, (in_k, 1)) # weight dimensions for the input layer -- if using array, must splat as arg
for l = 2:nnw.output_layer
push!(nnw.theta_dims, (nnw.ks[l], nnw.ks[l-1]))
end
# initialize the linear Wgts
nnw.theta = [zeros(2,2)] # layer 1 not used
# Xavier initialization--current best practice for relu
if hp.initializer == "xavier"
xavier_initialize!(nnw, hp.scale_init)
elseif hp.initializer == "uniform"
        uniform_initialize!(nnw, hp.scale_init)
elseif hp.initializer == "normal"
normal_initialize!(nnw, hp.scale_init)
else # using zeros generally produces poor results
for l = 2:nnw.output_layer
            push!(nnw.theta, zeros(nnw.theta_dims[l]))  # zero-initialized weights
end
end
# bias initialization: small positive values can improve convergence
nnw.bias = [zeros(2)] # this is layer 1: never used. placeholder to make layer indices consistent
if hp.bias_initializer == 0.0
bias_zeros(nnw.ks, nnw)
elseif hp.bias_initializer == 1.0
bias_ones(nnw.ks, nnw)
elseif 0.0 < hp.bias_initializer < 1.0
bias_val(hp.bias_initializer, nnw.ks, nnw)
    elseif hp.bias_initializer == 99.9
bias_rand(nnw.ks, nnw)
else
bias_zeros(nnw.ks, nnw)
end
# structure of gradient matches theta
nnw.delta_th = deepcopy(nnw.theta)
nnw.delta_b = deepcopy(nnw.bias)
# initialize gradient, 2nd order gradient for Momentum or Adam or rmsprop
if hp.opt == "momentum" || hp.opt == "adam" || hp.opt == "rmsprop"
nnw.delta_v_th = [zeros(size(a)) for a in nnw.delta_th]
nnw.delta_v_b = [zeros(size(a)) for a in nnw.delta_b]
end
if hp.opt == "adam"
nnw.delta_s_th = [zeros(size(a)) for a in nnw.delta_th]
nnw.delta_s_b = [zeros(size(a)) for a in nnw.delta_b]
end
# dropout
if hp.dropout
        nnw.dropout_mask = [trues(k) for k in nnw.ks]   # start with all units kept
end
end
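# Illustrative check of the weight-dimension convention (rows = units of the
# current layer, columns = units of the layer below), using hypothetical sizes
# and assuming hp.hidden entries look like (unit_name, size):
#   in_k, out_k = 784, 10
#   hidden = [("relu", 64)]
#   ks = [in_k, map(x -> x[2], hidden)..., out_k]       # [784, 64, 10]
#   theta_dims = [(in_k, 1)]
#   for l in 2:3
#       push!(theta_dims, (ks[l], ks[l-1]))
#   end
#   theta_dims == [(784, 1), (64, 784), (10, 64)]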
function xavier_initialize!(nnw, scale=2.0)
for l = 2:nnw.output_layer
push!(nnw.theta, randn(nnw.theta_dims[l]...) .* sqrt(scale/nnw.theta_dims[l][2])) # sqrt of no. of input units
end
end
function uniform_initialize!(nnw, scale=0.15)
    for l = 2:nnw.output_layer
        push!(nnw.theta, (rand(nnw.theta_dims[l]...) .- 0.5) .* (scale / 0.5))  # uniform values in [-scale, scale]
    end
end
function normal_initialize!(nnw, scale=0.15)
    for l = 2:nnw.output_layer
        push!(nnw.theta, randn(nnw.theta_dims[l]...) .* scale)  # normal values with std dev = scale
    end
end
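# Quick comparison of the three scalings for one hypothetical layer with
# 64 inputs and 10 outputs (illustration only):
#   fan_in = 64
#   w_xavier  = randn(10, fan_in) .* sqrt(2.0 / fan_in)     # variance tied to no. of input units (suits relu)
#   w_uniform = (rand(10, fan_in) .- 0.5) .* (0.15 / 0.5)   # uniform in [-0.15, 0.15]
#   w_normal  = randn(10, fan_in) .* 0.15                   # normal with std dev 0.15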
function bias_zeros(ks, nnw)
for l = 2:nnw.output_layer
push!(nnw.bias, zeros(ks[l]))
end
end
function bias_ones(ks, nnw)
for l = 2:nnw.output_layer
push!(nnw.bias, ones(ks[l]))
end
end
function bias_val(val, ks, nnw)
for l = 2:nnw.output_layer
push!(nnw.bias, fill(val, ks[l]))
end
end
function bias_rand(ks, nnw)
for l = 2:nnw.output_layer
push!(nnw.bias, rand(ks[l]) .* 0.1)
end
end
function preallocate_bn_params!(bn, mb, k)
# initialize batch normalization parameters gamma and beta
# vector at each layer corresponding to no. of inputs from preceding layer, roughly "features"
# gamma = scaling factor for normalization standard deviation
# beta = bias, or new mean instead of zero
    # batch normalization is most useful for relu units; it can also be used with other unit functions
    # note: the fields are named gam and bet to avoid clashing with gamma and beta functions exported by other packages
    bn.gam = [ones(i) for i in k]
    bn.bet = [zeros(i) for i in k]
bn.delta_gam = [zeros(i) for i in k]
bn.delta_bet = [zeros(i) for i in k]
bn.delta_v_gam = [zeros(i) for i in k]
bn.delta_s_gam = [zeros(i) for i in k]
bn.delta_v_bet = [zeros(i) for i in k]
bn.delta_s_bet = [zeros(i) for i in k]
bn.mu = [zeros(i) for i in k] # same size as bias = no. of layer units
bn.mu_run = [zeros(i) for i in k]
bn.stddev = [zeros(i) for i in k]
bn.std_run = [zeros(i) for i in k]
end
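# Sketch of how gam and bet enter the batch-norm forward pass (standalone
# illustration with made-up values; the actual pass lives elsewhere in the package):
#   z = randn(5, 32)                        # one layer's pre-activations, units x examples
#   mu = mean(z, dims=2); sd = std(z, dims=2)
#   z_norm = (z .- mu) ./ (sd .+ 1e-8)      # zero mean, unit variance per unit
#   gam = ones(5); bet = zeros(5)
#   z_out = gam .* z_norm .+ bet            # learned rescale and shift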
"""
Function setup_stats(hp, dotest::Bool)
Creates data structure to hold everything needed to plot progress of
neural net training by iteration.
Training statistics are tracked in a dict containing:
"track"=>Dict of bools to select each type of results to be collected.
Currently used are: "train", "test", "learning", "cost". This determines what
data will be collected during training iterations and what data series will be
plotted.
"labels"=>array of strings provides the labels to be used in the
plot legend.
"cost"=>array of calculated cost at each iteration
with iterations as rows and data types ("train", "test") as columns.
"accuracy"=>array of percentage of correct classification
at each iteration with iterations as rows and result types as columns ("Training", "Test").
This plots a so-called learning curve. Very interesting indeed.
"col_train"=>col_train: column of the arrays above to be used for Training results
"col_test"=>col_test: column of the arrays above to be used for Test results
"period"=>single string of "epoch" or "batch" chooses interval of data, or "" or "none" for none
"""
function setup_stats(hp, dotest::Bool)
# set up cost_history to track 1 or 2 data series for plots
# lots of indirection here: someday might add "validation"
    if size(hp.stats, 1) > 5
        @warn("Only 4 stats plus a single period keyword are permitted. Proceeding with the valid entries.")
    end
    valid_stats = ["train", "test", "learning", "cost", "epoch", "batch"]
    if in(hp.stats, ["None", "none", ""])
        track = Dict(item => false for item in valid_stats)  # set all to false
    else
        track = Dict(item => in(item, hp.stats) for item in valid_stats)
    end
# determine whether to plot per batch or per epoch
period = ""
if in(hp.stats, ["None", "none", ""]) || isempty(hp.stats)
period = ""
elseif in("epoch", hp.stats) # this is the default and overrides choosing both, which isn't supported
period = "epoch"
elseif in("batch", hp.stats) && hp.dobatch
period = "batch"
end
pointcnt = if period == "epoch"
hp.epochs
elseif period == "batch"
hp.n_mb * hp.epochs
else
0
end
# must have test data to plot test results
if !dotest # no test data
if track["test"] # input requested plotting test data results
@warn("Can't plot test data. No test data. Proceeding.")
track["test"] = false
end
end
# set column in cost_history for each data series
col_train = track["train"] ? 1 : 0
col_test = track["test"] ? col_train + 1 : 0
no_of_cols = max(col_train, col_test)
    labels = if col_train == 1 && col_test == 2
        ("Train", "Test")
    elseif col_train == 0 && col_test == 1
        ("Test",)   # trailing comma needed: without it, the parentheses just yield the string, not a 1-tuple
    elseif col_train == 1 && col_test == 0
        ("Train",)  # trailing comma needed: without it, the parentheses just yield the string, not a 1-tuple
    else
        ()
    end
# labels = reshape(labels,1,size(labels,1)) # 1 x N row array required by pyplot
# create all keys and values for dict stats
stats = Dict("track"=>track, "labels"=>labels)
if track["cost"]
stats["cost"] = zeros(pointcnt, no_of_cols) # cost history initialized to 0's
end
if track["learning"]
stats["accuracy"] = zeros(pointcnt, no_of_cols)
end
stats["col_train"] = col_train
stats["col_test"] = col_test
stats["period"] = period
return stats
end
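# Illustrative result (hypothetical hyperparameters): with
# hp.stats = ["train", "test", "cost", "learning", "epoch"], test data present,
# and hp.epochs = 50, setup_stats would return roughly:
#   stats["track"]       # "train", "test", "learning", "cost" => true
#   stats["labels"]      # ("Train", "Test")
#   stats["cost"]        # 50 x 2 zeros, filled in during training
#   stats["accuracy"]    # 50 x 2 zeros
#   stats["col_train"] == 1;  stats["col_test"] == 2;  stats["period"] == "epoch"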