# testgrad.jl
# hp = setup_params("nninputs.toml")
# train_x, train_y = extract_data("digits5000by400.mat")
# shuffle_data!(train_x, train_y)
# train, mb, nnw, bn = pretrain(train_x, train_y, hp)
# stats = setup_stats(hp, dotest)
# training_loop to run some epochs--start with minibatch, optimization, etc.
# then check grads in one epoch without that stuff
# method for using the full model and training data to check grads
function prep_check(hp, train_x, train_y, iters; samplepct=.005, quiet=false, tweak=1e-6)
# prepare the model (sub-steps that are part of train() )
dotest = false
train, mb, nnw, bn, model = pretrain(train_x, train_y, hp)
# advance the training loop with model hyper-parameters
@printf(" ****** Advancing model %d iterations\n", iters)
hp.epochs = iters # set iters to train before grad testing
stats = setup_stats(hp, dotest)
training_time = training_loop!(hp, train, mb, nnw, bn, stats, model)
output_stats(train, nnw, hp, bn, training_time, stats, model)
# set new hyper-parameters for gradient checking
if hp.do_batch_norm
minihp = hp # leave hyper-parameters as is
else
minihp = set_hp(hp)
setup_functions!(model, minihp, train.out_k)
# @bp
end
return minihp, nnw, train, bn, model
end
# method to use full model with real training data
function check_grads(hp, train_x, train_y, iters; samplepct=.005, quiet=false, tweak=1e-6, post_iters=0,
biasidx = [], thetaidx = [])
println(" ****** Testing gradient calculation on input training dataset")
println(" ****** Setting up model and data")
# @bp
hp, nnw, train, bn, model = prep_check(hp, train_x, train_y, iters; samplepct=samplepct, quiet=quiet, tweak=tweak)
run_check_grads(hp, nnw, train, model; bn=bn, samplepct=samplepct, tweak=tweak, quiet=quiet, iters=post_iters,
biasgradidx = biasidx, thetagradidx = thetaidx)
end
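# Usage sketch (assumes the parameter file and data file named in the comments at the top of
# this file; the iteration count and keyword values are arbitrary illustrations):
#   hp = setup_params("nninputs.toml")
#   train_x, train_y = extract_data("digits5000by400.mat")
#   check_grads(hp, train_x, train_y, 50; samplepct=0.005, tweak=1e-6)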
# method to use mini-model with same number/type of hidden layers as designed model
function prep_check(hp; in_k=3, out_k=3, m=5, hid=5, iters=30, samplepct=1.0, tweak=1e-4, quiet=false)
# set new hyper-parameters for gradient checking
minihp = set_hp(hp)
# create minidata and mini-model
# @bp
minidat, minimb, miniwgts, minibn, ministats, minimodel = use_minidata(minihp;
in_k=in_k, out_k=out_k, m=m, hid=hid)
# advance some iterations using the analytic model
@printf("\n ****** Advancing model %d iterations\n", iters)
minihp.epochs = iters
training_time = training_loop!(minihp, minidat, minimb, miniwgts, minibn, ministats, minimodel)
# TODO do we want this? minihp and setup_stats are currently set up so that no training stats get collected
!quiet && output_stats(minidat, miniwgts, minihp, minibn, training_time, ministats, minimodel)
return minihp, miniwgts, minidat, minimodel
end
# method to use mini-model with same number/type of hidden layers as designed model
function check_grads(hp; in_k=3, out_k=3, m=5, hid=5, iters=30, samplepct=1.0, tweak=1e-4, quiet=false,
post_iters=0, biasidx = [], thetaidx = [])
println(" ****** Testing gradient calculation on minitiature dataset")
println(" ****** Setting up model and test data")
# @bp
minihp, miniwgts, minidat, minimodel = prep_check(hp; in_k=in_k, out_k=out_k, m=m, hid=hid, iters=iters,
samplepct=samplepct, tweak=tweak, quiet=false)
# @bp
run_check_grads(minihp, miniwgts, minidat, minimodel; samplepct=samplepct, tweak=tweak, quiet=quiet)
end
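# Usage sketch for the mini-model method (hp comes from setup_params as above; the keyword
# values shown are just this method's defaults):
#   hp = setup_params("nninputs.toml")
#   check_grads(hp; in_k=3, out_k=3, m=5, hid=5, iters=30, samplepct=1.0, tweak=1e-4)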
# eps_wgt = (lr, idx, typ) where lr is the layer number (hidden or output),
# idx is the linear index into theta or bias for layer lr,
# and typ must be "theta" or "bias"
function check_one(hp, inwgts, indat, model; tweak=1e-6, eps_wgt=(2,3,"bias"), example=10, bn=Batch_norm_params())
wgts = deepcopy(inwgts)
dat = deepcopy(indat)
selr = eps_wgt[1]
idx = eps_wgt[2]
typ = eps_wgt[3]
n_layers = wgts.output_layer
deltype = if typ == "theta"
"delta_th"
elseif typ == "bias"
"delta_b"
else
error("Third element of eps_wgt must be either theta or bias, was $typ")
end
# create model data holding only one example. Need z, a, and targets only.
onedat = Batch_view() # holder
preallocate_minibatch!(onedat::Batch_view, wgts, hp) # create array layers that match the model
update_batch_views!(onedat, dat, wgts, hp, example:example) # slice out column example
# 1. get the cost for a single example at current weights
feedfwd!(onedat, wgts, hp, bn, model.ff_execstack, dotrain=false)
cost = model.cost_function(onedat.targets[:,1], onedat.a[wgts.output_layer][:,1], 1, wgts.theta, hp.lambda, hp.reg,
wgts.output_layer)
# 2. compute the gradient of the weight we will perturb by tweak (not yet...)
backprop!(wgts, dat, hp) # for all layers
gradone = getproperty(wgts, Symbol(deltype))[selr][idx] # delta_th or delta_b
# 3. calculate the loss for the perturbed weight
w = getproperty(wgts, Symbol(typ))[selr][idx] # bias or theta
w_plus = w + tweak
getproperty(wgts, Symbol(typ))[selr][idx] = w_plus # set the chosen weight to tweaked value
feedfwd!(onedat, wgts, hp, bn, model.ff_execstack, dotrain=false)
# cost_plus = model.cost_function(onedat.targets[:,1], onedat.a[wgts.output_layer][:,1], 1, wgts.theta, hp.lambda, hp.reg,
# wgts.output_layer)
costnew = model.cost_function(onedat.targets[:,1], onedat.a[wgts.output_layer][:,1], 1, wgts.theta, hp.lambda, hp.reg,
wgts.output_layer)
# w_minus = w - tweak
# getproperty(wgts, Symbol(typ))[selr][idx] = w_minus # set the chosen weight to tweaked value
# feedfwd_predict!(onedat, wgts, hp)
# cost_minus = model.cost_function(onedat.targets[:,1], onedat.a[wgts.output_layer][:,1], 1, wgts.theta, hp.lambda, hp.reg,
# wgts.output_layer)
# costnew = (cost_plus + cost_minus) / (2.0) # centered cost difference, can also use single-sided
# 4. compare to the test value: the change in cost should approximately equal the gradient scaled
# by the tweak, a first-order approximation that becomes exact in the limit as tweak approaches zero
costcheck = cost + (tweak * gradone)
# println("Original Cost: ", cost, " tweaked cost: ", costnew)
# println("Original cost + delta: ",costcheck)
# println("Diff tweaked cost - (orig. cost + delta) ", costnew - costcheck)
return cost, costnew, gradone
end
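# Usage sketch for check_one: perturb the 3rd bias of layer 2, score a single example, and
# compare the tweaked cost to the first-order estimate cost + tweak * gradone. Assumes hp,
# nnw (weights), train (data), and model came from prep_check above.
#   cost, costnew, gradone = check_one(hp, nnw, train, model; tweak=1e-6, eps_wgt=(2, 3, "bias"), example=10)
#   println("tweaked cost: ", costnew, "  first-order estimate: ", cost + 1e-6 * gradone)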
function run_check_grads(hp, wgts, dat, model; bn=Batch_norm_params(), samplepct=.015, tweak=1e-6, quiet=false, iters=0,
biasgradidx = [], thetagradidx = []) # last 2 args are the indices of the bias and theta we sample
println("\n\n ****** Starting run_check_grads")
printstruct(hp)
println()
if !(samplepct > 0.0 && samplepct <= 1.0)
error("input samplepct must be greater than zero and less than or equal to 1.0")
end
if length(biasgradidx) == 0 # no input of pre-selected indices
if samplepct == 1.0
biasgradidx = [eachindex(x) for x in wgts.bias[2:wgts.output_layer]]
else # must be in (0.0,1.0)
biasgradidx = [sample_idx(samplepct,x) for x in wgts.bias[2:wgts.output_layer]]
end
insert!(biasgradidx, 1, eachindex([])) # we need this placeholder for layer 1
end
if length(thetagradidx) == 0 # no input of pre-selected indices
if samplepct == 1.0
thetagradidx = [eachindex(x) for x in wgts.theta[2:wgts.output_layer]]
else # must be in (0.0,1.0)
thetagradidx = [sample_idx(samplepct,x) for x in wgts.theta[2:wgts.output_layer]]
end
insert!(thetagradidx, 1, eachindex([])) # we need this placeholder for layer 1
end
# run model a few more iterations: especially valuable if full model uses minibatches
# TODO does this work or mess things up?
for i = 1:iters
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) # no batches, no batchnorm, no dropout
backprop!(wgts, dat, hp)
update_parameters!(wgts, hp)
end
# initialize numeric gradients results holder
numgradtheta = [zeros(length(x)) for x in thetagradidx]
numgradbias = [zeros(length(x)) for x in biasgradidx]
println("numgradbias holder ", size(numgradbias), " ", size(numgradbias[1]))
# compute numgrad
kinkrejecttheta, kinkrejectbias, numcost, numpreds, bnnumgrads = compute_numgrad!(numgradtheta, numgradbias,
thetagradidx, biasgradidx, wgts, dat, hp, model; bn=bn, tweak=tweak)
numgrad = (numgradtheta,numgradbias)
# compute model grad
println(" ****** Calculating feedfwd/backprop gradients")
# @bp
modcost, modpreds, bnmodgrads = compute_modelgrad!(dat, wgts, hp, bn, model) # changes dat, wgts and hp
modgrad = (deepcopy(wgts.delta_th), deepcopy(wgts.delta_b)) # do I need to copy since compute_numgrad! does no backprop?
# compare results from modgrads and numgrads
println(" ****** Are costs the same? (model, num cost) ")
println(modcost, " ", numcost)
println(" ****** How close are the predictions?")
println(count(modpreds .== numpreds), " of ", length(modpreds))
println(" mean abs difference ",sum(abs.(modpreds .- numpreds)) / length(modpreds))
# filter modgrads to match sample of numgrads
modgrad = [ [modgrad[1][i][thetagradidx[i]] for i in 2:wgts.output_layer],
[modgrad[2][i][biasgradidx[i]] for i in 2:wgts.output_layer] ]
println("modgrad output ", size(modgrad), " ", size(modgrad[2]))
println("numgrad output ", length(numgrad), " ", size(numgrad[2]))
deltacols = hcat(flat(modgrad), flat(numgrad))
relative_error = map(x -> abs(x[1]-x[2]) / (maximum(abs.(x)) + 1e-15), eachrow(deltacols))
rel_err2 = norm(deltacols[:,1] .- deltacols[:,2]) / (norm(deltacols[:,1]) + norm(deltacols[:,2]))
!quiet && begin
println("\nrelative errors")
for v in relative_error;
@printf(" % 8.5f\n", v)
end
println(rel_err2)
# compare modelgrad and numgrad
println("\ngradients")
println(" model", " numeric")
# print theta gradients
startrow = 1
endrow = 0
for l = 2:wgts.output_layer
@printf("theta of layer %d, sample size %s, actual size %s\n", l, size(numgradtheta[l]), size(wgts.theta[l]))
endrow = startrow + length(numgradtheta[l]) - 1
@printf("start: %d end: %d\n", startrow, endrow)
printby2(deltacols[startrow:endrow,:])
startrow = endrow + 1
end
# print bias gradients
startrow = endrow + 1
for l = 2:wgts.output_layer
@printf("bias of layer %d, sample size %s, actual size %s\n", l, size(numgradbias[l]), size(wgts.bias[l]))
endrow = startrow + length(numgradbias[l]) - 1
@printf("start: %d end: %d\n", startrow, endrow)
printby2(deltacols[startrow:endrow,:])
startrow = endrow + 1
end
# print batchnorm gradients
if hp.do_batch_norm
bnmodgrads = [ [bnmodgrads[1][i][biasgradidx[i]] for i in 2:wgts.output_layer-1],
[bnmodgrads[2][i][biasgradidx[i]] for i in 2:wgts.output_layer-1] ]
# println("bnmodgrads ", size(bnmodgrads), " ", size(bnmodgrads[2]), " ", size(bnmodgrads[2][2]))
# println("bnnumgrads ", size(bnnumgrads), " ", size(bnnumgrads[2]), " ", size(bnnumgrads[2][2]))
bncols = hcat(flat(bnmodgrads), flat(bnnumgrads))
println("batchnorm parameter gradients\n")
printby2(bncols)
else
# do nothing
end
end # begin block
# summary stats
println("\nMean Gross difference")
println(mean(abs.(deltacols[:,1] .- deltacols[:,2])))
println("\nRelative difference using norm")
println(norm(deltacols[:,1] .- deltacols[:,2])/norm(deltacols[:,1] .+ deltacols[:,2]))
println("\nMean Relative error")
println(mean(relative_error))
println("\nMax Relative error")
println(maximum(relative_error))
println("\nCount Relative error >= 0.2")
println(count(i -> i >= 0.2, relative_error), " out of ", length(relative_error))
end
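# Worked example of the relative-error metric printed above: if a model gradient is 0.1001 and the
# corresponding numeric gradient is 0.1000, the relative error is |0.1001 - 0.1000| / 0.1001, roughly 1e-3.
# The norm-based figure aggregates the same comparison over all sampled gradients at once.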
"""
two methods for dealing with batch normalization
1. do a final feedfwd_predict using the batch norm terms. Then feedfwd and backprop to generate theta
and bias weights without batchnorm terms: mu, stddev, gamma and beta. Possibly more than one
"settle down" iteration will be needed. Then:
For analytic, backprop to calculate the deltas for the weights, leaving out the batchnorm terms.
For numeric, calculate numeric approximation for weights, leaving out the batchnorm terms.
This can work, but the high learning rate possible with batchnorm won't work in many cases.
The bias weights will be off.
2. Include the gamma and beta parameters and compare numeric approximation and backprop deltas for theta, gamma
and delta. Use running average of mu and stddev for feedforward, as one would for a test or validation
data set.
For analytic, backprop to calculate the delta updates for gamma and delta.
For numeric, perturb (separately) gamma and beta and use cost predictions to calculate the approximations to
the deltas.
This method is better because it checks are code to see if we calculate the deltas for all trained parameters.
"""
function set_hp(hp)
# set appropriate bogus hyper-parameters for mini-model AND FOR FINAL GRAD CALCULATION RUN???
minihp = Hyper_parameters()
# change parameters to special values for grad checking
minihp.reg = "" # no reg
# minihp.stats = [] # no stats STATS ARE OK. DON'T HURT ANYTHING OR COST THAT MUCH IN PERF FOR THIS
minihp.plot_now = false # no plots
minihp.opt = "none" # no optimization
minihp.learn_decay = [1.0, 1.0] # no learning rate decay
minihp.do_batch_norm = false # no batch normalization
minihp.dobatch = false # no batches
minihp.alphamod = 0.02 # low learning rate
minihp.dropout = false
minihp.norm_mode = "minmax" # only used with minimodel
minihp.bias_initializer = hp.bias_initializer # only used with minimodel
minihp.lambda = 0.0 # shouldn't matter as reg is ""
minihp.sparse = false
minihp.quiet = true
# use the real models params here
minihp.hidden = hp.hidden
minihp.classify = hp.classify
# printstruct(minihp)
return minihp
end
function use_minidata(minihp;in_k=3, out_k=3, m=5, hid=5)
# change hidden layer unit count to tiny example
for hl in 1:length(minihp.hidden)
minihp.hidden[hl] = (minihp.hidden[hl][1], hid) # can't update the tuple; replace it
end
# create repeatable, bogus train data (x,y) for mini-model
mini_x = zeros(in_k, m)
mini_x = debug_initialize_weights!(mini_x)
mini_y = 1 .+ mod.(collect(1:m), out_k)
mini_y = collect(onehot(mini_y, out_k)') # onehot encode categories
# create training data structures and setup functions, etc (see pretrain function in setup_training.jl)
minidat, minimb, miniwgts, minibn, minimodel = pretrain(mini_x, mini_y, minihp)
# initialize theta and bias (swap in debug_initialize_weights! below for repeatable values)
for l = 2:miniwgts.output_layer
# debug_initialize_weights!(miniwgts.theta[l])
xavier_initialize_weights!(miniwgts.theta[l])
bias_val!(miniwgts.bias[l], minihp.bias_initializer)
end
dotest = false
ministats = setup_stats(minihp, dotest)
return minidat, minimb, miniwgts, minibn, ministats, minimodel
end
"""
caution
Kinks in the objective. One source of inaccuracy to be aware of during gradient checking is the problem of kinks. Kinks refer to non-differentiable parts of an objective function, introduced by functions such as ReLU (max(0,x)), or the SVM loss, Maxout neurons, etc. Consider gradient checking the ReLU function at x=−1e6. Since x<0, the analytic gradient at this point is exactly zero. However, the numerical gradient would suddenly compute a non-zero gradient because f(x+h) might cross over the kink (e.g. if h>1e−6) and introduce a non-zero contribution. You might think that this is a pathological case, but in fact this case can be very common. For example, an SVM for CIFAR-10 contains up to 450,000 max(0,x) terms because there are 50,000 examples and each example yields 9 terms to the objective. Moreover, a Neural Network with an SVM classifier will contain many more kinks due to ReLUs.
Note that it is possible to know if a kink was crossed in the evaluation of the loss. This can be done by keeping track of the identities of all “winners” in a function of form max(x,y); That is, was x or y higher during the forward pass. If the identity of at least one winner changes when evaluating f(x+h) and then f(x−h), then a kink was crossed and the numerical gradient will not be exact.
"""
function compute_numgrad!(numgradtheta, numgradbias, thetagradidx, biasgradidx, wgts, dat, hp, model;
bn=Batch_norm_params(), tweak=1e-7, quiet=true)
# TODO how to handle conditional passing of bn struct
# TODO factor this to handle 1 pass of numeric grads. Caller builds the complete set.
# TODO be sure we use right hp values and setup functions to match
# @bp
kinkcnt = 0
kinkrejectbias = Tuple{Int64, Int64}[]
kinkrejecttheta = Tuple{Int64, Int64}[]
comploss = 0.0
numpreds = deepcopy(dat.a[wgts.output_layer])
olddat = deepcopy(dat)
output_layer = wgts.output_layer
# test first pass of cost
# feedfwd_predict!(dat, wgts, hp)
# loss1 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
# println("1st pass at cost ", loss1)
for lr in 2:wgts.output_layer # loop by layer
@printf(" ****** Calculating numeric bias gradients layer: %d\n", lr)
for (idx,bi) in enumerate(biasgradidx[lr]) # eachindex(wgts.bias[lr]) # tweak each bias, compute partial diff for each bias
wgts.bias[lr][bi] += tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) #feedfwd_predict!(dat, wgts, hp)
kinktst1 = findkink(olddat, dat, output_layer)
loss1 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
wgts.bias[lr][bi] -= 2.0 * tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false)
kinktst2 = findkink(olddat, dat, output_layer)
loss2 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
wgts.bias[lr][bi] += tweak
if kinktst1 || kinktst2
kinkcnt += 1
push!(kinkrejectbias, (lr, bi))
end
numgradbias[lr][idx] = (loss1 - loss2) / (2.0 * tweak)
!quiet && @printf("loss1: %f loss2: %f grad: %f kink: %d\n", loss1, loss2, numgradbias[lr][idx], kinkcnt)
end
println("kink errors for bias = ", kinkcnt)
@printf(" ****** Calculating numeric theta gradients layer: %d\n", lr)
for (idx,thi) in enumerate(thetagradidx[lr]) # eachindex(wgts.theta[lr]) # tweak each theta, compute partial diff for each theta
wgts.theta[lr][thi] += tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false)
kinktst1 = findkink(olddat, dat, output_layer)
loss1 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
wgts.theta[lr][thi] -= 2.0 * tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false)
kinktst2 = findkink(olddat, dat, output_layer)
loss2 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
wgts.theta[lr][thi] += tweak # set it back to starting value
comploss = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n) # diagnostic
if kinktst1 || kinktst2
kinkcnt += 1
push!(kinkrejecttheta, (lr, thi))
end
numgradtheta[lr][idx] = (loss1 - loss2) / (2.0 * tweak)
!quiet && @printf("loss1: %f loss2: %f grad: %f kink: %d\n", loss1, loss2, numgradtheta[lr][idx], kinkcnt)
numpreds .= dat.a[wgts.output_layer] # copy the values rather than binding a reference
end
end
println("count of gradients rejected: ", kinkcnt)
if hp.do_batch_norm
gamgrad = [zeros(length(x)) for x in biasgradidx]
betgrad = [zeros(length(x)) for x in biasgradidx]
for lr = 2:wgts.output_layer-1
@printf(" ****** Calculating numeric batchnorm gamma gradients layer: %d\n", lr)
for (idx,bi) in enumerate(biasgradidx[lr])
bn.gam[lr][bi] += tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) # bn is closure in function placeholder
loss1 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
bn.gam[lr][bi] -= 2.0 * tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) # bn is closure in function placeholder
loss2 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
bn.gam[lr][bi] += tweak
gamgrad[lr][idx] = (loss1 - loss2) / (2.0 * tweak)
end
@printf(" ****** Calculating numeric batchnorm beta gradients layer: %d\n", lr)
for (idx,bi) in enumerate(biasgradidx[lr])
bn.bet[lr][bi] += tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) # bn is closure in function placeholder
loss1 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
bn.bet[lr][bi] -= 2.0 * tweak
feedfwd!(dat, wgts, hp, bn, model.ff_execstack, dotrain=false) # bn is closure in function placeholder
loss2 = model.cost_function(dat.targets, dat.a[wgts.output_layer], dat.n)
bn.bet[lr][bi] += tweak
betgrad[lr][idx] = (loss1 - loss2) / (2.0 * tweak)
end
end
bngrads = [gamgrad[2:wgts.output_layer-1], betgrad[2:wgts.output_layer-1]]
println("bngrads ", size(bngrads), " ", size(bngrads[1]), " ", size(bngrads[2]))
else
bngrads = [[], []] # signifies empty
end
return kinkrejecttheta, kinkrejectbias, comploss, numpreds, bngrads
end # function compute_numgrad!
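# Note on the centered difference used above: (loss1 - loss2) / (2 * tweak) has truncation error of
# order tweak^2, versus order tweak for a one-sided difference, which is the usual reason to prefer it.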
function findkink(olddat, dat, output_layer)
kink = false
for lr = 2:output_layer
kink = !all(sign.(olddat.a[lr]) .== sign.(dat.a[lr]))
if kink
break
end
end
return kink
end
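# Illustrative sketch (not part of the package API): a centered difference misestimates the ReLU
# gradient whenever the +/- tweak straddles the kink at zero, which is the situation findkink
# detects by looking for sign changes in the activations.
function demo_relu_kink(; x=1e-7, tweak=1e-6)
relu(z) = max(0.0, z)
numeric = (relu(x + tweak) - relu(x - tweak)) / (2.0 * tweak) # the minus side crosses the kink
analytic = x > 0.0 ? 1.0 : 0.0 # true ReLU derivative away from zero
println("numeric: ", numeric, "  analytic: ", analytic) # prints roughly 0.55 vs 1.0
end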
function compute_modelgrad!(dat, nnw, hp, bn, model)
# @bp
cost, accuracy = quick_stats(dat, nnw, hp, model)
println("accuracy: ", accuracy)
println("cost: ", cost)
# @bp
hp.mb_size = float(dat.n)
feedfwd!(dat, nnw, hp, bn, model.ff_execstack, dotrain=false)
backprop!(nnw, dat, hp) # for all layers
if hp.do_batch_norm
for lr in 2:nnw.output_layer-1 # defaults never touched; set to zero for comparison to numeric grads
nnw.delta_b[lr] .= 0.0
end
bngrads = [bn.delta_gam, bn.delta_bet]
println("model bn grads ", size(bngrads), " ", size(bngrads[2]), " ", size(bngrads[2][2]))
else
bngrads = [[] for i in 2:nnw.output_layer-1]
end
return cost, dat.a[nnw.output_layer], bngrads
end
"""
**debug_initialize_weights** Initialize the weights of a layer
using a fixed strategy, which will help you later in debugging.
We use this with a modeling approach that separates weights from the bias term.
"""
function debug_initialize_weights!(mx)
# Initialize w using "sin", this ensures that w is always of the same
# values and will be useful for debugging
mx[:] = reshape(sin.(0:length(mx)-1), size(mx)) / 10.0
end
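# Example: the sin-based fill produces the same values on every call, so runs are comparable.
#   a = zeros(3, 4); debug_initialize_weights!(a)
#   b = zeros(3, 4); debug_initialize_weights!(b)
#   a == b # true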
function xavier_initialize_weights!(mx, scale=2.0)
mx[:] = randn(size(mx)) .* sqrt(scale/size(mx,2))
end
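# Note: with the default scale=2.0 this draws weights with standard deviation sqrt(2 / fan_in)
# (fan_in = size(mx, 2)), the He-style initialization commonly paired with ReLU activations.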
function bias_val!(bias, val)
fill!(bias, val)
end
function bias_rand!(bias, val)
bias[:] = rand(size(bias,1))
end
# method works on the number of elements you want to sample from
function sample_idx(xpct, lx::Int)
xcnt = min(lx, ceil(Int, xpct * lx) + 2) # +2 ensures a few samples; cap at lx to stay in bounds
sort!(randperm(lx)[1:xcnt])
end
# method works on the object that you want indices for
function sample_idx(xpct, x::Array{T}) where {T}
lx = length(x)
sample_idx(xpct, lx)
end
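# Example: sample roughly 10% of the linear indices of a weight matrix (plus a couple of extras
# so that tiny layers still contribute a few), returned sorted for a stable iteration order.
#   w = randn(10, 25)
#   idxs = sample_idx(0.10, w) # 27 sorted linear indices into w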