Significant changes to make LDNet more usable
Added a correct implementation of normalized ReHu to LDLayers. Cleaned
up the LDNet code a bit, changed its default plotting to training-set loss,
and added weight/bias fuzzing.
Philip-Bachman committed May 30, 2014
1 parent a026d8f commit 8892979
Showing 11 changed files with 360 additions and 263 deletions.
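
For reference, a minimal MATLAB sketch of the normalized ReHu activation this commit adds. It mirrors the new norm_rehu_trans code in LayerNet/LDLayer.m shown below; the randn input and its size are illustrative assumptions. Each pre-transform activation passes through a rectified Huber nonlinearity, and each row of outputs is then divided by its epsilon-smoothed norm.

A_pre = randn(10, 8, 'single');                           % illustrative pre-transform activations
R = max(A_pre, 0);                                        % rectify
R = (R < 0.5) .* R.^2 + (R >= 0.5) .* (R - 0.25);         % Huberize: quadratic below 0.5, linear above
A_post = bsxfun(@rdivide, R, sqrt(sum(R.^2, 2) + 1e-3));  % normalize each row (EPS = 1e-3, as in the diff)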
Binary file modified .DS_Store
Binary file not shown.
54 changes: 32 additions & 22 deletions ActFunc.m
@@ -42,7 +42,7 @@
end

function [ node_grads ] = backprop(self, post_grads, post_weights,...
pre_values, pre_weights, act_grads)
pre_values, pre_weights, act_grads, act_masks)
% Backpropagate gradients through some activation function
%
% BP functions take the following arguments:
@@ -56,6 +56,8 @@
% current layer's nodes. size: (pre_dim x cur_dim)
% act_grads: gradients directly on the post-transform activations
% at the current layer's nodes.
% act_masks: dropout masks that were applied to activations
%
obs_count = size(post_grads,1);
cur_dim = max(size(pre_weights,2),size(post_weights,1));
if ~exist('act_grads','var')
@@ -64,25 +66,25 @@
switch self.func_type
case 1
node_grads = ActFunc.linear_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 2
node_grads = ActFunc.sigmoid_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 3
node_grads = ActFunc.tanh_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 4
node_grads = ActFunc.logexp_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 5
node_grads = ActFunc.relu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 6
node_grads = ActFunc.rehu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 7
node_grads = ActFunc.norm_rehu_bp(post_grads,...
post_weights, pre_values, pre_weights, act_grads);
node_grads = ActFunc.norm_rehu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads, act_masks);
otherwise
error('No valid activation function type selected.');
end
@@ -108,7 +110,7 @@
end

function [ dLdF ] = linear_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -123,13 +125,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dLdA = (post_grads * post_weights') + act_grads;
dAdF = 1;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end
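% Side note (not part of this commit): a minimal sketch of how the new
% act_masks argument is meant to be used. A dropout mask generated during
% the feedforward pass is reapplied to the backpropagated gradient. The
% drop rate and mask generation here are illustrative assumptions; only the
% final masking line mirrors the linear_bp code above.
acts = randn(10, 8, 'single');                  % post-transform activations
drop_rate = 0.5;                                % assumed dropout rate
act_masks = single(rand(size(acts)) > drop_rate);
acts = acts .* act_masks;                       % feedforward: drop activations
dLdA = randn(10, 8, 'single');                  % gradient arriving at this layer
dAdF = ones(size(acts), 'single');              % linear case, as in linear_bp
dLdF = (dLdA .* act_masks) .* dAdF;             % backprop: mask the gradient too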

@@ -147,7 +150,7 @@
end

function [ dLdF ] = sigmoid_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -162,14 +165,15 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
e_mx = exp(-pre_acts * pre_weights);
dLdA = (post_grads * post_weights') + act_grads;
dAdF = e_mx ./ (1 + e_mx).^2;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -187,7 +191,7 @@
end

function [ dLdF ] = tanh_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -202,13 +206,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dAdF = 1 - (tanh(pre_acts * pre_weights)).^2;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -226,7 +231,7 @@
end

function [ dLdF ] = logexp_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -241,14 +246,15 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
exp_vals = exp(pre_acts * pre_weights);
dAdF = exp_vals ./ (exp_vals + 1);
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -267,7 +273,7 @@
end

function [ dLdF ] = relu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -282,13 +288,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dAdF = (pre_acts * pre_weights) > 0;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -311,7 +318,7 @@
end

function [ dLdF ] = rehu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -326,6 +333,7 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
@@ -334,7 +342,7 @@
dAdF = 2 * dAdF;
dAdF(dAdF > 1) = 1;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -360,7 +368,7 @@
end

function [ dLdF ] = norm_rehu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -375,6 +383,7 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
@@ -391,6 +400,7 @@
% Compute
dA1dF = 2*(quad_mask .* F) + line_mask;
dLdA2 = (post_grads * post_weights') + act_grads;
dLdA2 = dLdA2 .* act_masks;
V = dLdA2 .* A1;
V = sum(V, 2);
dLdA1 = bsxfun(@rdivide, dLdA2, A1N) - ...
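For readers tracing the norm_rehu_bp changes just above: with a row of ReHu outputs $r$, smoothed norm $n = \sqrt{\sum_j r_j^2 + \epsilon}$, and normalized activations $a = r / n$, backpropagating through the normalization gives

$$\frac{\partial L}{\partial r_i} = \frac{1}{n}\,\frac{\partial L}{\partial a_i} - a_i \cdot \frac{\sum_j \frac{\partial L}{\partial a_j}\, r_j}{n^2},$$

which is what the dLdA1 expression computes, with dLdA2 in the role of $\partial L/\partial a$, A1 of $r$, A1N of $n$, and V of the inner sum; the truncated lines below presumably finish by scaling elementwise with dA1dF, the ReHu derivative.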
Binary file modified LayerNet/.DS_Store
Binary file not shown.
74 changes: 55 additions & 19 deletions LayerNet/LDLayer.m
@@ -116,19 +116,17 @@
% the pre and post transform values.
A_pre = X * Wm';
% Pass linear function outputs through self.act_trans.
A_post = self.act_trans(A_pre, 'ff');
A_post = self.act_trans(A_pre, 'ff', [], [], []);
% Update timing info
self.ff_evals = self.ff_evals + size(X,1);
return
end

function [ dLdW dLdX ] = ...
backprop(self, dLdA_post, dLdA_pre, A_post, X, Wm)
function [ dLdW dLdX ] = backprop(self, dLdA, A, X, Wm)
% Backprop through the linear functions and post-linear transforms
% for this layer.
%
dAdF = self.act_trans(A_post, 'bp');
dLdF = (dLdA_post .* dAdF) + dLdA_pre;
dLdF = self.act_trans(A, 'bp', dLdA, X, Wm);
% Compute gradients with respect to linear function parameters
dLdW = dLdF' * X;
% Compute gradients with respect to input matrix X
@@ -142,36 +140,36 @@

methods (Static = true)

function [ F ] = tanh_trans(X, comp_type)
% Transform the elements of X by hypertangent.
function [ F ] = tanh_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by hypertangent, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
F = tanh(X);
else
% Do backprop
F = 1 - X.^2;
dAdF = 1 - X.^2;
F = dLdA .* dAdF;
end
return
end

function [ F ] = relu_trans(X, comp_type)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
function [ F ] = relu_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by ReLU, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
F = max(X, 0);
else
% Do backprop
F = single(X > 0);
dAdF = single(X > 0);
F = dLdA .* dAdF;
end
return
end

function [ F ] = rehu_trans(X, comp_type)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
function [ F ] = rehu_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by ReHu, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
@@ -181,13 +179,50 @@
else
% Do backprop
mask = (X < 0.25) & (X > 1e-10);
F = single(X > 0);
F(mask) = 2*sqrt(X(mask));
dAdF = single(X > 0);
dAdF(mask) = 2*sqrt(X(mask));
F = dLdA .* dAdF;
end
return
end

function [ F ] = norm_rehu_trans(X, comp_type, dLdA2, Xin, Win)
% Transform the elements of X by normed ReHu, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
EPS = 1e-3;
if (strcmp(comp_type,'ff'))
% Do feedforward
cur_acts = X;
cur_acts = bsxfun(@max, cur_acts, 0);
quad_mask = bsxfun(@lt, cur_acts, 0.5);
line_mask = bsxfun(@ge, cur_acts, 0.5);
cur_acts = (quad_mask .* cur_acts.^2) + ...
(line_mask .* (cur_acts - 0.25));
act_norms = sqrt(sum(cur_acts.^2,2) + EPS);
F = bsxfun(@rdivide, cur_acts, act_norms);
else
% Do backprop
F = Xin * Win';
F = bsxfun(@max, F, 0);
quad_mask = bsxfun(@lt, F, 0.5);
line_mask = bsxfun(@ge, F, 0.5);
A1 = (quad_mask .* F.^2) + ...
(line_mask .* (F - 0.25));
A1N = sqrt(sum(A1.^2,2) + EPS);
A2 = bsxfun(@rdivide, A1, A1N);
% Compute
dA1dF = 2*(quad_mask .* F) + line_mask;
V = dLdA2 .* A1;
V = sum(V, 2);
dLdA1 = bsxfun(@rdivide, dLdA2, A1N) - ...
bsxfun(@times, A2, (V ./ (A1N.^2.0)));
F = dLdA1 .* dA1dF;
end
return
end


function [ F ] = line_trans(X, comp_type)
function [ F ] = line_trans(X, comp_type, dLdA, Xin, Win)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
@@ -196,7 +231,8 @@
F = X;
else
% Do backprop
F = ones(size(X),'single');
dAdF = ones(size(X),'single');
F = dLdA .* dAdF;
end
return
end
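To close, a hedged usage sketch of the updated transform interface in LDLayer: a single static function now serves both the feedforward ('ff') and backprop ('bp') passes, taking the extra gradient, input, and weight arguments in the 'bp' case. The matrix sizes and the squared-error style gradient below are illustrative assumptions; the call pattern follows the feedforward and backprop methods shown above, assuming LDLayer.m is on the MATLAB path.

X = randn(10, 5, 'single');        % observations (obs_count x pre_dim)
Wm = randn(8, 5, 'single');        % layer weights (cur_dim x pre_dim)
A_pre = X * Wm';                   % linear part of feedforward
A_post = LDLayer.norm_rehu_trans(A_pre, 'ff', [], [], []);
dLdA_post = A_post - randn(10, 8, 'single');   % e.g. gradient of a squared loss (illustrative)
dLdF = LDLayer.norm_rehu_trans(A_post, 'bp', dLdA_post, X, Wm);
dLdW = dLdF' * X;                  % gradient w.r.t. the layer weights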
