Significant changes to make LDNet more usable
Added a correct implementation of normalized ReHu to LDLayers. Cleaned
up the LDNet code a bit, changed its default plotting to training-set loss,
and added weight/bias fuzzing.
Philip-Bachman committed May 30, 2014
1 parent a026d8f commit 8892979
Showing 11 changed files with 360 additions and 263 deletions.
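
For reference, a minimal MATLAB sketch of the normalized ReHu activation this commit adds. It mirrors the new norm_rehu_trans code in LayerNet/LDLayer.m shown below; the randn input and its size are illustrative assumptions. Each pre-transform activation passes through a rectified Huber nonlinearity, and each row of outputs is then divided by its epsilon-smoothed norm.

A_pre = randn(10, 8, 'single');                           % illustrative pre-transform activations
R = max(A_pre, 0);                                        % rectify
R = (R < 0.5) .* R.^2 + (R >= 0.5) .* (R - 0.25);         % Huberize: quadratic below 0.5, linear above
A_post = bsxfun(@rdivide, R, sqrt(sum(R.^2, 2) + 1e-3));  % normalize each row (EPS = 1e-3, as in the diff)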
Binary file modified .DS_Store
Binary file not shown.
54 changes: 32 additions & 22 deletions ActFunc.m
@@ -42,7 +42,7 @@
end

function [ node_grads ] = backprop(self, post_grads, post_weights,...
pre_values, pre_weights, act_grads)
pre_values, pre_weights, act_grads, act_masks)
% Backpropagate gradients through some activation function
%
% BP functions take the following arguments:
@@ -56,6 +56,8 @@
% current layer's nodes. size: (pre_dim x cur_dim)
% act_grads: gradients directly on the post-transform activations
% at the current layer's nodes.
% act_masks: dropout masks that were applied to activations
%
obs_count = size(post_grads,1);
cur_dim = max(size(pre_weights,2),size(post_weights,1));
if ~exist('act_grads','var')
@@ -64,25 +66,25 @@
switch self.func_type
case 1
node_grads = ActFunc.linear_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 2
node_grads = ActFunc.sigmoid_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 3
node_grads = ActFunc.tanh_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 4
node_grads = ActFunc.logexp_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 5
node_grads = ActFunc.relu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 6
node_grads = ActFunc.rehu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads);
pre_values, pre_weights, act_grads, act_masks);
case 7
node_grads = ActFunc.norm_rehu_bp(post_grads,...
post_weights, pre_values, pre_weights, act_grads);
node_grads = ActFunc.norm_rehu_bp(post_grads, post_weights,...
pre_values, pre_weights, act_grads, act_masks);
otherwise
error('No valid activation function type selected.');
end
@@ -108,7 +110,7 @@
end

function [ dLdF ] = linear_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -123,13 +125,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dLdA = (post_grads * post_weights') + act_grads;
dAdF = 1;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end
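% Side note (not part of this commit): a minimal sketch of how the new
% act_masks argument is meant to be used. A dropout mask generated during
% the feedforward pass is reapplied to the backpropagated gradient. The
% drop rate and mask generation here are illustrative assumptions; only the
% final masking line mirrors the linear_bp code above.
acts = randn(10, 8, 'single');                  % post-transform activations
drop_rate = 0.5;                                % assumed dropout rate
act_masks = single(rand(size(acts)) > drop_rate);
acts = acts .* act_masks;                       % feedforward: drop activations
dLdA = randn(10, 8, 'single');                  % gradient arriving at this layer
dAdF = ones(size(acts), 'single');              % linear case, as in linear_bp
dLdF = (dLdA .* act_masks) .* dAdF;             % backprop: mask the gradient too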

@@ -147,7 +150,7 @@
end

function [ dLdF ] = sigmoid_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -162,14 +165,15 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
e_mx = exp(-pre_acts * pre_weights);
dLdA = (post_grads * post_weights') + act_grads;
dAdF = e_mx ./ (1 + e_mx).^2;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -187,7 +191,7 @@
end

function [ dLdF ] = tanh_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -202,13 +206,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dAdF = 1 - (tanh(pre_acts * pre_weights)).^2;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -226,7 +231,7 @@
end

function [ dLdF ] = logexp_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -241,14 +246,15 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
exp_vals = exp(pre_acts * pre_weights);
dAdF = exp_vals ./ (exp_vals + 1);
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -267,7 +273,7 @@
end

function [ dLdF ] = relu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -282,13 +288,14 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
%
dAdF = (pre_acts * pre_weights) > 0;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -311,7 +318,7 @@
end

function [ dLdF ] = rehu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -326,6 +333,7 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
@@ -334,7 +342,7 @@
dAdF = 2 * dAdF;
dAdF(dAdF > 1) = 1;
dLdA = (post_grads * post_weights') + act_grads;
dLdF = dLdA .* dAdF;
dLdF = (dLdA .* act_masks) .* dAdF;
return
end

@@ -360,7 +368,7 @@
end

function [ dLdF ] = norm_rehu_bp(post_grads, post_weights, ...
pre_acts, pre_weights, act_grads)
pre_acts, pre_weights, act_grads, act_masks)
% Compute the gradients w.r.t. the pre-transform activations at all
% nodes in the current layer.
%
@@ -375,6 +383,7 @@
% current layer (pre_dim x cur_dim)
% act_grads: direct loss gradients on post-transform activations
% for each node in current layer (obs_count x cur_dim)
% act_masks: dropout masks applied to activations
% Outputs:
% dLdF: gradients w.r.t pre-transform node activations at current
% layer (obs_count x cur_dim)
@@ -391,6 +400,7 @@
% Compute
dA1dF = 2*(quad_mask .* F) + line_mask;
dLdA2 = (post_grads * post_weights') + act_grads;
dLdA2 = dLdA2 .* act_masks;
V = dLdA2 .* A1;
V = sum(V, 2);
dLdA1 = bsxfun(@rdivide, dLdA2, A1N) - ...
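For readers tracing the norm_rehu_bp changes just above: with a row of ReHu outputs $r$, smoothed norm $n = \sqrt{\sum_j r_j^2 + \epsilon}$, and normalized activations $a = r / n$, backpropagating through the normalization gives

$$\frac{\partial L}{\partial r_i} = \frac{1}{n}\,\frac{\partial L}{\partial a_i} - a_i \cdot \frac{\sum_j \frac{\partial L}{\partial a_j}\, r_j}{n^2},$$

which is what the dLdA1 expression computes, with dLdA2 in the role of $\partial L/\partial a$, A1 of $r$, A1N of $n$, and V of the inner sum; the truncated lines below presumably finish by scaling elementwise with dA1dF, the ReHu derivative.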
Binary file modified LayerNet/.DS_Store
Binary file not shown.
74 changes: 55 additions & 19 deletions LayerNet/LDLayer.m
@@ -116,19 +116,17 @@
% the pre and post transform values.
A_pre = X * Wm';
% Pass linear function outputs through self.act_trans.
A_post = self.act_trans(A_pre, 'ff');
A_post = self.act_trans(A_pre, 'ff', [], [], []);
% Update timing info
self.ff_evals = self.ff_evals + size(X,1);
return
end

function [ dLdW dLdX ] = ...
backprop(self, dLdA_post, dLdA_pre, A_post, X, Wm)
function [ dLdW dLdX ] = backprop(self, dLdA, A, X, Wm)
% Backprop through the linear functions and post-linear transforms
% for this layer.
%
dAdF = self.act_trans(A_post, 'bp');
dLdF = (dLdA_post .* dAdF) + dLdA_pre;
dLdF = self.act_trans(A, 'bp', dLdA, X, Wm);
% Compute gradients with respect to linear function parameters
dLdW = dLdF' * X;
% Compute gradients with respect to input matrix X
@@ -142,36 +140,36 @@

methods (Static = true)

function [ F ] = tanh_trans(X, comp_type)
% Transform the elements of X by hypertangent.
function [ F ] = tanh_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by hypertangent, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
F = tanh(X);
else
% Do backprop
F = 1 - X.^2;
dAdF = 1 - X.^2;
F = dLdA .* dAdF;
end
return
end

function [ F ] = relu_trans(X, comp_type)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
function [ F ] = relu_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by ReLU, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
F = max(X, 0);
else
% Do backprop
F = single(X > 0);
dAdF = single(X > 0);
F = dLdA .* dAdF;
end
return
end

function [ F ] = rehu_trans(X, comp_type)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
function [ F ] = rehu_trans(X, comp_type, dLdA, Xin, Win)
% Transform the elements of X by ReHu, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
if (strcmp(comp_type,'ff'))
% Do feedforward
@@ -181,13 +179,50 @@
else
% Do backprop
mask = (X < 0.25) & (X > 1e-10);
F = single(X > 0);
F(mask) = 2*sqrt(X(mask));
dAdF = single(X > 0);
dAdF(mask) = 2*sqrt(X(mask));
F = dLdA .* dAdF;
end
return
end

function [ F ] = norm_rehu_trans(X, comp_type, dLdA2, Xin, Win)
% Transform the elements of X by normed ReHu, or do backprop
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
EPS = 1e-3;
if (strcmp(comp_type,'ff'))
% Do feedforward
cur_acts = X;
cur_acts = bsxfun(@max, cur_acts, 0);
quad_mask = bsxfun(@lt, cur_acts, 0.5);
line_mask = bsxfun(@ge, cur_acts, 0.5);
cur_acts = (quad_mask .* cur_acts.^2) + ...
(line_mask .* (cur_acts - 0.25));
act_norms = sqrt(sum(cur_acts.^2,2) + EPS);
F = bsxfun(@rdivide, cur_acts, act_norms);
else
% Do backprop
F = Xin * Win';
F = bsxfun(@max, F, 0);
quad_mask = bsxfun(@lt, F, 0.5);
line_mask = bsxfun(@ge, F, 0.5);
A1 = (quad_mask .* F.^2) + ...
(line_mask .* (F - 0.25));
A1N = sqrt(sum(A1.^2,2) + EPS);
A2 = bsxfun(@rdivide, A1, A1N);
% Compute
dA1dF = 2*(quad_mask .* F) + line_mask;
V = dLdA2 .* A1;
V = sum(V, 2);
dLdA1 = bsxfun(@rdivide, dLdA2, A1N) - ...
bsxfun(@times, A2, (V ./ (A1N.^2.0)));
F = dLdA1 .* dA1dF;
end
return
end


function [ F ] = line_trans(X, comp_type)
function [ F ] = line_trans(X, comp_type, dLdA, Xin, Win)
% Leave the values in X unchanged. Or, backprop through the
% non-transform.
assert((strcmp(comp_type,'ff')||strcmp(comp_type,'bp')),'ff/bp?');
@@ -196,7 +231,8 @@
F = X;
else
% Do backprop
F = ones(size(X),'single');
dAdF = ones(size(X),'single');
F = dLdA .* dAdF;
end
return
end
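To close, a hedged usage sketch of the updated transform interface in LDLayer: a single static function now serves both the feedforward ('ff') and backprop ('bp') passes, taking the extra gradient, input, and weight arguments in the 'bp' case. The matrix sizes and the squared-error style gradient below are illustrative assumptions; the call pattern follows the feedforward and backprop methods shown above, assuming LDLayer.m is on the MATLAB path.

X = randn(10, 5, 'single');        % observations (obs_count x pre_dim)
Wm = randn(8, 5, 'single');        % layer weights (cur_dim x pre_dim)
A_pre = X * Wm';                   % linear part of feedforward
A_post = LDLayer.norm_rehu_trans(A_pre, 'ff', [], [], []);
dLdA_post = A_post - randn(10, 8, 'single');   % e.g. gradient of a squared loss (illustrative)
dLdF = LDLayer.norm_rehu_trans(A_post, 'bp', dLdA_post, X, Wm);
dLdW = dLdF' * X;                  % gradient w.r.t. the layer weights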
