From a7cbe8beebda7f955995c3982135688107e2b945 Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 25 Oct 2016 17:06:27 -0400 Subject: [PATCH 1/5] Added set_s method for rnns --- dynet/fast-lstm.cc | 12 ++++++++++ dynet/fast-lstm.h | 1 + dynet/gru.cc | 19 +++++++++++++--- dynet/gru.h | 2 ++ dynet/lstm.cc | 57 +++++++++++++++++++++++++++------------------- dynet/lstm.h | 2 +- dynet/rnn.h | 43 +++++++++++++++++++++------------- 7 files changed, 93 insertions(+), 43 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index abc55e4a4..4d53ad6f7 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -105,6 +105,18 @@ Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new return h[t].back(); } +Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + + Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/fast-lstm.h b/dynet/fast-lstm.h index 891b46021..16733a9e9 100644 --- a/dynet/fast-lstm.h +++ b/dynet/fast-lstm.h @@ -43,6 +43,7 @@ struct FastLSTMBuilder : public RNNBuilder { void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) override; public: // first index is layer, then ... diff --git a/dynet/gru.cc b/dynet/gru.cc index 6f4ded7ca..c86a3f574 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -86,9 +86,20 @@ Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { return h[t].back(); } +Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { - if(dropout_rate != 0.f) - throw std::runtime_error("GRUBuilder doesn't support dropout yet"); + //if(dropout_rate != 0.f) + //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); const bool has_initial_state = (h0.size() > 0); h.push_back(vector(layers)); vector& ht = h.back(); @@ -101,6 +112,7 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { if (prev >= 0 || has_initial_state) { h_tprev = (prev < 0) ? 
h0[i] : h[prev][i]; } else { prev_zero = true; } + if (dropout_rate) in = dropout(in, dropout_rate); // update gate Expression zt; if (prev_zero) @@ -134,7 +146,8 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { in = ht[i] = crt + nwt; } } - return ht.back(); + if (dropout_rate) return dropout(ht.back(), dropout_rate); + else return ht.back(); } void GRUBuilder::copy(const RNNBuilder & rnn) { diff --git a/dynet/gru.h b/dynet/gru.h index 3d3e15012..8d907454a 100644 --- a/dynet/gru.h +++ b/dynet/gru.h @@ -22,11 +22,13 @@ struct GRUBuilder : public RNNBuilder { unsigned num_h0_components() const override { return layers; } void copy(const RNNBuilder & params) override; + protected: void new_graph_impl(ComputationGraph& cg) override; void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) override; // first index is layer, then ... std::vector> params; diff --git a/dynet/lstm.cc b/dynet/lstm.cc index a64ce8231..7b131db76 100644 --- a/dynet/lstm.cc +++ b/dynet/lstm.cc @@ -48,29 +48,29 @@ LSTMBuilder::LSTMBuilder(unsigned layers, vector ps = {p_x2i, p_h2i, p_c2i, p_bi, p_x2o, p_h2o, p_c2o, p_bo, p_x2c, p_h2c, p_bc}; params.push_back(ps); } // layers - dropout_rate = 0.f; + dropout_rate = 0.f; } -void LSTMBuilder::new_graph_impl(ComputationGraph& cg){ +void LSTMBuilder::new_graph_impl(ComputationGraph& cg) { param_vars.clear(); - for (unsigned i = 0; i < layers; ++i){ + for (unsigned i = 0; i < layers; ++i) { auto& p = params[i]; //i - Expression i_x2i = parameter(cg,p[X2I]); - Expression i_h2i = parameter(cg,p[H2I]); - Expression i_c2i = parameter(cg,p[C2I]); - Expression i_bi = parameter(cg,p[BI]); + Expression i_x2i = parameter(cg, p[X2I]); + Expression i_h2i = parameter(cg, p[H2I]); + Expression i_c2i = parameter(cg, p[C2I]); + Expression i_bi = parameter(cg, p[BI]); //o - Expression i_x2o = parameter(cg,p[X2O]); - Expression i_h2o = parameter(cg,p[H2O]); - Expression i_c2o = parameter(cg,p[C2O]); - Expression i_bo = parameter(cg,p[BO]); + Expression i_x2o = parameter(cg, p[X2O]); + Expression i_h2o = parameter(cg, p[H2O]); + Expression i_c2o = parameter(cg, p[C2O]); + Expression i_bo = parameter(cg, p[BO]); //c - Expression i_x2c = parameter(cg,p[X2C]); - Expression i_h2c = parameter(cg,p[H2C]); - Expression i_bc = parameter(cg,p[BC]); + Expression i_x2c = parameter(cg, p[X2C]); + Expression i_h2c = parameter(cg, p[H2C]); + Expression i_bc = parameter(cg, p[BC]); vector vars = {i_x2i, i_h2i, i_c2i, i_bi, i_x2o, i_h2o, i_c2o, i_bo, i_x2c, i_h2c, i_bc}; param_vars.push_back(vars); @@ -83,7 +83,7 @@ void LSTMBuilder::start_new_sequence_impl(const vector& hinit) { h.clear(); c.clear(); if (hinit.size() > 0) { - assert(layers*2 == hinit.size()); + assert(layers * 2 == hinit.size()); h0.resize(layers); c0.resize(layers); for (unsigned i = 0; i < layers; ++i) { @@ -108,6 +108,17 @@ Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { return h[t].back(); } +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { 
h.push_back(vector(layers)); c.push_back(vector(layers)); @@ -149,11 +160,11 @@ Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { Expression i_wt = tanh(i_awt); // output if (has_prev_state) { - Expression i_nwt = cwise_multiply(i_it,i_wt); - Expression i_crt = cwise_multiply(i_ft,i_c_tm1); + Expression i_nwt = cwise_multiply(i_it, i_wt); + Expression i_crt = cwise_multiply(i_ft, i_c_tm1); ct[i] = i_crt + i_nwt; } else { - ct[i] = cwise_multiply(i_it,i_wt); + ct[i] = cwise_multiply(i_it, i_wt); } Expression i_aot; @@ -163,18 +174,18 @@ Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { i_aot = affine_transform({vars[BO], vars[X2O], in, vars[C2O], ct[i]}); Expression i_ot = logistic(i_aot); Expression ph_t = tanh(ct[i]); - in = ht[i] = cwise_multiply(i_ot,ph_t); + in = ht[i] = cwise_multiply(i_ot, ph_t); } if (dropout_rate) return dropout(ht.back(), dropout_rate); - else return ht.back(); + else return ht.back(); } void LSTMBuilder::copy(const RNNBuilder & rnn) { const LSTMBuilder & rnn_lstm = (const LSTMBuilder&)rnn; assert(params.size() == rnn_lstm.params.size()); - for(size_t i = 0; i < params.size(); ++i) - for(size_t j = 0; j < params[i].size(); ++j) - params[i][j] = rnn_lstm.params[i][j]; + for (size_t i = 0; i < params.size(); ++i) + for (size_t j = 0; j < params[i].size(); ++j) + params[i][j] = rnn_lstm.params[i][j]; } void LSTMBuilder::save_parameters_pretraining(const string& fname) const { diff --git a/dynet/lstm.h b/dynet/lstm.h index 64a9c647c..ce92e153b 100644 --- a/dynet/lstm.h +++ b/dynet/lstm.h @@ -43,7 +43,7 @@ struct LSTMBuilder : public RNNBuilder { void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; - + Expression set_s_impl(int prev, const std::vector& s_new) override; public: // first index is layer, then ... std::vector> params; diff --git a/dynet/rnn.h b/dynet/rnn.h index 2cfe31352..09d11c3f6 100644 --- a/dynet/rnn.h +++ b/dynet/rnn.h @@ -36,7 +36,7 @@ struct RNNBuilder { // call this before add_input and after new_graph, // when starting a new sequence on the same hypergraph. 
// h_0 is used to initialize hidden layers at timestep 0 to given values - void start_new_sequence(const std::vector& h_0={}) { + void start_new_sequence(const std::vector& h_0 = {}) { sm.transition(RNNOp::start_new_sequence); cur = RNNPointer(-1); head.clear(); @@ -44,13 +44,21 @@ struct RNNBuilder { } // explicitly set the output state of a node - Expression set_h(const RNNPointer& prev, const std::vector& h_new={}) { + Expression set_h(const RNNPointer& prev, const std::vector& h_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); cur = head.size() - 1; return set_h_impl(prev, h_new); } + // Set the internal state of a node (for lstms/grus) + Expression set_s(const RNNPointer& prev, const std::vector& s_new = {}) { + sm.transition(RNNOp::add_input); + head.push_back(prev); + cur = head.size() - 1; + return set_s_impl(prev, s_new); + } + // add another timestep by reading in the variable x // return the hidden representation of the deepest layer Expression add_input(const Expression& x) { @@ -104,14 +112,16 @@ struct RNNBuilder { virtual void save_parameters_pretraining(const std::string& fname) const; virtual void load_parameters_pretraining(const std::string& fname); - protected: + +protected: virtual void new_graph_impl(ComputationGraph& cg) = 0; virtual void start_new_sequence_impl(const std::vector& h_0) = 0; virtual Expression add_input_impl(int prev, const Expression& x) = 0; virtual Expression set_h_impl(int prev, const std::vector& h_new) = 0; + virtual Expression set_s_impl(int prev, const std::vector& c_new) = 0; RNNPointer cur; - float dropout_rate; - private: + float dropout_rate; +private: // the state machine ensures that the caller is behaving RNNStateMachine sm; std::vector head; // head[i] returns the head position @@ -127,15 +137,16 @@ struct SimpleRNNBuilder : public RNNBuilder { unsigned input_dim, unsigned hidden_dim, Model* model, - bool support_lags=false); + bool support_lags = false); - protected: +protected: void new_graph_impl(ComputationGraph& cg) override; void start_new_sequence_impl(const std::vector& h_0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) {return set_h_impl(prev, s_new);} - public: +public: Expression add_auxiliary_input(const Expression& x, const Expression &aux); Expression back() const override { return (cur == -1 ? 
h0.back() : h[cur].back()); } @@ -151,7 +162,7 @@ struct SimpleRNNBuilder : public RNNBuilder { void save_parameters_pretraining(const std::string& fname) const override; void load_parameters_pretraining(const std::string& fname) override; - private: +private: // first index is layer, then x2h h2h hb std::vector> params; @@ -178,13 +189,13 @@ struct SimpleRNNBuilder : public RNNBuilder { namespace boost { - namespace serialization { - template - void serialize(Archive& ar, dynet::RNNPointer& p, const unsigned int version) - { - ar & p.t; - } - } // namespace serialization +namespace serialization { +template +void serialize(Archive& ar, dynet::RNNPointer& p, const unsigned int version) +{ + ar & p.t; +} +} // namespace serialization } // namespace boost BOOST_CLASS_EXPORT_KEY(dynet::RNNBuilder) From 729bdafc5f028e126c6c126e9fef80a5e0dc3708 Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 25 Oct 2016 17:15:18 -0400 Subject: [PATCH 2/5] python bindings for set_s --- python/dynet.pxd | 1 + python/dynet.pyx | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/dynet.pxd b/python/dynet.pxd index bbb9b2d7c..06c44f2e3 100644 --- a/python/dynet.pxd +++ b/python/dynet.pxd @@ -247,6 +247,7 @@ cdef extern from "dynet/rnn.h" namespace "dynet": CExpression add_input(CExpression &x) CExpression add_input(CRNNPointer prev, CExpression &x) CExpression set_h(CRNNPointer prev, vector[CExpression] ces) + CExpression set_s(CRNNPointer prev, vector[CExpression] ces) void rewind_one_step() CExpression back() vector[CExpression] final_h() diff --git a/python/dynet.pyx b/python/dynet.pyx index 13699347c..837b61781 100644 --- a/python/dynet.pyx +++ b/python/dynet.pyx @@ -931,6 +931,16 @@ cdef class _RNNBuilder: # {{{ ces.push_back(e.c()) return Expression.from_cexpr(self.cg_version, self.thisptr.set_h(prev, ces)) + cdef set_s(self, CRNNPointer prev, es=None): + if self.cg_version != _cg.version(): raise ValueError("Using stale builder. Create .new_graph() after computation graph is renewed.") + cdef vector[CExpression] ces = vector[CExpression]() + cdef Expression e + if es: + for e in es: + ensure_freshness(e) + ces.push_back(e.c()) + return Expression.from_cexpr(self.cg_version, self.thisptr.set_s(prev, ces)) + cdef rewind_one_step(self): if self.cg_version != _cg.version(): raise ValueError("Using stale builder. 
Create .new_graph() after computation graph is renewed.") self.thisptr.rewind_one_step() @@ -1158,6 +1168,11 @@ cdef class RNNState: # {{{ cdef int state_idx = self.builder.thisptr.state() return RNNState(self.builder, state_idx, self, res) + cpdef RNNState set_s(self, es=None): + cdef Expression res = self.builder.set_s(CRNNPointer(self.state_idx), es) + cdef int state_idx = self.builder.thisptr.state() + return RNNState(self.builder, state_idx, self, res) + cpdef RNNState add_input(self, Expression x): cdef Expression res = self.builder.add_input_to_prev(CRNNPointer(self.state_idx), x) cdef int state_idx = self.builder.thisptr.state() From 9ba2b49f1a70b09b3b5a94f1e310f4ddd0b0e426 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 10:16:54 -0400 Subject: [PATCH 3/5] Bug correction in set_h,set_s --- dynet/fast-lstm.cc | 30 ++++++++++++++++++++---------- dynet/gru.cc | 30 +++++++++++++++++++++--------- dynet/lstm.cc | 25 ++++++++++++++++++------- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index 4d53ad6f7..e4b6a83e9 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -94,29 +94,39 @@ void FastLSTMBuilder::start_new_sequence_impl(const vector& hinit) { } // TO DO - Make this correct -Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new) { +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? +Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - -Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/gru.cc b/dynet/gru.cc index c86a3f574..d3efed328 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -75,24 +75,36 @@ void GRUBuilder::start_new_sequence_impl(const std::vector& h_0) { } } -Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { +// TO DO - Make this correct +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? 
+Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - -Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } diff --git a/dynet/lstm.cc b/dynet/lstm.cc index 7b131db76..fe014ff2e 100644 --- a/dynet/lstm.cc +++ b/dynet/lstm.cc @@ -97,24 +97,35 @@ void LSTMBuilder::start_new_sequence_impl(const vector& hinit) { } // TO DO - Make this correct +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } From 406040db0b3260c2239f2269791939d08be6e51b Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 10:30:52 -0400 Subject: [PATCH 4/5] Bugfix --- dynet/fast-lstm.cc | 5 +++-- dynet/gru.cc | 50 +++++++++++++++++----------------------------- 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index e4b6a83e9..e9f693761 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -97,7 +97,7 @@ void FastLSTMBuilder::start_new_sequence_impl(const vector& hinit) { // Copied c from the previous step (otherwise c.size()< h.size()) // Also is creating a new step something we want? // wouldn't overwriting the current one be better? 
-Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { +Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); @@ -112,7 +112,7 @@ Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { } // Current implementation : s_new is either {new_c[0],...,new_c[n]} // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} -Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { +Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } bool only_c = s_new.size() == layers; const unsigned t = c.size(); @@ -127,6 +127,7 @@ Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_ne return h[t].back(); } + Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/gru.cc b/dynet/gru.cc index d3efed328..eff952c86 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -48,19 +48,19 @@ void GRUBuilder::new_graph_impl(ComputationGraph& cg) { auto& p = params[i]; // z - Expression x2z = parameter(cg,p[X2Z]); - Expression h2z = parameter(cg,p[H2Z]); - Expression bz = parameter(cg,p[BZ]); + Expression x2z = parameter(cg, p[X2Z]); + Expression h2z = parameter(cg, p[H2Z]); + Expression bz = parameter(cg, p[BZ]); // r - Expression x2r = parameter(cg,p[X2R]); - Expression h2r = parameter(cg,p[H2R]); - Expression br = parameter(cg,p[BR]); + Expression x2r = parameter(cg, p[X2R]); + Expression h2r = parameter(cg, p[H2R]); + Expression br = parameter(cg, p[BR]); // h - Expression x2h = parameter(cg,p[X2H]); - Expression h2h = parameter(cg,p[H2H]); - Expression bh = parameter(cg,p[BH]); + Expression x2h = parameter(cg, p[X2H]); + Expression h2h = parameter(cg, p[H2H]); + Expression bh = parameter(cg, p[BH]); vector vars = {x2z, h2z, bz, x2r, h2r, br, x2h, h2h, bh}; param_vars.push_back(vars); @@ -77,41 +77,27 @@ void GRUBuilder::start_new_sequence_impl(const std::vector& h_0) { // TO DO - Make this correct // Copied c from the previous step (otherwise c.size()< h.size()) -// Also is creating a new step something we want? +// Also is creating a new step something we want? // wouldn't overwriting the current one be better? -Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { +Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); - c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { Expression h_i = h_new[i]; - Expression c_i = c[t - 1][i]; h[t][i] = h_i; - c[t][i] = c_i; } return h[t].back(); } // Current implementation : s_new is either {new_c[0],...,new_c[n]} // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} -Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } - bool only_c = s_new.size() == layers; - const unsigned t = c.size(); - h.push_back(vector(layers)); - c.push_back(vector(layers)); - for (unsigned i = 0; i < layers; ++i) { - Expression h_i = only_c ? 
h[t - 1][i] : s_new[i + layers]; - Expression c_i = s_new[i]; - h[t][i] = h_i; - c[t][i] = c_i; - } - return h[t].back(); +Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { + return set_h_impl(prev, s_new); } Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { - //if(dropout_rate != 0.f) - //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); + //if(dropout_rate != 0.f) + //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); const bool has_initial_state = (h0.size() > 0); h.push_back(vector(layers)); vector& ht = h.back(); @@ -165,9 +151,9 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { void GRUBuilder::copy(const RNNBuilder & rnn) { const GRUBuilder & rnn_gru = (const GRUBuilder&)rnn; assert(params.size() == rnn_gru.params.size()); - for(size_t i = 0; i < params.size(); ++i) - for(size_t j = 0; j < params[i].size(); ++j) - params[i][j] = rnn_gru.params[i][j]; + for (size_t i = 0; i < params.size(); ++i) + for (size_t j = 0; j < params[i].size(); ++j) + params[i][j] = rnn_gru.params[i][j]; } } // namespace dynet From 1893191dfd36271dd73c9cdc2e154350d38f943a Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 13:57:11 -0400 Subject: [PATCH 5/5] First documentation for RNNbuilders --- doc/doxygen/Doxyfile | 2 +- doc/source/builders.rst | 4 +- dynet/expr.h | 2 +- dynet/rnn.h | 236 +++++++++++++++++++++++++++++++++++----- 4 files changed, 211 insertions(+), 33 deletions(-) diff --git a/doc/doxygen/Doxyfile b/doc/doxygen/Doxyfile index 526276130..1c1cf3b1a 100644 --- a/doc/doxygen/Doxyfile +++ b/doc/doxygen/Doxyfile @@ -758,7 +758,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT =../../dynet/expr.h ../../dynet/training.h +INPUT =../../dynet/expr.h ../../dynet/training.h ../../dynet/rnn.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/source/builders.rst b/doc/source/builders.rst index d6139ae1a..3ea25c8e2 100644 --- a/doc/source/builders.rst +++ b/doc/source/builders.rst @@ -4,4 +4,6 @@ DyNet Builders Builders combine together various operations to implement more complicated things such as recurrent and LSTM networks -TODO: Create documentation +.. doxygengroup:: rnnbuilders + :members: + :content-only: \ No newline at end of file diff --git a/dynet/expr.h b/dynet/expr.h index 8a155d5c8..b4672c315 100644 --- a/dynet/expr.h +++ b/dynet/expr.h @@ -3,7 +3,7 @@ * \defgroup operations * \brief The various operations that you can use in building a DyNet graph * - * TODO: Create documentation and explain expressions, etc... + * \details TODO: Create documentation and explain expressions, etc... */ #ifndef DYNET_EXPR_H diff --git a/dynet/rnn.h b/dynet/rnn.h index 09d11c3f6..d2d9497da 100644 --- a/dynet/rnn.h +++ b/dynet/rnn.h @@ -1,3 +1,11 @@ +/** + * \file rnn.h + * \defgroup rnnbuilders + * \brief Helper structures to build recurrent units + * + * \details TODO: Create documentation and explain rnns, etc... + */ + #ifndef DYNET_RNN_H_ #define DYNET_RNN_H_ @@ -18,24 +26,48 @@ inline void swap(RNNPointer& i1, RNNPointer& i2) { RNNPointer t = i1; i1 = i2; i2 = t; } -// interface for constructing an RNN, LSTM, GRU, etc. +/** + * \ingroup rnnbuilders + * \brief interface for constructing an RNN, LSTM, GRU, etc. 
+ * \details [long description] + */ struct RNNBuilder { + /** + * + * \brief Default constructor + */ RNNBuilder() : cur(-1) {} virtual ~RNNBuilder(); + /** + * + * \brief Get pointer to the current state + * + * \return Pointer to the current state + */ RNNPointer state() const { return cur; } - // call this to reset the builder when you are working with a newly - // created ComputationGraph object + /** + * + * \brief Initialize with new computation graph + * \details call this to reset the builder when you are working with a newly + * created ComputationGraph object + * + * \param cg Computation graph + */ void new_graph(ComputationGraph& cg) { sm.transition(RNNOp::new_graph); new_graph_impl(cg); } - // Reset for new sequence - // call this before add_input and after new_graph, - // when starting a new sequence on the same hypergraph. - // h_0 is used to initialize hidden layers at timestep 0 to given values + /** + * + * \brief Reset for new sequence + * \details call this before add_input and after new_graph, + * when starting a new sequence on the same hypergraph. + * + * \param h_0 `h_0` is used to initialize hidden layers at timestep 0 to given values + */ void start_new_sequence(const std::vector& h_0 = {}) { sm.transition(RNNOp::start_new_sequence); cur = RNNPointer(-1); @@ -43,7 +75,16 @@ struct RNNBuilder { start_new_sequence_impl(h_0); } - // explicitly set the output state of a node + // + /** + * + * \brief Explicitly set the output state of a node + * + * \param prev Pointer to the previous state + * \param h_new The new hidden state + * + * \return The hidden representation of the deepest layer + */ Expression set_h(const RNNPointer& prev, const std::vector& h_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); @@ -51,7 +92,19 @@ struct RNNBuilder { return set_h_impl(prev, h_new); } - // Set the internal state of a node (for lstms/grus) + // + /** + * + * \brief Set the internal state of a node (for lstms/grus) + * \details For RNNs without internal states (SimpleRNN, GRU...), + * this has the same behaviour as `set_h` + * + * \param prev Pointer to the previous state + * \param s_new The new state. Can be `{new_c[0],...,new_c[n]}` + * or `{new_c[0],...,new_c[n], new_h[0],...,new_h[n]}` + * + * \return The hidden representation of the deepest layer + */ Expression set_s(const RNNPointer& prev, const std::vector& s_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); @@ -59,8 +112,14 @@ struct RNNBuilder { return set_s_impl(prev, s_new); } - // add another timestep by reading in the variable x - // return the hidden representation of the deepest layer + /** + * + * \brief Add another timestep by reading in the variable x + * + * \param x Input variable + * + * \return The hidden representation of the deepest layer + */ Expression add_input(const Expression& x) { sm.transition(RNNOp::add_input); head.push_back(cur); @@ -69,9 +128,18 @@ struct RNNBuilder { return add_input_impl(rcp, x); } - // add another timestep, but define recurrent connection to prev - // rather than to head[cur] - // this can be used to construct trees, implement beam search, etc. + /** + * + * \brief Add another timestep, with arbitrary recurrent connection. + * \details This allows to define a recurrent connection to `prev` + * rather than to head[cur]. + * This can be used to construct trees, implement beam search, etc. 
+   *
+   * \param prev Pointer to the previous state
+   * \param x Input variable
+   *
+   * \return The hidden representation of the deepest layer
+   */
  Expression add_input(const RNNPointer& prev, const Expression& x) {
    sm.transition(RNNOp::add_input);
    head.push_back(prev);
    cur = head.size() - 1;
    return add_input_impl(prev, x);
  }

-  // rewind the last timestep - this DOES NOT remove the variables
-  // from the computation graph, it just means the next time step will
-  // see a different previous state. You can remind as many times as
-  // you want.
+  /**
+   *
+   * \brief Rewind the last timestep
+   * \details This DOES NOT remove the variables from the computation graph,
+   * it just means the next time step will see a different previous state.
+   * You can rewind as many times as you want.
+   */
  void rewind_one_step() { cur = head[cur]; }

-  // Set dropout. In general, you should disable dropout at test time
+  /**
+   *
+   * \brief Set Dropout
+   *
+   * \param d Dropout rate
+   */
  void set_dropout(float d) { dropout_rate = d; }
+  /**
+   *
+   * \brief Disable Dropout
+   * \details In general, you should disable dropout at test time
+   */
  void disable_dropout() { dropout_rate = 0; }

-  // returns node (index) of most recent output
+  /**
+   *
+   * \brief Returns node (index) of most recent output
+   *
+   * \return Node (index) of most recent output
+   */
  virtual Expression back() const = 0;

-  // access the final output of each hidden layer
+  /**
+   *
+   * \brief Access the final output of each hidden layer
+   *
+   * \return Final output of each hidden layer
+   */
  virtual std::vector<Expression> final_h() const = 0;
+  /**
+   *
+   * \brief Access the output of any hidden layer
+   *
+   * \param i Pointer to the step whose output you want to access
+   *
+   * \return Output of each hidden layer at the given step
+   */
  virtual std::vector<Expression> get_h(RNNPointer i) const = 0;

-  // access the state of each hidden layer, in a format that can be used in
-  // start_new_sequence
+
+  /**
+   *
+   * \brief Access the final state of each hidden layer
+   * \details This returns the state of each hidden layer,
+   * in a format that can be used in start_new_sequence
+   * (i.e. including any internal cell for LSTMs and the like)
+   *
+   * \return vector containing, if they exist, the list of final
+   * internal states, followed by the list of final outputs for
+   * each layer
+   */
  virtual std::vector<Expression> final_s() const = 0;
-  virtual unsigned num_h0_components() const = 0;
+  /**
+   *
+   * \brief Access the state of any hidden layer
+   * \details See `final_s` for details
+   *
+   * \param i Pointer to the step whose state you want to access
+   *
+   * \return Internal state of each hidden layer at the given step
+   */
  virtual std::vector<Expression> get_s(RNNPointer i) const = 0;
+
+  /**
+   *
+   * \brief Number of components in `h_0`
+   *
+   * \return Number of components in `h_0`
+   */
+  virtual unsigned num_h0_components() const = 0;

-  // copy the parameters of another builder
+  /**
+   *
+   * \brief Copy the parameters of another builder.
+   *
+   * \param params RNNBuilder you want to copy parameters from.
+   */
  virtual void copy(const RNNBuilder & params) = 0;

-  // the following functions save all the parameters associated with a particular
-  // RNNBuilder's derived class to a file. These should not be used to seralize
-  // models, they should only be used to load and save parameters for pretraining.
-  // If you are interested in serializing models, use the boost serialization
-  // API against your model class
+  /**
+   *
+   * \brief This function saves all the parameters associated with
+   * a particular RNNBuilder's derived class to a file.
+   * \details This should not be used to serialize models, it should
+   * only be used to save parameters for pretraining.
+   * If you are interested in serializing models, use the boost
+   * serialization API against your model class.
+   *
+   * \param fname File you want to save your model to.
+   */
  virtual void save_parameters_pretraining(const std::string& fname) const;
+  /**
+   *
+   * \brief Loads all the parameters associated with a particular RNNBuilder's
+   * derived class from a file.
+   * \details This should not be used to serialize models, it should
+   * only be used to load parameters from pretraining.
+   * If you are interested in serializing models, use the boost
+   * serialization API against your model class.
+   *
+   * \param fname File you want to read your model from.
+   */
  virtual void load_parameters_pretraining(const std::string& fname);
@@ -131,8 +278,25 @@ struct RNNBuilder {
  void serialize(Archive& ar, const unsigned int);
};

+/**
+ * \ingroup rnnbuilders
+ * \brief This provides a builder for the simplest RNN with tanh nonlinearity
+ * \details The equation for this RNN is:
+ * \f$h_t=\tanh(W_x x_t + W_h h_{t-1} + b)\f$
+ *
+ */
struct SimpleRNNBuilder : public RNNBuilder {
  SimpleRNNBuilder() = default;
+  /**
+   *
+   * \brief Builds a simple RNN
+   *
+   * \param layers Number of layers
+   * \param input_dim Dimension of the input
+   * \param hidden_dim Hidden layer (and output) size
+   * \param model Model holding the parameters
+   * \param support_lags Allow for auxiliary output?
+   */
  explicit SimpleRNNBuilder(unsigned layers,
                            unsigned input_dim,
                            unsigned hidden_dim,
                            Model* model,
                            bool support_lags = false);
@@ -147,6 +311,18 @@ struct SimpleRNNBuilder : public RNNBuilder {
  Expression set_s_impl(int prev, const std::vector<Expression>& s_new) {return set_h_impl(prev, s_new);}

public:
+  /**
+   *
+   * \brief Add auxiliary output
+   * \details Returns \f$h_t=\tanh(W_x x_t + W_h h_{t-1} + W_y y + b)\f$
+   * where \f$y\f$ is an auxiliary output
+   * TODO: clarify
+   *
+   * \param x Input expression
+   * \param aux Auxiliary output expression
+   *
+   * \return The hidden representation of the deepest layer
+   */
  Expression add_auxiliary_input(const Expression& x, const Expression &aux);

  Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); }
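For reference, a minimal usage sketch of the new set_s binding from the Python side; this is not part of the patches above, the builder choice, dimensions, and variable names are illustrative, and it assumes a dynet module built from this branch:

    import dynet as dy

    model = dy.Model()
    lstm = dy.LSTMBuilder(2, 10, 20, model)   # 2 layers, input dim 10, hidden dim 20

    dy.renew_cg()
    s = lstm.initial_state()
    s = s.add_input(dy.vecInput(10))          # run one ordinary timestep first

    # Overwrite the recurrent state before the next timestep. Per the comments
    # added in lstm.cc, s_new is either {c_0, ..., c_{n-1}} (cell states only,
    # outputs carried over) or {c_0, ..., c_{n-1}, h_0, ..., h_{n-1}} for n layers.
    new_c = [dy.vecInput(20) for _ in range(2)]
    s = s.set_s(new_c)
    y = s.output()

For GRUBuilder and SimpleRNNBuilder, which keep no separate internal cell state, set_s behaves the same as set_h, as noted in the rnn.h documentation above.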