From a7cbe8beebda7f955995c3982135688107e2b945 Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 25 Oct 2016 17:06:27 -0400 Subject: [PATCH 1/5] Added set_s method for rnns --- dynet/fast-lstm.cc | 12 ++++++++++ dynet/fast-lstm.h | 1 + dynet/gru.cc | 19 +++++++++++++--- dynet/gru.h | 2 ++ dynet/lstm.cc | 57 +++++++++++++++++++++++++++------------------- dynet/lstm.h | 2 +- dynet/rnn.h | 43 +++++++++++++++++++++------------- 7 files changed, 93 insertions(+), 43 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index abc55e4a4..4d53ad6f7 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -105,6 +105,18 @@ Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new return h[t].back(); } +Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + + Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/fast-lstm.h b/dynet/fast-lstm.h index 891b46021..16733a9e9 100644 --- a/dynet/fast-lstm.h +++ b/dynet/fast-lstm.h @@ -43,6 +43,7 @@ struct FastLSTMBuilder : public RNNBuilder { void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) override; public: // first index is layer, then ... diff --git a/dynet/gru.cc b/dynet/gru.cc index 6f4ded7ca..c86a3f574 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -86,9 +86,20 @@ Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { return h[t].back(); } +Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { - if(dropout_rate != 0.f) - throw std::runtime_error("GRUBuilder doesn't support dropout yet"); + //if(dropout_rate != 0.f) + //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); const bool has_initial_state = (h0.size() > 0); h.push_back(vector(layers)); vector& ht = h.back(); @@ -101,6 +112,7 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { if (prev >= 0 || has_initial_state) { h_tprev = (prev < 0) ? 
h0[i] : h[prev][i]; } else { prev_zero = true; } + if (dropout_rate) in = dropout(in, dropout_rate); // update gate Expression zt; if (prev_zero) @@ -134,7 +146,8 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { in = ht[i] = crt + nwt; } } - return ht.back(); + if (dropout_rate) return dropout(ht.back(), dropout_rate); + else return ht.back(); } void GRUBuilder::copy(const RNNBuilder & rnn) { diff --git a/dynet/gru.h b/dynet/gru.h index 3d3e15012..8d907454a 100644 --- a/dynet/gru.h +++ b/dynet/gru.h @@ -22,11 +22,13 @@ struct GRUBuilder : public RNNBuilder { unsigned num_h0_components() const override { return layers; } void copy(const RNNBuilder & params) override; + protected: void new_graph_impl(ComputationGraph& cg) override; void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) override; // first index is layer, then ... std::vector> params; diff --git a/dynet/lstm.cc b/dynet/lstm.cc index a64ce8231..7b131db76 100644 --- a/dynet/lstm.cc +++ b/dynet/lstm.cc @@ -48,29 +48,29 @@ LSTMBuilder::LSTMBuilder(unsigned layers, vector ps = {p_x2i, p_h2i, p_c2i, p_bi, p_x2o, p_h2o, p_c2o, p_bo, p_x2c, p_h2c, p_bc}; params.push_back(ps); } // layers - dropout_rate = 0.f; + dropout_rate = 0.f; } -void LSTMBuilder::new_graph_impl(ComputationGraph& cg){ +void LSTMBuilder::new_graph_impl(ComputationGraph& cg) { param_vars.clear(); - for (unsigned i = 0; i < layers; ++i){ + for (unsigned i = 0; i < layers; ++i) { auto& p = params[i]; //i - Expression i_x2i = parameter(cg,p[X2I]); - Expression i_h2i = parameter(cg,p[H2I]); - Expression i_c2i = parameter(cg,p[C2I]); - Expression i_bi = parameter(cg,p[BI]); + Expression i_x2i = parameter(cg, p[X2I]); + Expression i_h2i = parameter(cg, p[H2I]); + Expression i_c2i = parameter(cg, p[C2I]); + Expression i_bi = parameter(cg, p[BI]); //o - Expression i_x2o = parameter(cg,p[X2O]); - Expression i_h2o = parameter(cg,p[H2O]); - Expression i_c2o = parameter(cg,p[C2O]); - Expression i_bo = parameter(cg,p[BO]); + Expression i_x2o = parameter(cg, p[X2O]); + Expression i_h2o = parameter(cg, p[H2O]); + Expression i_c2o = parameter(cg, p[C2O]); + Expression i_bo = parameter(cg, p[BO]); //c - Expression i_x2c = parameter(cg,p[X2C]); - Expression i_h2c = parameter(cg,p[H2C]); - Expression i_bc = parameter(cg,p[BC]); + Expression i_x2c = parameter(cg, p[X2C]); + Expression i_h2c = parameter(cg, p[H2C]); + Expression i_bc = parameter(cg, p[BC]); vector vars = {i_x2i, i_h2i, i_c2i, i_bi, i_x2o, i_h2o, i_c2o, i_bo, i_x2c, i_h2c, i_bc}; param_vars.push_back(vars); @@ -83,7 +83,7 @@ void LSTMBuilder::start_new_sequence_impl(const vector& hinit) { h.clear(); c.clear(); if (hinit.size() > 0) { - assert(layers*2 == hinit.size()); + assert(layers * 2 == hinit.size()); h0.resize(layers); c0.resize(layers); for (unsigned i = 0; i < layers; ++i) { @@ -108,6 +108,17 @@ Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { return h[t].back(); } +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers); } + const unsigned t = h.size(); + h.push_back(vector(layers)); + for (unsigned i = 0; i < layers; ++i) { + Expression y = s_new[i]; + h[t][i + layers] = y; + } + return h[t].back(); +} + Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { 
h.push_back(vector(layers)); c.push_back(vector(layers)); @@ -149,11 +160,11 @@ Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { Expression i_wt = tanh(i_awt); // output if (has_prev_state) { - Expression i_nwt = cwise_multiply(i_it,i_wt); - Expression i_crt = cwise_multiply(i_ft,i_c_tm1); + Expression i_nwt = cwise_multiply(i_it, i_wt); + Expression i_crt = cwise_multiply(i_ft, i_c_tm1); ct[i] = i_crt + i_nwt; } else { - ct[i] = cwise_multiply(i_it,i_wt); + ct[i] = cwise_multiply(i_it, i_wt); } Expression i_aot; @@ -163,18 +174,18 @@ Expression LSTMBuilder::add_input_impl(int prev, const Expression& x) { i_aot = affine_transform({vars[BO], vars[X2O], in, vars[C2O], ct[i]}); Expression i_ot = logistic(i_aot); Expression ph_t = tanh(ct[i]); - in = ht[i] = cwise_multiply(i_ot,ph_t); + in = ht[i] = cwise_multiply(i_ot, ph_t); } if (dropout_rate) return dropout(ht.back(), dropout_rate); - else return ht.back(); + else return ht.back(); } void LSTMBuilder::copy(const RNNBuilder & rnn) { const LSTMBuilder & rnn_lstm = (const LSTMBuilder&)rnn; assert(params.size() == rnn_lstm.params.size()); - for(size_t i = 0; i < params.size(); ++i) - for(size_t j = 0; j < params[i].size(); ++j) - params[i][j] = rnn_lstm.params[i][j]; + for (size_t i = 0; i < params.size(); ++i) + for (size_t j = 0; j < params[i].size(); ++j) + params[i][j] = rnn_lstm.params[i][j]; } void LSTMBuilder::save_parameters_pretraining(const string& fname) const { diff --git a/dynet/lstm.h b/dynet/lstm.h index 64a9c647c..ce92e153b 100644 --- a/dynet/lstm.h +++ b/dynet/lstm.h @@ -43,7 +43,7 @@ struct LSTMBuilder : public RNNBuilder { void start_new_sequence_impl(const std::vector& h0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; - + Expression set_s_impl(int prev, const std::vector& s_new) override; public: // first index is layer, then ... std::vector> params; diff --git a/dynet/rnn.h b/dynet/rnn.h index 2cfe31352..09d11c3f6 100644 --- a/dynet/rnn.h +++ b/dynet/rnn.h @@ -36,7 +36,7 @@ struct RNNBuilder { // call this before add_input and after new_graph, // when starting a new sequence on the same hypergraph. 
// h_0 is used to initialize hidden layers at timestep 0 to given values - void start_new_sequence(const std::vector& h_0={}) { + void start_new_sequence(const std::vector& h_0 = {}) { sm.transition(RNNOp::start_new_sequence); cur = RNNPointer(-1); head.clear(); @@ -44,13 +44,21 @@ struct RNNBuilder { } // explicitly set the output state of a node - Expression set_h(const RNNPointer& prev, const std::vector& h_new={}) { + Expression set_h(const RNNPointer& prev, const std::vector& h_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); cur = head.size() - 1; return set_h_impl(prev, h_new); } + // Set the internal state of a node (for lstms/grus) + Expression set_s(const RNNPointer& prev, const std::vector& s_new = {}) { + sm.transition(RNNOp::add_input); + head.push_back(prev); + cur = head.size() - 1; + return set_s_impl(prev, s_new); + } + // add another timestep by reading in the variable x // return the hidden representation of the deepest layer Expression add_input(const Expression& x) { @@ -104,14 +112,16 @@ struct RNNBuilder { virtual void save_parameters_pretraining(const std::string& fname) const; virtual void load_parameters_pretraining(const std::string& fname); - protected: + +protected: virtual void new_graph_impl(ComputationGraph& cg) = 0; virtual void start_new_sequence_impl(const std::vector& h_0) = 0; virtual Expression add_input_impl(int prev, const Expression& x) = 0; virtual Expression set_h_impl(int prev, const std::vector& h_new) = 0; + virtual Expression set_s_impl(int prev, const std::vector& c_new) = 0; RNNPointer cur; - float dropout_rate; - private: + float dropout_rate; +private: // the state machine ensures that the caller is behaving RNNStateMachine sm; std::vector head; // head[i] returns the head position @@ -127,15 +137,16 @@ struct SimpleRNNBuilder : public RNNBuilder { unsigned input_dim, unsigned hidden_dim, Model* model, - bool support_lags=false); + bool support_lags = false); - protected: +protected: void new_graph_impl(ComputationGraph& cg) override; void start_new_sequence_impl(const std::vector& h_0) override; Expression add_input_impl(int prev, const Expression& x) override; Expression set_h_impl(int prev, const std::vector& h_new) override; + Expression set_s_impl(int prev, const std::vector& s_new) {return set_h_impl(prev, s_new);} - public: +public: Expression add_auxiliary_input(const Expression& x, const Expression &aux); Expression back() const override { return (cur == -1 ? 
h0.back() : h[cur].back()); } @@ -151,7 +162,7 @@ struct SimpleRNNBuilder : public RNNBuilder { void save_parameters_pretraining(const std::string& fname) const override; void load_parameters_pretraining(const std::string& fname) override; - private: +private: // first index is layer, then x2h h2h hb std::vector> params; @@ -178,13 +189,13 @@ struct SimpleRNNBuilder : public RNNBuilder { namespace boost { - namespace serialization { - template - void serialize(Archive& ar, dynet::RNNPointer& p, const unsigned int version) - { - ar & p.t; - } - } // namespace serialization +namespace serialization { +template +void serialize(Archive& ar, dynet::RNNPointer& p, const unsigned int version) +{ + ar & p.t; +} +} // namespace serialization } // namespace boost BOOST_CLASS_EXPORT_KEY(dynet::RNNBuilder) From 729bdafc5f028e126c6c126e9fef80a5e0dc3708 Mon Sep 17 00:00:00 2001 From: paul Date: Tue, 25 Oct 2016 17:15:18 -0400 Subject: [PATCH 2/5] python bindings for set_s --- python/dynet.pxd | 1 + python/dynet.pyx | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/dynet.pxd b/python/dynet.pxd index bbb9b2d7c..06c44f2e3 100644 --- a/python/dynet.pxd +++ b/python/dynet.pxd @@ -247,6 +247,7 @@ cdef extern from "dynet/rnn.h" namespace "dynet": CExpression add_input(CExpression &x) CExpression add_input(CRNNPointer prev, CExpression &x) CExpression set_h(CRNNPointer prev, vector[CExpression] ces) + CExpression set_s(CRNNPointer prev, vector[CExpression] ces) void rewind_one_step() CExpression back() vector[CExpression] final_h() diff --git a/python/dynet.pyx b/python/dynet.pyx index 13699347c..837b61781 100644 --- a/python/dynet.pyx +++ b/python/dynet.pyx @@ -931,6 +931,16 @@ cdef class _RNNBuilder: # {{{ ces.push_back(e.c()) return Expression.from_cexpr(self.cg_version, self.thisptr.set_h(prev, ces)) + cdef set_s(self, CRNNPointer prev, es=None): + if self.cg_version != _cg.version(): raise ValueError("Using stale builder. Create .new_graph() after computation graph is renewed.") + cdef vector[CExpression] ces = vector[CExpression]() + cdef Expression e + if es: + for e in es: + ensure_freshness(e) + ces.push_back(e.c()) + return Expression.from_cexpr(self.cg_version, self.thisptr.set_s(prev, ces)) + cdef rewind_one_step(self): if self.cg_version != _cg.version(): raise ValueError("Using stale builder. 
Create .new_graph() after computation graph is renewed.") self.thisptr.rewind_one_step() @@ -1158,6 +1168,11 @@ cdef class RNNState: # {{{ cdef int state_idx = self.builder.thisptr.state() return RNNState(self.builder, state_idx, self, res) + cpdef RNNState set_s(self, es=None): + cdef Expression res = self.builder.set_s(CRNNPointer(self.state_idx), es) + cdef int state_idx = self.builder.thisptr.state() + return RNNState(self.builder, state_idx, self, res) + cpdef RNNState add_input(self, Expression x): cdef Expression res = self.builder.add_input_to_prev(CRNNPointer(self.state_idx), x) cdef int state_idx = self.builder.thisptr.state() From 9ba2b49f1a70b09b3b5a94f1e310f4ddd0b0e426 Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 10:16:54 -0400 Subject: [PATCH 3/5] Bug correction in set_h,set_s --- dynet/fast-lstm.cc | 30 ++++++++++++++++++++---------- dynet/gru.cc | 30 +++++++++++++++++++++--------- dynet/lstm.cc | 25 ++++++++++++++++++------- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index 4d53ad6f7..e4b6a83e9 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -94,29 +94,39 @@ void FastLSTMBuilder::start_new_sequence_impl(const vector& hinit) { } // TO DO - Make this correct -Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new) { +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? +Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - -Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/gru.cc b/dynet/gru.cc index c86a3f574..d3efed328 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -75,24 +75,36 @@ void GRUBuilder::start_new_sequence_impl(const std::vector& h_0) { } } -Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { +// TO DO - Make this correct +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? 
+Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - -Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} +Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } diff --git a/dynet/lstm.cc b/dynet/lstm.cc index 7b131db76..fe014ff2e 100644 --- a/dynet/lstm.cc +++ b/dynet/lstm.cc @@ -97,24 +97,35 @@ void LSTMBuilder::start_new_sequence_impl(const vector& hinit) { } // TO DO - Make this correct +// Copied c from the previous step (otherwise c.size()< h.size()) +// Also is creating a new step something we want? +// wouldn't overwriting the current one be better? Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = h_new[i]; - h[t][i] = y; + Expression h_i = h_new[i]; + Expression c_i = c[t - 1][i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } - +// Current implementation : s_new is either {new_c[0],...,new_c[n]} +// or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers); } - const unsigned t = h.size(); + if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } + bool only_c = s_new.size() == layers; + const unsigned t = c.size(); h.push_back(vector(layers)); + c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { - Expression y = s_new[i]; - h[t][i + layers] = y; + Expression h_i = only_c ? h[t - 1][i] : s_new[i + layers]; + Expression c_i = s_new[i]; + h[t][i] = h_i; + c[t][i] = c_i; } return h[t].back(); } From 406040db0b3260c2239f2269791939d08be6e51b Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 10:30:52 -0400 Subject: [PATCH 4/5] Bugfix --- dynet/fast-lstm.cc | 5 +++-- dynet/gru.cc | 50 +++++++++++++++++----------------------------- 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/dynet/fast-lstm.cc b/dynet/fast-lstm.cc index e4b6a83e9..e9f693761 100644 --- a/dynet/fast-lstm.cc +++ b/dynet/fast-lstm.cc @@ -97,7 +97,7 @@ void FastLSTMBuilder::start_new_sequence_impl(const vector& hinit) { // Copied c from the previous step (otherwise c.size()< h.size()) // Also is creating a new step something we want? // wouldn't overwriting the current one be better? 
-Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { +Expression FastLSTMBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); @@ -112,7 +112,7 @@ Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { } // Current implementation : s_new is either {new_c[0],...,new_c[n]} // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} -Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { +Expression FastLSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } bool only_c = s_new.size() == layers; const unsigned t = c.size(); @@ -127,6 +127,7 @@ Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_ne return h[t].back(); } + Expression FastLSTMBuilder::add_input_impl(int prev, const Expression& x) { h.push_back(vector(layers)); c.push_back(vector(layers)); diff --git a/dynet/gru.cc b/dynet/gru.cc index d3efed328..eff952c86 100644 --- a/dynet/gru.cc +++ b/dynet/gru.cc @@ -48,19 +48,19 @@ void GRUBuilder::new_graph_impl(ComputationGraph& cg) { auto& p = params[i]; // z - Expression x2z = parameter(cg,p[X2Z]); - Expression h2z = parameter(cg,p[H2Z]); - Expression bz = parameter(cg,p[BZ]); + Expression x2z = parameter(cg, p[X2Z]); + Expression h2z = parameter(cg, p[H2Z]); + Expression bz = parameter(cg, p[BZ]); // r - Expression x2r = parameter(cg,p[X2R]); - Expression h2r = parameter(cg,p[H2R]); - Expression br = parameter(cg,p[BR]); + Expression x2r = parameter(cg, p[X2R]); + Expression h2r = parameter(cg, p[H2R]); + Expression br = parameter(cg, p[BR]); // h - Expression x2h = parameter(cg,p[X2H]); - Expression h2h = parameter(cg,p[H2H]); - Expression bh = parameter(cg,p[BH]); + Expression x2h = parameter(cg, p[X2H]); + Expression h2h = parameter(cg, p[H2H]); + Expression bh = parameter(cg, p[BH]); vector vars = {x2z, h2z, bz, x2r, h2r, br, x2h, h2h, bh}; param_vars.push_back(vars); @@ -77,41 +77,27 @@ void GRUBuilder::start_new_sequence_impl(const std::vector& h_0) { // TO DO - Make this correct // Copied c from the previous step (otherwise c.size()< h.size()) -// Also is creating a new step something we want? +// Also is creating a new step something we want? // wouldn't overwriting the current one be better? -Expression LSTMBuilder::set_h_impl(int prev, const vector& h_new) { +Expression GRUBuilder::set_h_impl(int prev, const vector& h_new) { if (h_new.size()) { assert(h_new.size() == layers); } const unsigned t = h.size(); h.push_back(vector(layers)); - c.push_back(vector(layers)); for (unsigned i = 0; i < layers; ++i) { Expression h_i = h_new[i]; - Expression c_i = c[t - 1][i]; h[t][i] = h_i; - c[t][i] = c_i; } return h[t].back(); } // Current implementation : s_new is either {new_c[0],...,new_c[n]} // or {new_c[0],...,new_c[n],new_h[0],...,new_h[n]} -Expression LSTMBuilder::set_s_impl(int prev, const std::vector& s_new) { - if (s_new.size()) { assert(s_new.size() == layers || s_new.size() == 2 * layers ); } - bool only_c = s_new.size() == layers; - const unsigned t = c.size(); - h.push_back(vector(layers)); - c.push_back(vector(layers)); - for (unsigned i = 0; i < layers; ++i) { - Expression h_i = only_c ? 
h[t - 1][i] : s_new[i + layers]; - Expression c_i = s_new[i]; - h[t][i] = h_i; - c[t][i] = c_i; - } - return h[t].back(); +Expression GRUBuilder::set_s_impl(int prev, const std::vector& s_new) { + return set_h_impl(prev, s_new); } Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { - //if(dropout_rate != 0.f) - //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); + //if(dropout_rate != 0.f) + //throw std::runtime_error("GRUBuilder doesn't support dropout yet"); const bool has_initial_state = (h0.size() > 0); h.push_back(vector(layers)); vector& ht = h.back(); @@ -165,9 +151,9 @@ Expression GRUBuilder::add_input_impl(int prev, const Expression& x) { void GRUBuilder::copy(const RNNBuilder & rnn) { const GRUBuilder & rnn_gru = (const GRUBuilder&)rnn; assert(params.size() == rnn_gru.params.size()); - for(size_t i = 0; i < params.size(); ++i) - for(size_t j = 0; j < params[i].size(); ++j) - params[i][j] = rnn_gru.params[i][j]; + for (size_t i = 0; i < params.size(); ++i) + for (size_t j = 0; j < params[i].size(); ++j) + params[i][j] = rnn_gru.params[i][j]; } } // namespace dynet From 1893191dfd36271dd73c9cdc2e154350d38f943a Mon Sep 17 00:00:00 2001 From: paul Date: Wed, 26 Oct 2016 13:57:11 -0400 Subject: [PATCH 5/5] First documentation for RNNbuilders --- doc/doxygen/Doxyfile | 2 +- doc/source/builders.rst | 4 +- dynet/expr.h | 2 +- dynet/rnn.h | 236 +++++++++++++++++++++++++++++++++++----- 4 files changed, 211 insertions(+), 33 deletions(-) diff --git a/doc/doxygen/Doxyfile b/doc/doxygen/Doxyfile index 526276130..1c1cf3b1a 100644 --- a/doc/doxygen/Doxyfile +++ b/doc/doxygen/Doxyfile @@ -758,7 +758,7 @@ WARN_LOGFILE = # spaces. # Note: If this tag is empty the current directory is searched. -INPUT =../../dynet/expr.h ../../dynet/training.h +INPUT =../../dynet/expr.h ../../dynet/training.h ../../dynet/rnn.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/doc/source/builders.rst b/doc/source/builders.rst index d6139ae1a..3ea25c8e2 100644 --- a/doc/source/builders.rst +++ b/doc/source/builders.rst @@ -4,4 +4,6 @@ DyNet Builders Builders combine together various operations to implement more complicated things such as recurrent and LSTM networks -TODO: Create documentation +.. doxygengroup:: rnnbuilders + :members: + :content-only: \ No newline at end of file diff --git a/dynet/expr.h b/dynet/expr.h index 8a155d5c8..b4672c315 100644 --- a/dynet/expr.h +++ b/dynet/expr.h @@ -3,7 +3,7 @@ * \defgroup operations * \brief The various operations that you can use in building a DyNet graph * - * TODO: Create documentation and explain expressions, etc... + * \details TODO: Create documentation and explain expressions, etc... */ #ifndef DYNET_EXPR_H diff --git a/dynet/rnn.h b/dynet/rnn.h index 09d11c3f6..d2d9497da 100644 --- a/dynet/rnn.h +++ b/dynet/rnn.h @@ -1,3 +1,11 @@ +/** + * \file rnn.h + * \defgroup rnnbuilders + * \brief Helper structures to build recurrent units + * + * \details TODO: Create documentation and explain rnns, etc... + */ + #ifndef DYNET_RNN_H_ #define DYNET_RNN_H_ @@ -18,24 +26,48 @@ inline void swap(RNNPointer& i1, RNNPointer& i2) { RNNPointer t = i1; i1 = i2; i2 = t; } -// interface for constructing an RNN, LSTM, GRU, etc. +/** + * \ingroup rnnbuilders + * \brief interface for constructing an RNN, LSTM, GRU, etc. 
+ * \details [long description] + */ struct RNNBuilder { + /** + * + * \brief Default constructor + */ RNNBuilder() : cur(-1) {} virtual ~RNNBuilder(); + /** + * + * \brief Get pointer to the current state + * + * \return Pointer to the current state + */ RNNPointer state() const { return cur; } - // call this to reset the builder when you are working with a newly - // created ComputationGraph object + /** + * + * \brief Initialize with new computation graph + * \details call this to reset the builder when you are working with a newly + * created ComputationGraph object + * + * \param cg Computation graph + */ void new_graph(ComputationGraph& cg) { sm.transition(RNNOp::new_graph); new_graph_impl(cg); } - // Reset for new sequence - // call this before add_input and after new_graph, - // when starting a new sequence on the same hypergraph. - // h_0 is used to initialize hidden layers at timestep 0 to given values + /** + * + * \brief Reset for new sequence + * \details call this before add_input and after new_graph, + * when starting a new sequence on the same hypergraph. + * + * \param h_0 `h_0` is used to initialize hidden layers at timestep 0 to given values + */ void start_new_sequence(const std::vector& h_0 = {}) { sm.transition(RNNOp::start_new_sequence); cur = RNNPointer(-1); @@ -43,7 +75,16 @@ struct RNNBuilder { start_new_sequence_impl(h_0); } - // explicitly set the output state of a node + // + /** + * + * \brief Explicitly set the output state of a node + * + * \param prev Pointer to the previous state + * \param h_new The new hidden state + * + * \return The hidden representation of the deepest layer + */ Expression set_h(const RNNPointer& prev, const std::vector& h_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); @@ -51,7 +92,19 @@ struct RNNBuilder { return set_h_impl(prev, h_new); } - // Set the internal state of a node (for lstms/grus) + // + /** + * + * \brief Set the internal state of a node (for lstms/grus) + * \details For RNNs without internal states (SimpleRNN, GRU...), + * this has the same behaviour as `set_h` + * + * \param prev Pointer to the previous state + * \param s_new The new state. Can be `{new_c[0],...,new_c[n]}` + * or `{new_c[0],...,new_c[n], new_h[0],...,new_h[n]}` + * + * \return The hidden representation of the deepest layer + */ Expression set_s(const RNNPointer& prev, const std::vector& s_new = {}) { sm.transition(RNNOp::add_input); head.push_back(prev); @@ -59,8 +112,14 @@ struct RNNBuilder { return set_s_impl(prev, s_new); } - // add another timestep by reading in the variable x - // return the hidden representation of the deepest layer + /** + * + * \brief Add another timestep by reading in the variable x + * + * \param x Input variable + * + * \return The hidden representation of the deepest layer + */ Expression add_input(const Expression& x) { sm.transition(RNNOp::add_input); head.push_back(cur); @@ -69,9 +128,18 @@ struct RNNBuilder { return add_input_impl(rcp, x); } - // add another timestep, but define recurrent connection to prev - // rather than to head[cur] - // this can be used to construct trees, implement beam search, etc. + /** + * + * \brief Add another timestep, with arbitrary recurrent connection. + * \details This allows to define a recurrent connection to `prev` + * rather than to head[cur]. + * This can be used to construct trees, implement beam search, etc. 
+   *
+   * \param prev Pointer to the previous state
+   * \param x Input variable
+   *
+   * \return The hidden representation of the deepest layer
+   */
  Expression add_input(const RNNPointer& prev, const Expression& x) {
    sm.transition(RNNOp::add_input);
    head.push_back(prev);
    cur = head.size() - 1;
    return add_input_impl(prev, x);
  }

-  // rewind the last timestep - this DOES NOT remove the variables
-  // from the computation graph, it just means the next time step will
-  // see a different previous state. You can remind as many times as
-  // you want.
+  /**
+   *
+   * \brief Rewind the last timestep
+   * \details This DOES NOT remove the variables from the computation graph,
+   * it just means the next time step will see a different previous state.
+   * You can rewind as many times as you want.
+   */
  void rewind_one_step() { cur = head[cur]; }

-  // Set dropout. In general, you should disable dropout at test time
+  /**
+   *
+   * \brief Set Dropout
+   *
+   * \param d Dropout rate
+   */
  void set_dropout(float d) { dropout_rate = d; }
+  /**
+   *
+   * \brief Disable Dropout
+   * \details In general, you should disable dropout at test time
+   */
  void disable_dropout() { dropout_rate = 0; }

-  // returns node (index) of most recent output
+  /**
+   *
+   * \brief Returns node (index) of most recent output
+   *
+   * \return Node (index) of most recent output
+   */
  virtual Expression back() const = 0;

-  // access the final output of each hidden layer
+  /**
+   *
+   * \brief Access the final output of each hidden layer
+   *
+   * \return Final output of each hidden layer
+   */
  virtual std::vector<Expression> final_h() const = 0;
+  /**
+   *
+   * \brief Access the output of any hidden layer
+   *
+   * \param i Pointer to the step whose output you want to access
+   *
+   * \return Output of each hidden layer at the given step
+   */
  virtual std::vector<Expression> get_h(RNNPointer i) const = 0;

-  // access the state of each hidden layer, in a format that can be used in
-  // start_new_sequence
+
+  /**
+   *
+   * \brief Access the final state of each hidden layer
+   * \details This returns the state of each hidden layer,
+   * in a format that can be used in start_new_sequence
+   * (i.e. including any internal cell for LSTMs and the like)
+   *
+   * \return vector containing, if they exist, the list of final
+   * internal states, followed by the list of final outputs for
+   * each layer
+   */
  virtual std::vector<Expression> final_s() const = 0;
-  virtual unsigned num_h0_components() const = 0;
+  /**
+   *
+   * \brief Access the state of any hidden layer
+   * \details See `final_s` for details
+   *
+   * \param i Pointer to the step whose state you want to access
+   *
+   * \return Internal state of each hidden layer at the given step
+   */
  virtual std::vector<Expression> get_s(RNNPointer i) const = 0;
+
+  /**
+   *
+   * \brief Number of components in `h_0`
+   *
+   * \return Number of components in `h_0`
+   */
+  virtual unsigned num_h0_components() const = 0;

-  // copy the parameters of another builder
+  /**
+   *
+   * \brief Copy the parameters of another builder.
+   *
+   * \param params RNNBuilder you want to copy parameters from.
+   */
  virtual void copy(const RNNBuilder & params) = 0;

-  // the following functions save all the parameters associated with a particular
-  // RNNBuilder's derived class to a file. These should not be used to seralize
-  // models, they should only be used to load and save parameters for pretraining.
-  // If you are interested in serializing models, use the boost serialization
-  // API against your model class
+  /**
+   *
+   * \brief This function saves all the parameters associated with
+   * a particular RNNBuilder's derived class to a file.
+   * \details This should not be used to serialize models, it should
+   * only be used to save parameters for pretraining.
+   * If you are interested in serializing models, use the boost
+   * serialization API against your model class.
+   *
+   * \param fname File you want to save your model to.
+   */
  virtual void save_parameters_pretraining(const std::string& fname) const;
+  /**
+   *
+   * \brief Loads all the parameters associated with a particular RNNBuilder's
+   * derived class from a file.
+   * \details This should not be used to serialize models, it should
+   * only be used to load parameters from pretraining.
+   * If you are interested in serializing models, use the boost
+   * serialization API against your model class.
+   *
+   * \param fname File you want to read your model from.
+   */
  virtual void load_parameters_pretraining(const std::string& fname);
@@ -131,8 +278,25 @@ struct RNNBuilder {
  void serialize(Archive& ar, const unsigned int);
};

+/**
+ * \ingroup rnnbuilders
+ * \brief This provides a builder for the simplest RNN with tanh nonlinearity
+ * \details The equation for this RNN is:
+ * \f$h_t=\tanh(W_x x_t + W_h h_{t-1} + b)\f$
+ *
+ */
struct SimpleRNNBuilder : public RNNBuilder {
  SimpleRNNBuilder() = default;
+  /**
+   *
+   * \brief Builds a simple RNN
+   *
+   * \param layers Number of layers
+   * \param input_dim Dimension of the input
+   * \param hidden_dim Hidden layer (and output) size
+   * \param model Model holding the parameters
+   * \param support_lags Allow for auxiliary output?
+   */
  explicit SimpleRNNBuilder(unsigned layers,
                            unsigned input_dim,
                            unsigned hidden_dim,
                            Model* model,
                            bool support_lags = false);
@@ -147,6 +311,18 @@ struct SimpleRNNBuilder : public RNNBuilder {
  Expression set_s_impl(int prev, const std::vector<Expression>& s_new) {return set_h_impl(prev, s_new);}

public:
+  /**
+   *
+   * \brief Add auxiliary output
+   * \details Returns \f$h_t=\tanh(W_x x_t + W_h h_{t-1} + W_y y + b)\f$
+   * where \f$y\f$ is an auxiliary output
+   * TODO: clarify
+   *
+   * \param x Input expression
+   * \param aux Auxiliary output expression
+   *
+   * \return The hidden representation of the deepest layer
+   */
  Expression add_auxiliary_input(const Expression& x, const Expression &aux);

  Expression back() const override { return (cur == -1 ? h0.back() : h[cur].back()); }
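For reference, a minimal usage sketch of the new set_s binding from the Python side; this is not part of the patches above, the builder choice, dimensions, and variable names are illustrative, and it assumes a dynet module built from this branch:

    import dynet as dy

    model = dy.Model()
    lstm = dy.LSTMBuilder(2, 10, 20, model)   # 2 layers, input dim 10, hidden dim 20

    dy.renew_cg()
    s = lstm.initial_state()
    s = s.add_input(dy.vecInput(10))          # run one ordinary timestep first

    # Overwrite the recurrent state before the next timestep. Per the comments
    # added in lstm.cc, s_new is either {c_0, ..., c_{n-1}} (cell states only,
    # outputs carried over) or {c_0, ..., c_{n-1}, h_0, ..., h_{n-1}} for n layers.
    new_c = [dy.vecInput(20) for _ in range(2)]
    s = s.set_s(new_c)
    y = s.output()

For GRUBuilder and SimpleRNNBuilder, which keep no separate internal cell state, set_s behaves the same as set_h, as noted in the rnn.h documentation above.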