semi-final, 3 WERs missing
doetsch committed Apr 19, 2017
1 parent 51e412a commit 88a6fc4
Showing 6 changed files with 43,848 additions and 39 deletions.
29 changes: 13 additions & 16 deletions code/run.py
@@ -21,7 +21,7 @@
help='number of stacked RNN layers')
parser.add_argument('--num-hidden', type=int, default=512,
help='hidden layer size')
parser.add_argument('--bidirectional', type=bool, default=True,
parser.add_argument('--bidirectional', type=bool, default=False,
help='whether to use bidirectional layers')
parser.add_argument('--gpus', type=str,
help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. ' \
@@ -30,15 +30,15 @@
help='key-value store type')
parser.add_argument('--num-epochs', type=int, default=25,
help='max num of epochs')
parser.add_argument('--lr', type=float, default=0.1,
parser.add_argument('--lr', type=float, default=0.01,
help='initial learning rate')
parser.add_argument('--optimizer', type=str, default='sgd',
parser.add_argument('--optimizer', type=str, default='adam',
help='the optimizer type')
parser.add_argument('--mom', type=float, default=0.9,
help='momentum for sgd')
parser.add_argument('--wd', type=float, default=0.00001,
help='weight decay for sgd')
parser.add_argument('--batch-size', type=int, default=25,
parser.add_argument('--batch-size', type=int, default=10,
help='the batch size.')
parser.add_argument('--disp-batches', type=int, default=100,
help='show progress for every n batches')
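For reference, an invocation with the updated defaults could look roughly like this; the flag names follow the argparse definitions above, the entry point code/run.py is taken from the file header, and any dataset-specific arguments the script may additionally require are omitted:

    python code/run.py --num-hidden 512 --num-layers 3 --lr 0.01 --optimizer adam \
        --batch-size 10 --num-epochs 25 --gpus 0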
@@ -80,7 +80,11 @@ class UtteranceIter(DataIter):
def __init__(self, utterances, states, names, batch_size, sampling, data_name='data', label_name='labels', shuffle=True):
super(UtteranceIter, self).__init__()
if not sampling:
sampling = [i for i, j in enumerate([len(x) for x in utterances])] #[500::500]
minpad = 100
#sampling = [i for i, j in enumerate(np.bincount([len(s) for s in utterances]))]#[j for i, j in enumerate(set([len(x) for x in utterances]))] #[500::500]
sampling = range(minpad,max([len(s) for s in utterances]),minpad)
print sampling
#assert False

self.idx = []
if isinstance(sampling, list):
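The rewritten initializer no longer creates one bucket per observed utterance length; it places bucket boundaries every `minpad` frames instead. A minimal standalone sketch of that idea (example lengths; the rule that an utterance goes to the smallest bucket that fits it is an assumption about the elided part of the loop):

    # standalone sketch, not the module's exact code
    minpad = 100
    lengths = [137, 412, 258, 91, 503]                 # example utterance lengths in frames
    buckets = list(range(minpad, max(lengths) + minpad, minpad))   # [100, 200, ..., 600]
    # assumed rule: pad each utterance up to the smallest bucket that fits it
    assignment = {n: next(b for b in buckets if b >= n) for n in lengths}
    print(assignment)                                  # {137: 200, 412: 500, 258: 300, 91: 100, 503: 600}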
@@ -150,9 +154,9 @@ def next(self):
i, j = self.idx[self.curr_idx]

data = self.nddata[i][j:j + self.batch_size]
label = self.ndlabel[i][j:j + self.batch_size].T
data = ndarray.swapaxes(data, 1, 0) # TBD
#label = ndarray.swapaxes(label, 1, 0)
label = self.ndlabel[i][j:j + self.batch_size]
data = ndarray.swapaxes(data, 0, 1) # TBD
label = ndarray.swapaxes(label, 0, 1)

batch = DataBatch([data], [label], pad=0,
bucket_key=self.sampling[i],
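The revised `next()` slices data and labels batch-major and then swaps the first two axes, so batches leave the iterator time-major. A small NumPy illustration of the shape change (the assumption here is that the fused, cuDNN-backed RNN expects the time-major TNC layout):

    import numpy as np
    batch_size, seq_len, feat_dim = 10, 300, 40
    data = np.zeros((batch_size, seq_len, feat_dim))   # stored as (batch, time, features)
    data_tm = np.swapaxes(data, 0, 1)                  # delivered as (time, batch, features)
    print(data_tm.shape)                               # (300, 10, 40)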
@@ -170,7 +174,7 @@ def read_hdf5(filename, batching='default'):
xin = h5['inputs'][...]
yin = h5['targets/data']['classes'][...]
n_out = h5['targets/size'].attrs['classes']

utterances = []
states = []
offset = 0
@@ -254,13 +258,6 @@ def train(args):
cell = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout,
mode='lstm', bidirectional=args.bidirectional)

def sym_gen2(seq_len):
sym = lstm_unroll(1, seq_len, 16, num_hidden=128,
num_label=1501, num_hidden_proj=128)
data_names = ['data'] + state_names
label_names = ['softmax_label']
return (sym, data_names, label_names)

def sym_gen(seq_len):
data = mx.sym.Variable('data')
label = mx.sym.Variable('labels')
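For context, MXNet's bucketing machinery expects such a `sym_gen` to return a `(symbol, data_names, label_names)` triple for each bucket key; below is a rough sketch of how such a generator is consumed, with `buckets`, `contexts`, `data_train` and `data_val` as placeholder names rather than identifiers from this file:

    # hedged sketch; not the training loop used in run.py
    model = mx.mod.BucketingModule(
        sym_gen=sym_gen,                      # returns (symbol, data_names, label_names)
        default_bucket_key=max(buckets),      # network unrolled to the longest bucket
        context=contexts)
    model.fit(data_train,
              eval_data=data_val,
              optimizer=args.optimizer,
              optimizer_params={'learning_rate': args.lr, 'wd': args.wd},
              num_epoch=args.num_epochs)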
8 changes: 8 additions & 0 deletions paper/paper.bib
@@ -640,4 +640,12 @@ @InProceedings { menne16:chime4System
address= {San Francisco, CA, USA},
month= sep,
year= 2016,
}
@article{hochreiter1997lstm,
title={{Long Short-Term Memory}},
author={Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
journal={Neural Computation},
volume={9},
number={8},
pages={1735--1780},
year={1997},
publisher={MIT Press}
}
71 changes: 48 additions & 23 deletions paper/paper.tex
@@ -74,16 +74,15 @@
a bucketing approach to improve the parallelization capabilities of the recurrent training process,
we propose several simple ordering strategies that are able to compete both in training time and
recognition performance while being conceptually simpler. We compare our method with various other
batch construction strategies on the CHiME-4 noisy speech recognition corpus. The experiments were
done in MXNet and a demo implementation will be provided for free use.
batch construction strategies on the CHiME-4 noisy speech recognition corpus.
\end{abstract}
\noindent{\bf Index Terms}: MXNet, bucketing, batches, recurrent neural networks

\section{Introduction}
Neural network based acoustic modeling has become the de-facto standard in automatic speech recognition (ASR)
and related tasks. Modeling contextual information over long distances in the input signal has proven to
be of fundamental importance for optimal system performance. Modern acoustic models therefore use recurrent
neural networks (RNN) to model long temporal dependencies. In particular the long short-term memory (LSTM)
neural networks (RNN) to model long temporal dependencies. In particular the long short-term memory (LSTM)~\cite{hochreiter1997lstm}
has been shown to work very well on these tasks and most current state-of-the-art systems incorporate LSTMs
in their acoustic models. While it is common practice to train the
models on a frame-by-frame labeling obtained from a
@@ -118,8 +117,9 @@
produces batches with large variability of sequences while at the same time reducing irrelevant computation to a minimum.
In the following sections we give an overview of current batch construction strategies and compare them
w.r.t.~training time and variability. We will then derive our proposed method and discuss its properties on a theoretical
level, followed by an empirical evaluation on the CHiME-4 noisy speech recognition task. All experiments were implemented
within the MXNet framework and are available for download on our website.
level, followed by an empirical evaluation on the CHiME-4 noisy speech recognition task.
%All experiments were implemented
%within the MXNet framework and are available for download on our website.

\section{Related Work} \label{sec:related}
While mini-batch training was studied extensively for feed-forward networks \cite{Li16}, authors rarely reveal the batch construction strategy they used during training when
@@ -141,13 +141,22 @@
In each iteration of the mini-batch training a bucket is then selected randomly. Within the selected bucket a random span of sequences is chosen to be used as a data batch. Note that this random shuffling only ensures a large inter-batch variance w.r.t.~the sequence length, while the variance
within each batch is small.

Bucketing is especially suited if the RNN model itself is not able to handle arbitrary
Bucketing is especially useful if the RNN model itself does not support dynamic unrolling and
is not able to handle arbitrarily
long sequences but instead requires storing an unrolled version of the network for every possible
length. In those cases bucketing allows the framework to assign each batch to the shortest possible
unrolled network, while still optimizing the same shared weights.
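To make the bucketing strategy concrete, here is a minimal Python sketch of the selection process described above; bucket boundaries, lengths and batch size are made-up example values, and padding each chosen sequence up to the bucket length is implied rather than shown:

    import random

    def bucketed_batches(lengths, boundaries, batch_size):
        """Sketch of bucketed batch selection as described above (not framework code)."""
        buckets = {b: [] for b in boundaries}
        for i, n in enumerate(lengths):
            key = min(b for b in boundaries if b >= n)         # smallest bucket that fits
            buckets[key].append(i)
        while True:
            key = random.choice([b for b in boundaries if buckets[b]])   # random bucket
            pool = buckets[key]
            start = random.randrange(max(1, len(pool) - batch_size + 1)) # random span
            yield key, pool[start:start + batch_size]          # indices, padded up to `key` frames

    gen = bucketed_batches(lengths=[137, 412, 258, 91, 503, 144],
                           boundaries=[250, 500, 750], batch_size=2)
    print(next(gen))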

\section{Proposed Approach} \label{sec:approach}

\begin{figure}
\resizebox{0.45\textwidth}{!}{\includegraphics{plot}}
\caption{Resulting sequence ordering for different batch construction strategies. The graphs
show the length distribution for 1000 randomly selected samples of the CHiME-4 training set. The
sequence length is plotted on the vertical axis and the utterance order on the horizontal axis. Bucketing was done with a bucket size of 250 and in the proposed approach we used 12 bins.}
\label{fig:plot}
\end{figure}

In order to improve the intra-batch variability we propose a stochastic bucketing process. At the beginning
of each epoch the utterances are arranged randomly and then partitioned into bins of equal size. Each bin
is then sorted in alternating directions such that two consecutive bins are sorted in reverse order to each
@@ -163,15 +172,19 @@
\item in increasing order if $n$ is odd
\item in decreasing order if $n$ is even
\end{itemize}
\item partition into batches of desired size \\[1ex]
\item partition resulting order into batches of desired size \\[1ex]
\end{enumerate}
\end{minipage}
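A minimal Python sketch of the three steps above (sequence lengths stand in for the utterances; the number of bins and the batch size are example values):

    import random

    def alternating_sorted_batches(lengths, num_bins, batch_size):
        """Sketch of the proposed binning scheme (illustrative, not the released implementation)."""
        order = list(range(len(lengths)))
        random.shuffle(order)                                  # step 1: random arrangement
        bin_size = -(-len(order) // num_bins)                  # ceiling division
        bins = [order[i:i + bin_size] for i in range(0, len(order), bin_size)]
        for n, b in enumerate(bins):                           # step 2: alternate sort directions
            b.sort(key=lambda i: lengths[i], reverse=(n % 2 == 1))  # 0-indexed: even bins ascend
        flat = [i for b in bins for i in b]
        return [flat[i:i + batch_size] for i in range(0, len(flat), batch_size)]   # step 3

    lengths = [137, 412, 258, 91, 503, 144, 380, 222]
    print(alternating_sorted_batches(lengths, num_bins=2, batch_size=2))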

Due to the initial shuffling and subsequent partitioning the probability for two sequences of any length being
put into the same bin in $\frac{1}{N*(N-1)}$, so by increasing the number of bins, the variability within a
put into the same bin is $\frac{1}{N\cdot(N-1)}$, so by increasing the number of bins, the variability within a
partition decreases quadratically while the variability among different partitions increases. The alternating
sorting approach ensures that utterances at the boundaries of two consecutive bins are of similar length such
that the final partitioning into batches requires minimal zero padding.

Figure \ref{fig:plot} shows the length distribution for random and sorted sequence ordering as well as for bucketing in MXNet and the proposed approach. Note that in the case of bucketing, batches are put
together by first randomly choosing one of the buckets, so the ordering does not directly represent
the final set of batches.

\section{Experimental Setup} \label{sec:setup}
The 4th CHiME Speech Separation and Recognition Challenge
@@ -192,9 +205,12 @@
\section{Experiments} \label{sec:experiments}
In order to provide some perspective on the required context on the CHiME-4 task,
we provide results for experiments on sub-utterance (chunk) in
Table \ref{tab:chime:chunk}. In the first columns we evaluate different sub-sequence lengths, while measuring the utterances that were processed per second, the memory required and the word error rate (WER) on the evaluation set of the CHiME-4 database. We can observe that
Table \ref{tab:chime:chunk}. In the first column we list different sub-sequence lengths and measure, for each, the number of utterances processed per second, the required memory, and the word error rate (WER) on the evaluation set of the CHiME-4 database. We hereby constrained batches to
contain 5,000 frames in total, so that the number of updates is the same in all experiments. We can observe that
while large speed-ups can be obtained when training is done in this fashion, full-utterance context is
required for optimal performance.
required for optimal performance. However, it is worth noting that the memory requirement decreases
significantly when sub-utterance training is applied, and especially for unusually long utterances
sub-utterance training might be the only way to fit the data into GPU memory.

For training on full sequences we conducted experiments with different batch construction strategies.
The results are reported in Table \ref{tab:chime:batch}, where the first three rows show results for trivial sequence ordering
@@ -212,11 +228,11 @@
\hline
Chunk size & Utt./sec & Memory [GB] & WER [\%] \\
\hline
10 & & & \\
50 & & & \\
100 & & & \\
500 & & & \\
$\max$ & & & \\
10 & 36.7 & 1.6 & \\
50 & 31.1 & 1.6 & \\
100 & 29.6 & 1.6 & \\
500 & 17.3 & 1.6 & \\
$\max$ & 7.0 & 5.4 & 8.9 \\
\hline
\end{tabular}
\end{table}
@@ -230,19 +246,21 @@
\hline
Approach & Utt./sec & Memory [GB] & WER [\%] \\
\hline
Random & & & 6.8 \\
Sorted & & & 7.4 \\
Random & $\sim$ 7.0 & $\sim$ 5.4 & $\sim$ 8.9 \\
Sorted & 10.2 & 4.2 & 10.2 \\
\hline
Bucketing & & & 7.2 \\
Proposed (4 bins) & & & 7.0 \\
Proposed (64 bins) & & & 7.0 \\
Proposed (256 bins) & & & 6.9 \\
Bucketing & 9.5 & 6.3 & 9.6 \\
Proposed (8 bins) & 10.1 & 4.8 & 9.9 \\
Proposed (64 bins) & 10.0 & 5.3 & 9.5 \\
Proposed (256 bins) & 8.8 & 6.0 & 9.1 \\
\hline
\end{tabular}
\end{table}

As expected, sorting the entire training set by sequence length reduces the required time
per epoch to a minimum, while the best overall performance is obtained when sequences are shuffled randomly.
per epoch to a minimum, while the best overall performance is obtained when sequences are shuffled randomly. Both bucketing and the proposed
approach are in between. We can observe that our method is able to reach almost the same recognition performance as using a randomly shuffled sequence
ordering, while being almost as fast as the sorted sequence scheduler. This allows for a good trade-off between runtime and system performance.


\section{Conclusions}
@@ -252,7 +270,14 @@

\section{Acknowledgements}

apptek?
This work has received funding from the European Research Council
(ERC) under the European Union's Horizon 2020 research and innovation
program (SEQCLAS, grant agreement No 694537). The work reflects
only the authors' views and the European Research Council Executive
Agency is not responsible for any use that may be made of the
information it contains. We further want to thank Jahn Heymann, Lukas
Drude and Reinhold H\"ab-Umbach from University of Paderborn, Germany
for their CHiME-4 front-end which we used in this work.

%\newpage
\ninept
Binary file added paper/plot.pdf
Binary file not shown.