Assemble BiDAF model and add training script #15

Status: Open. This pull request wants to merge 46 commits into base branch bidaf.

Commits
a751587 - Model is assembled, but TODOs need to be addressed (Aug 2, 2018)
32787aa - Commenting useless code (Aug 2, 2018)
23f1e9e - Fix epoch time display (Aug 2, 2018)
9c6c98e - Multigpu support + official evaluation (Aug 31, 2018)
00b051d - Add save params + support for uneven data splits (Sep 1, 2018)
29d8b1e - Showcase last batch fails to be processed (Sep 4, 2018)
51f7d2a - Use correct attention layer (Sep 4, 2018)
7cbd563 - Multiple bug fixes in Attention flow layer (Sep 6, 2018)
ca9c948 - kvstore set to local to prevent malloc exception (Sep 6, 2018)
1f39fb1 - Fix to get 1 epoch on 4 gpu ~1.8 hours (Sep 11, 2018)
fa24368 - Merge branch 'master' into bidaf_assembled (Sep 11, 2018)
8b14acd - Make evaluation faster (Sep 18, 2018)
f8368f6 - Update hyperparameters (Sep 20, 2018)
152bb4c - Float16 works, hybridization not (Sep 24, 2018)
5e4975b - Float16 + hybridize works. TODO: replace hard codes (Sep 28, 2018)
69ea71c - Hard code removed (Sep 28, 2018)
74707fc - Some useless code removed (Sep 28, 2018)
20ffcd9 - Merge https://github.com/dmlc/gluon-nlp into bidaf_assembled (Sep 28, 2018)
d35c802 - Bug fix in data preprocessing (Oct 6, 2018)
19c37d2 - EMA is added to code + loss function change (Oct 10, 2018)
60a4374 - EMA can be used for prediction (Oct 10, 2018)
ddaac06 - Caching of vocabs is added (Oct 10, 2018)
8379a2c - Making utils function support FP16 (Oct 10, 2018)
e163925 - Dev set also present in vocab (Oct 12, 2018)
e72eb64 - GlobalGradClip seems to work on 4 gpu, 15 items (Oct 12, 2018)
5742bea - Bug fixes. EM=39.8, F1=51.965 after 23 epochs (Oct 16, 2018)
bc6b8a7 - Evaluation changed and resume training added (Oct 16, 2018)
bf70d66 - Training resuming added (Oct 17, 2018)
dde6539 - Clean up code (Oct 19, 2018)
a1fcdcc - Parameters parsing fixes (Oct 20, 2018)
b3d95e6 - Early stop is added (Oct 22, 2018)
8998836 - Can log results without early stopping (Oct 22, 2018)
2528b8e - NLTK tokenizer is used to fix [citation needed] (Oct 24, 2018)
2a28253 - Bidaf similarity is used (Oct 25, 2018)
f62e1c5 - Add comments to code (Oct 25, 2018)
634b069 - Return static_alloc (Oct 25, 2018)
c4f4a3c - Multigpu and arbitrary batch support for evals (Oct 30, 2018)
6ee7732 - Add terminate training if need to reach F1 only (Nov 1, 2018)
11e66c8 - Code review comments addressed (Nov 7, 2018)
5bddd48 - FP16 removed and tests are fixed (Nov 7, 2018)
b0dbe34 - Merge branch 'bidaf' into bidaf_assembled (Ishitori, Nov 15, 2018)
1c15cca - Code review changes (Nov 15, 2018)
7303444 - Optimize imports (Nov 15, 2018)
1571548 - Only EMA or original params get saved (Nov 16, 2018)
cbbfcb3 - Get hybridization back (Nov 16, 2018)
355116c - Make pylint happy (Nov 16, 2018)
6 changes: 3 additions & 3 deletions gluonnlp/data/question_answering.py
@@ -80,7 +80,7 @@ def __init__(self, segment='train', root=os.path.join('~', '.mxnet', 'datasets',
self._segment = segment
self._get_data()

-        super(SQuAD, self).__init__(self._read_data())
+        super(SQuAD, self).__init__(SQuAD._get_records(self._read_data()))

def _get_data(self):
Owner: Should data always come from SQuAD._get_records?

Author: Yep. The official evaluation script needs the original JSON content. Instead of trying to figure out the path on the filesystem and loading the file manually, it is easier to have a separate method that reads the JSON from disk and parses it. Then I can pass the JSON as-is to the official evaluation script without needing to know where MXNet stores the file.
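A minimal sketch of the resulting split (method names follow the diff above; the evaluation call is illustrative, not code from this PR):

from gluonnlp.data import SQuAD

dataset = SQuAD(segment='dev')

# _read_data() now returns the raw parsed JSON, which is exactly the form
# the official SQuAD evaluation script expects ...
raw_json = dataset._read_data()

# ... while the static _get_records() flattens that JSON into per-question
# records, which is what the Dataset itself is built from in __init__.
records = SQuAD._get_records(raw_json)

# Illustrative usage: the official evaluate-v1.1.py defines
# evaluate(dataset, predictions), where `dataset` is raw_json['data'].
# score = evaluate(raw_json['data'], predictions)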

Owner: In MXNet, in general, we normally download the data into the MXNet root dir, then check whether the data is already there and load it from there if it exists; otherwise we download the data from S3.
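A generic sketch of that convention (the helper name, root path, and URL are placeholders, not this PR's code):

import os
from mxnet.gluon.utils import download

def fetch_dataset_file(root, url, file_name):
    # Load from the local MXNet data root if the file is already there;
    # otherwise download it (e.g. from S3) first.
    root = os.path.expanduser(root)
    path = os.path.join(root, file_name)
    if not os.path.exists(path):
        download(url, path=path)
    return path

# e.g. fetch_dataset_file('~/.mxnet/datasets/squad', url, 'dev-v1.1.json')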

"""Load data from the file. Does nothing if data was loaded before
@@ -116,9 +116,9 @@ def _read_data(self):
_, data_file_name, _ = self._data_file[self._segment]

with open(os.path.join(self._root, data_file_name)) as f:
-            samples = json.load(f)
+            json_data = json.load(f)

-        return SQuAD._get_records(samples)
+        return json_data

@staticmethod
def _get_records(json_dict):
70 changes: 70 additions & 0 deletions scripts/question_answering/attention_flow.py
@@ -0,0 +1,70 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Attention Flow Layer"""
from mxnet import gluon

from .similarity_function import DotProductSimilarity


class AttentionFlow(gluon.HybridBlock):
"""
This ``block`` takes two ndarrays as input and returns an ndarray of attentions.

We compute the similarity between each row in each matrix and return unnormalized similarity
scores. Because these scores are unnormalized, we don't take a mask as input; it's up to the
caller to deal with masking properly when this output is used.

By default similarity is computed with a dot product, but you can alternatively use a
parameterized similarity function if you wish.


Input:
- ndarray_1: ``(batch_size, num_rows_1, embedding_dim)``
- ndarray_2: ``(batch_size, num_rows_2, embedding_dim)``

Output:
- ``(batch_size, num_rows_1, num_rows_2)``

Parameters
----------
similarity_function: ``SimilarityFunction``, optional (default=``DotProductSimilarity``)
The similarity function to use when computing the attention.
"""
def __init__(self, similarity_function, batch_size, passage_length,
question_length, embedding_size, **kwargs):
super(AttentionFlow, self).__init__(**kwargs)

self._similarity_function = similarity_function or DotProductSimilarity()
self._batch_size = batch_size
self._passage_length = passage_length
self._question_length = question_length
self._embedding_size = embedding_size

def hybrid_forward(self, F, matrix_1, matrix_2):
# pylint: disable=arguments-differ
tiled_matrix_1 = matrix_1.expand_dims(2).broadcast_to(shape=(self._batch_size,
Owner: To make this hybridizable, we can use

tiled_matrix_1 = F.broadcast_to(F.expand_dims(matrix_1, 2), shape=(self._batch_size, self._passage_length, self._question_length, self._embedding_size))

Author: It is hybridizable even if we call expand_dims directly on matrix_1 (see the sketch after this file's diff).

Owner: Based on offline discussion, this sounds like a bug in MXNet; the operators should be automatically registered on both mxnet.sym and mxnet.sym.Symbol.

self._passage_length,
self._question_length,
self._embedding_size))
tiled_matrix_2 = matrix_2.expand_dims(1).broadcast_to(shape=(self._batch_size,
Owner: Same as above.

Author: This block is hybridizable even when calling matrix_2.expand_dims directly. I would leave it like that because it is more readable.

self._passage_length,
self._question_length,
self._embedding_size))
return self._similarity_function(tiled_matrix_1, tiled_matrix_2)
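A minimal standalone check of the point debated above, assuming an MXNet version where these operator methods are registered on both NDArray and Symbol (shapes are made up for illustration):

from mxnet import gluon, nd

class Tile(gluon.HybridBlock):
    def hybrid_forward(self, F, x):
        # Method-call style, as used in attention_flow.py above ...
        a = x.expand_dims(2).broadcast_to(shape=(2, 3, 4, 5))
        # ... and the equivalent F-style suggested in the review.
        b = F.broadcast_to(F.expand_dims(x, 2), shape=(2, 3, 4, 5))
        return a + b

block = Tile()
block.hybridize()
print(block(nd.ones((2, 3, 5))).shape)  # (2, 3, 4, 5) under both styles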
121 changes: 121 additions & 0 deletions scripts/question_answering/bidaf.py
@@ -0,0 +1,121 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""Bidirectional attention flow layer"""
from mxnet import gluon
import numpy as np
Owner: One question: why are we using numpy instead of ndarray here?

Author: It is only used to get the epsilon and the minimum value for different precisions, instead of hardcoding the numbers. Look at how _get_big_negative_value() and _get_small_positive_value() are implemented below. The rest of the code uses NDArray and is hybridizable.

Do we have this info available via the nd package?

Owner: Please see the comment on _get_big_negative_value.
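For reference, a short sketch of what numpy provides here: dtype limits queried instead of hardcoded magic constants. These are the values returned by the two helpers below:

import numpy as np

# Most negative representable float32, used to mask positions out before max():
big_negative = np.finfo(np.float32).min    # -3.4028235e+38

# float32 machine epsilon, used to keep softmax denominators away from zero:
small_positive = np.finfo(np.float32).eps  # 1.1920929e-07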


from .utils import last_dim_softmax, weighted_sum, replace_masked_values, masked_softmax


class BidirectionalAttentionFlow(gluon.HybridBlock):
"""
This class implements Minjoon Seo's `Bidirectional Attention Flow model
<https://www.semanticscholar.org/paper/Bidirectional-Attention-Flow-for-Machine-Seo-Kembhavi/7586b7cca1deba124af80609327395e613a20e9d>`_
for answering reading comprehension questions (ICLR 2017).
"""

def __init__(self,
batch_size,
passage_length,
question_length,
encoding_dim,
**kwargs):
super(BidirectionalAttentionFlow, self).__init__(**kwargs)

self._batch_size = batch_size
self._passage_length = passage_length
self._question_length = question_length
self._encoding_dim = encoding_dim

def _get_big_negative_value(self):
Owner: We might want to change this based on ndarray.

Author: Sorry, I didn't get it. Do you want to do this based on the dtype of the NDArray instead of using an __init__() parameter?

Owner: We can keep this as-is. Another way is to use plain Python: best_val = float('Inf').

"""Provides maximum negative Float32 value
Returns
-------
value : float32
Maximum negative float32 value
"""
return np.finfo(np.float32).min

def _get_small_positive_value(self):
Owner: We might want to change this based on ndarray.

"""Provides minimal possible Float32 value
Returns
-------
value : float32
Minimal float32 value
"""
return np.finfo(np.float32).eps

def hybrid_forward(self, F, passage_question_similarity,
encoded_passage, encoded_question, question_mask, passage_mask):
# pylint: disable=arguments-differ
# Shape: (batch_size, passage_length, question_length)
passage_question_similarity_shape = (self._batch_size, self._passage_length,
self._question_length)

question_mask_shape = (self._batch_size, self._question_length)
# Shape: (batch_size, passage_length, question_length)
passage_question_attention = last_dim_softmax(F,
Owner: There seems to be no need to pass F into last_dim_softmax; what is the reason for doing so?

Author: The problem is that the 'Symbol' object has no attribute 'broadcast_div'. You get this exception because the last step of masked_softmax() applies broadcast_div (line 286 of utils.py):

result = F.broadcast_div(result, (result.sum(axis=1, keepdims=True) + epsilon))

Since the Symbol object doesn't have it (while the mx.sym package does), I have to pass the current backend in. If we could figure out another way to do it, passing the backend would not be needed; the rest of the functions seem to be defined on the Symbol object. (See the sketch after this diff for a standalone illustration.)

passage_question_similarity,
question_mask,
passage_question_similarity_shape,
question_mask_shape,
epsilon=self._get_small_positive_value())
# Shape: (batch_size, passage_length, encoding_dim)
encoded_question_shape = (self._batch_size, self._question_length, self._encoding_dim)
passage_question_attention_shape = (self._batch_size, self._passage_length,
self._question_length)
passage_question_vectors = weighted_sum(F, encoded_question, passage_question_attention,
Owner: Same as the comment above: what is the reason for passing F into weighted_sum?

Author: Same reason as above, but in this case the problem is the batch_dot() method, which exists in mx.sym but not on the mx.sym.Symbol object.

encoded_question_shape,
passage_question_attention_shape)

# We replace masked values with something really negative here, so they don't affect the
# max below.
masked_similarity = passage_question_similarity if question_mask is None else \
replace_masked_values(F,
Owner: F?

Author: Same issue; here it is both broadcast_add and broadcast_mul.

passage_question_similarity,
question_mask.expand_dims(1),
replace_with=self._get_big_negative_value())

# Shape: (batch_size, passage_length)
question_passage_similarity = masked_similarity.max(axis=-1)

# Shape: (batch_size, passage_length)
question_passage_attention = masked_softmax(F, question_passage_similarity, passage_mask,
Owner: F?

Author: Same: broadcast_div.

epsilon=self._get_small_positive_value())

# Shape: (batch_size, encoding_dim)
encoded_passage_shape = (self._batch_size, self._passage_length, self._encoding_dim)
question_passage_attention_shape = (self._batch_size, self._passage_length)
question_passage_vector = weighted_sum(F, encoded_passage, question_passage_attention,
Owner: F?

Author: batch_dot.

encoded_passage_shape,
question_passage_attention_shape)

# Shape: (batch_size, 1, encoding_dim), broadcast over passage_length below
tiled_question_passage_vector = question_passage_vector.expand_dims(1)

# Shape: (batch_size, passage_length, encoding_dim * 4)
final_merged_passage = F.concat(encoded_passage,
passage_question_vectors,
encoded_passage * passage_question_vectors,
F.broadcast_mul(encoded_passage,
tiled_question_passage_vector),
dim=-1)

return final_merged_passage
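To make the recurring "F" discussion above concrete, here is a minimal standalone sketch, simplified relative to this PR's utils.py and assuming an MXNet 1.x-era API. Module-level operators such as broadcast_div exist on both mxnet.nd and mxnet.sym, while in the version discussed they were not all registered as methods on Symbol, so the helpers receive the backend explicitly:

import mxnet as mx

def masked_softmax(F, scores, mask, epsilon):
    # Softmax over the last axis that ignores masked-out positions.
    exp = F.exp(scores) * mask                        # zero out masked entries
    norm = exp.sum(axis=-1, keepdims=True) + epsilon  # keep denominator non-zero
    return F.broadcast_div(exp, norm)                 # defined on both backends

# Imperative backend: F = mx.nd
scores = mx.nd.array([[1.0, 2.0, 3.0]])
mask = mx.nd.array([[1.0, 1.0, 0.0]])
print(masked_softmax(mx.nd, scores, mask, 1e-7))

# Symbolic backend: the very same helper builds a graph with F = mx.sym
out = masked_softmax(mx.sym, mx.sym.Variable('scores'),
                     mx.sym.Variable('mask'), 1e-7)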