diff --git a/Jenkinsfile b/Jenkinsfile
index a53a71b0f6..6834843575 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -62,7 +62,7 @@ stage("Deploy") {
         export LD_LIBRARY_PATH=/usr/local/cuda/lib64
         make clean
         make release
-        make -C docs html"""
+        make -C docs html SPHINXOPTS=-W"""
 
         if (env.BRANCH_NAME.startsWith("PR-")) {
           sh """#!/bin/bash
diff --git a/Makefile b/Makefile
index 8398131a03..0d0d2055ab 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ ROOTDIR = $(CURDIR)
 pylint:
 	pylint --rcfile=$(ROOTDIR)/.pylintrc gluonnlp scripts/*/*.py
 
-docs:
+docs: release
 	make -C docs html
 
 clean:
diff --git a/docs/api/model.rst b/docs/api/model.rst
index f2ad4a2716..d4e035fb32 100644
--- a/docs/api/model.rst
+++ b/docs/api/model.rst
@@ -6,7 +6,6 @@ all requested pre-trained weights are downloaded from public repo and stored in
 
 .. currentmodule:: gluonnlp.model
 
-
 Language Modeling
 -----------------
 
diff --git a/docs/conf.py b/docs/conf.py
index d387f2c977..0157213b5a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -111,7 +111,7 @@
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**.ipynb_checkpoints']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
diff --git a/docs/index.rst b/docs/index.rst
index c249e05509..a808337d5f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -29,7 +29,7 @@
 GluonNLP relies on the recent version of MXNet. The easiest way to install
 MXNet is through `pip `_. The following
 command installs a nightly built CPU version of MXNet.
 
-.. code-block:: bash
+.. code-block:: console
 
    pip install --pre --upgrade mxnet
@@ -45,7 +45,7 @@ command installs a nightly built CPU version of MXNet.
 
 Then install the GluonNLP toolkit by
 
-.. code-block:: bash
+.. code-block:: console
 
    pip install gluonnlp
 
diff --git a/env/doc.yml b/env/doc.yml
index 5977f9f9fa..0794b2f07f 100644
--- a/env/doc.yml
+++ b/env/doc.yml
@@ -4,6 +4,7 @@ channels:
   - defaults
 dependencies:
   - python
+  - ipython
   - sphinx=1.7.2
   - sphinx-gallery
   - sphinx_rtd_theme
diff --git a/gluonnlp/data/word_embedding_evaluation.py b/gluonnlp/data/word_embedding_evaluation.py
index 852564d0f6..3535e6c7bb 100644
--- a/gluonnlp/data/word_embedding_evaluation.py
+++ b/gluonnlp/data/word_embedding_evaluation.py
@@ -410,6 +410,7 @@ class SimLex999(WordSimilarityEvaluationDataset):
     License: Unspecified
 
     The dataset contains
+
     - word1: The first concept in the pair.
     - word2: The second concept in the pair. Note that the order is only
       relevant to the column Assoc(USF). These values (free association scores)
diff --git a/gluonnlp/embedding/evaluation.py b/gluonnlp/embedding/evaluation.py
index af444467ea..bb4dddf98a 100644
--- a/gluonnlp/embedding/evaluation.py
+++ b/gluonnlp/embedding/evaluation.py
@@ -234,6 +234,7 @@ class ThreeCosMul(WordEmbeddingAnalogyFunction):
         \\arg\\max_{b^* ∈ V}\\frac{\\cos(b^∗, b) \\cos(b^*, a)}{cos(b^*, a^*) + ε}
 
     See the following paper for more details:
+
     - Levy, O., & Goldberg, Y. (2014). Linguistic regularities in sparse and
       explicit word representations. In R. Morante, & W. Yih, Proceedings of the
       Eighteenth Conference on Computational Natural Language Learning, CoNLL 2014,
diff --git a/gluonnlp/embedding/token_embedding.py b/gluonnlp/embedding/token_embedding.py
index 7d3d051c0f..98d77ebb99 100644
--- a/gluonnlp/embedding/token_embedding.py
+++ b/gluonnlp/embedding/token_embedding.py
@@ -132,7 +132,6 @@ def list_sources(embedding_name=None):
 class TokenEmbedding(object):
     """Token embedding base class.
 
-
     To load token embedding from an externally hosted pre-trained token embedding
     file, such as those of GloVe and FastText, use :func:`gluonnlp.embedding.create`.
     To get all the available `embedding_name` and `source`, use
@@ -150,14 +149,12 @@ class TokenEmbedding(object):
     If a token is encountered multiple times in the pre-trained token embedding
     file, only the first-encountered token embedding vector will be loaded and
     the rest will be skipped.
 
-
     Parameters
     ----------
     unknown_token : hashable object or None, default ''
         The representation for any unknown token. In other words, any unknown
         token will be indexed as the same representation.
 
-
     Properties
     ----------
     idx_to_token : list of strs
@@ -624,7 +621,6 @@ def deserialize(cls, file_path):
 class GloVe(TokenEmbedding):
     """The GloVe word embedding.
 
-
     GloVe is an unsupervised learning algorithm for obtaining vector representations
     for words. Training is performed on aggregated global word-word co-occurrence
     statistics from a corpus, and the resulting representations showcase interesting
     linear substructures of the word vector
@@ -647,7 +643,6 @@ class GloVe(TokenEmbedding):
 
     https://opendatacommons.org/licenses/pddl/
 
-
     Parameters
     ----------
     source : str, default 'glove.6B.50d'
@@ -657,7 +652,6 @@ class GloVe(TokenEmbedding):
         The source name of the pre-trained token embedding.
     init_unknown_vec : callback
         The callback used to initialize the embedding vector for the unknown token.
 
-
     Properties
     ----------
     idx_to_vec : mxnet.ndarray.NDArray
@@ -723,7 +717,6 @@ class FastText(TokenEmbedding):
 
     https://creativecommons.org/licenses/by-sa/3.0/
 
-
     Parameters
     ----------
     source : str, default 'glove.6B.50d'
@@ -733,7 +726,6 @@ class FastText(TokenEmbedding):
         The source name of the pre-trained token embedding.
     init_unknown_vec : callback
         The callback used to initialize the embedding vector for the unknown token.
 
-
     Properties
     ----------
     idx_to_vec : mxnet.ndarray.NDArray
diff --git a/gluonnlp/model/attention_cell.py b/gluonnlp/model/attention_cell.py
index 8342907e80..3cb8b7acba 100644
--- a/gluonnlp/model/attention_cell.py
+++ b/gluonnlp/model/attention_cell.py
@@ -401,7 +401,9 @@ def create_operator(self, ctx, in_shapes, in_dtypes):
 # pylint: enable=unused-argument
 
 class DotProductAttentionCell(AttentionCell):
-    r"""Dot product attention between the query and the key::
+    r"""Dot product attention between the query and the key.
+
+    Depending on parameters, defined as::
 
     units is None:
         score =
diff --git a/gluonnlp/vocab.py b/gluonnlp/vocab.py
index 5dd5f9ccba..5f8693e42a 100644
--- a/gluonnlp/vocab.py
+++ b/gluonnlp/vocab.py
@@ -38,7 +38,6 @@
 class Vocab(object):
     """Indexing and embedding attachment for text tokens.
 
-
     Parameters
     ----------
     counter : Counter or None, default None
@@ -72,7 +71,6 @@ class Vocab(object):
         and values of `reserved_tokens` must be of the same hashable type.
         Examples: str, int, and tuple.
 
-
     Properties
     ----------
     embedding : instance of :class:`gluonnlp.embedding.TokenEmbedding`
diff --git a/scripts/beam_search/beam_search_generator.rst b/scripts/beam_search/beam_search_generator.rst
index 189678f73a..d68cfee223 100644
--- a/scripts/beam_search/beam_search_generator.rst
+++ b/scripts/beam_search/beam_search_generator.rst
@@ -7,13 +7,13 @@ This script can be used to generate sentences using beam search from a pretraine
 
 Use the following command to generate the sentences
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python beam_search_generator.py --bos I love it --beam_size 5
 
 Output is
 
-.. code-block:: log
+.. code-block:: console
 
    Beam Seach Parameters: beam_size=5, alpha=0.0, K=5
    Generation Result:
@@ -23,13 +23,13 @@ Output is
 
 You can also try a larger beam size.
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python beam_search_generator.py --bos I love it --beam_size 10
 
 Output is
 
-.. code-block:: log
+.. code-block:: console
 
    Beam Seach Parameters: beam_size=10, alpha=0.0, K=5
    Generation Result:
@@ -40,13 +40,13 @@ Output is
 
 Try beam size equals to 15
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python beam_search_generator.py --bos I love it --beam_size 15
 
 Output is
 
-.. code-block:: log
+.. code-block:: console
 
    Beam Seach Parameters: beam_size=15, alpha=0.0, K=5
    Generation Result:
diff --git a/scripts/language_model/word_language_model.rst b/scripts/language_model/word_language_model.rst
index add6b74465..43205244e9 100644
--- a/scripts/language_model/word_language_model.rst
+++ b/scripts/language_model/word_language_model.rst
@@ -45,30 +45,30 @@ The dataset used for training the models is wikitext-2.
 
 [1] awd_lstm_lm_1150_wikitext-2 (Val PPL 73.32 Test PPL 69.74)
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_language_model.py --gpus 0 --tied --save awd_lstm_lm_1150_wikitext-2
 
 [2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.61 Test PPL 80.96)
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_language_model.py -gpus 0 --emsize 200 --nhid 600 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --save awd_lstm_lm_600_wikitext-2
 
 [3] standard_lstm_lm_1500_wikitext-2 (Val PPL 98.29 Test PPL 92.83)
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_language_model.py --gpus 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --save standard_lstm_lm_1500_wikitext-2
 
 [4] standard_lstm_lm_650_wikitext-2 (Val PPL 98.96 Test PPL 93.90)
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_language_model.py --gpus 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --save standard_lstm_lm_650_wikitext-2
 
 [5] standard_lstm_lm_200_wikitext-2 (Val PPL 108.25 Test PPL 102.26)
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_language_model.py --gpus 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --save standard_lstm_lm_200_wikitext-2
diff --git a/scripts/nmt/machine_translation.rst b/scripts/nmt/machine_translation.rst
index c3696904af..a4f5feda21 100644
--- a/scripts/nmt/machine_translation.rst
+++ b/scripts/nmt/machine_translation.rst
@@ -5,7 +5,7 @@ Machine Translation
 
 Use the following command to train the GNMT model on the IWSLT2015 dataset.
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python gnmt.py --src_lang en --tgt_lang vi --batch_size 64 \
                     --optimizer adam --lr 0.001 --lr_update_factor 0.5 --beam_size 10 \
diff --git a/scripts/sentiment_analysis/sentiment_analysis.rst b/scripts/sentiment_analysis/sentiment_analysis.rst
index 4ecc7b55f1..a5ce06dc6d 100644
--- a/scripts/sentiment_analysis/sentiment_analysis.rst
+++ b/scripts/sentiment_analysis/sentiment_analysis.rst
@@ -9,13 +9,13 @@ bucketing strategies to speed up training.
 
 Use the following command to run without using pretrained model
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python sentiment_analysis.py --gpu 0 --batch_size 16 --bucket_type fixed --epochs 3 --dropout 0 --no_pretrained --lr 0.005 --valid_ratio 0.1 --save-prefix imdb_lstm_200 # Test Accuracy 85.36
 
 Use the following command to run with pretrained model
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python sentiment_analysis.py --gpu 0 --batch_size 16 --bucket_type fixed --epochs 3 --dropout 0 --lr 0.005 --valid_ratio 0.1 --save-prefix imdb_lstm_200 # Test Accuracy 87.41
diff --git a/scripts/word_embedding_evaluation/extended_results.ipynb b/scripts/word_embedding_evaluation/extended_results.ipynb
index 2d9ba2bf7a..89f3465e33 100644
--- a/scripts/word_embedding_evaluation/extended_results.ipynb
+++ b/scripts/word_embedding_evaluation/extended_results.ipynb
@@ -7024,6 +7024,9 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.6.4"
+  },
+  "nbsphinx": {
+   "orphan": true
   }
  },
 "nbformat": 4,
diff --git a/scripts/word_embedding_evaluation/word_embedding_evaluation.rst b/scripts/word_embedding_evaluation/word_embedding_evaluation.rst
index 19a3eeac1e..50ce74c670 100644
--- a/scripts/word_embedding_evaluation/word_embedding_evaluation.rst
+++ b/scripts/word_embedding_evaluation/word_embedding_evaluation.rst
@@ -14,7 +14,7 @@ the respective datasets.
 
 We include a `run_all.sh` script to reproduce the results.
 
-.. code-block:: bash
+.. code-block:: console
 
    $ run_all.sh
 
@@ -23,7 +23,7 @@
 To evaluate a specific embedding on one or multiple datasets you can use the
 included `word_embedding_evaluation.py` as follows.
 
-.. code-block:: bash
+.. code-block:: console
 
    $ python word_embedding_evaluation.py
diff --git a/setup.py b/setup.py
index 3a1143fd8d..b36f2d1eb1 100644
--- a/setup.py
+++ b/setup.py
@@ -57,5 +57,12 @@ def find_version(*file_paths):
             'nltk',
             'scipy',
         ],
+        'dev': [
+            'pytest',
+            'recommonmark',
+            'sphinx-gallery',
+            'sphinx_rtd_theme',
+            'nbsphinx',
+        ],
     },
 )