Commit: updates on new model architectures

kaushaltrivedi committed Nov 28, 2019
1 parent 6063ae7 commit 12c0dbc
Showing 16 changed files with 1,238 additions and 337 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -102,3 +102,7 @@ venv.bak/
 
 # mypy
 .mypy_cache/
+
+.output
+cache/*
+cached*
3 changes: 2 additions & 1 deletion .vscode/settings.json
@@ -1,3 +1,4 @@
 {
-    "python.formatting.provider": "black"
+    "python.formatting.provider": "black",
+    "python.pythonPath": "/Users/kaushaltrivedi/anaconda3/envs/transformer/bin/python"
 }
4 changes: 2 additions & 2 deletions README.md
@@ -1,10 +1,10 @@
 # Fast-Bert
 
 [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE)
-[![PyPI version](https://badge.fury.io/py/fast-bert.svg)](https://badge.fury.io/py/fast-bert)
+[![PyPI version](https://badge.fury.io/py/fast-bert.svg)](https://badge.fury.io/py/fast-bert)
 ![Python 3.6, 3.7](https://img.shields.io/badge/python-3.6%20%7C%203.7-green.svg)
 
-**NEW Architectures**
+**New model architectures: ALBERT, CamemBERT, DistilRoberta**
 
 **DistilBERT (from HuggingFace), Smaller, faster, cheaper, lighter**
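As a usage note: a minimal sketch of selecting one of the newly announced architectures through fast-bert's BertDataBunch. The data paths, CSV layout, and the 'albert-base-v1' checkpoint name are placeholder assumptions; the keyword arguments follow the project README of this period.

# Hedged sketch, not the project's own example code.
from fast_bert.data_cls import BertDataBunch

databunch = BertDataBunch(
    './data/',                   # placeholder: folder containing train.csv / val.csv
    './labels/',                 # placeholder: folder containing labels.csv
    tokenizer='albert-base-v1',  # assumed pretrained vocab for the new model type
    train_file='train.csv',
    val_file='val.csv',
    label_file='labels.csv',
    text_col='text',
    label_col='label',
    batch_size_per_gpu=16,
    max_seq_length=128,
    multi_gpu=False,
    multi_label=False,
    model_type='albert')         # one of the newly registered model types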
44 changes: 22 additions & 22 deletions container/Dockerfile_gpu
@@ -11,18 +11,18 @@ RUN test $py_version || exit 1
 RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common && \
     add-apt-repository ppa:deadsnakes/ppa -y && \
     apt-get update && apt-get install -y --no-install-recommends \
-        build-essential \
-        curl \
-        jq \
-        git \
-        libsm6 \
-        libxext6 \
-        libxrender-dev \
-        nginx && \
+    build-essential \
+    curl \
+    jq \
+    git \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    nginx && \
     if [ $py_version -eq 3 ]; \
-        then apt-get install -y --no-install-recommends python3.7-dev \
-        && ln -s -f /usr/bin/python3.7 /usr/bin/python; \
-        else apt-get install -y --no-install-recommends python-dev; fi && \
+    then apt-get install -y --no-install-recommends python3.7-dev \
+    && ln -s -f /usr/bin/python3.7 /usr/bin/python; \
+    else apt-get install -y --no-install-recommends python-dev; fi && \
     rm -rf /var/lib/apt/lists/*
 
 #ENV CUDNN_VERSION 7.5.0.56
@@ -55,16 +55,16 @@ RUN nvcc --version
 RUN which nvcc
 
 RUN pip --no-cache-dir install \
-        flask \
-        pathlib \
-        gunicorn \
-        gevent \
-        scipy \
-        sklearn \
-        pandas \
-        fastprogress \
-        python-box \
-        tensorboardX
+    flask \
+    pathlib \
+    gunicorn \
+    gevent \
+    scipy \
+    sklearn \
+    pandas \
+    fastprogress \
+    python-box \
+    tensorboardX
 
 RUN ls
 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
@@ -79,7 +79,7 @@ WORKDIR /opt/ml/code

 RUN cd $WORKDIR
 
-RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased
+RUN python download_pretrained_models.py --location_dir ./pretrained_models/ --models bert-base-uncased roberta-base distilbert-base-uncased distilroberta-base
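For context, this step pre-fetches model weights at image-build time. A minimal sketch of what a script like download_pretrained_models.py might do, assuming the transformers Auto classes; the actual script's internals are not shown in this commit:

# Hedged sketch of a pretrained-model download script (assumed internals).
import argparse
from pathlib import Path

from transformers import AutoModel, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--location_dir", default="./pretrained_models/")
parser.add_argument("--models", nargs="+", required=True)
args = parser.parse_args()

for name in args.models:
    target = Path(args.location_dir) / name
    target.mkdir(parents=True, exist_ok=True)
    # Fetch weights and vocab once so the container can run offline later.
    AutoModel.from_pretrained(name).save_pretrained(str(target))
    AutoTokenizer.from_pretrained(name).save_pretrained(str(target))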



2 changes: 0 additions & 2 deletions container/bert/predictor.py
@@ -99,7 +99,6 @@ def ping():
     return flask.Response(response="\n", status=status, mimetype="application/json")
 
 
-
 # @app.route("/execution-parameters", method=["GET"])
 # def get_execution_parameters():
 #     params = {
@@ -112,7 +111,6 @@ def ping():
 #     )
 
 
-
 @app.route("/invocations", methods=["POST"])
 def transformation():
     """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert
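As a usage note: a hedged example of calling the /invocations endpoint with the CSV contract the docstring describes. The host, port, and file name are assumptions (SageMaker-style containers listen on 8080), and the content type the handler actually accepts depends on the rest of transformation(), which is truncated here:

# Hedged sketch: POST a CSV batch to the running inference container.
import requests

with open("input.csv", "rb") as f:  # placeholder input file
    response = requests.post(
        "http://localhost:8080/invocations",  # assumed host and port
        data=f,
        headers={"Content-Type": "text/csv"},
    )
print(response.text)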
14 changes: 9 additions & 5 deletions fast_bert/data_cls.py
@@ -17,14 +17,18 @@
     XLNetForSequenceClassification,
     XLNetTokenizer,
     RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
+    CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer,
+    AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer,
     DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
 
 MODEL_CLASSES = {
     'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
     'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
     'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
     'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
-    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
+    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
+    'camembert': (CamembertConfig, CamembertForSequenceClassification, CamembertTokenizer)
 }


@@ -372,7 +376,7 @@ def __init__(self, data_dir, label_dir, tokenizer, train_file='train.csv', val_f
             val_examples = processor.get_dev_examples(
                 val_file, text_col=text_col, label_col=label_col)
 
-            val_dataset = self.get_dataset_from_examples(val_examples, 'dev')
+            val_dataset = self.get_dataset_from_examples(val_examples, 'dev', no_cache=self.no_cache)
 
             # no grads necessary, hence double val batch size
             self.val_batch_size = self.batch_size_per_gpu * \
@@ -394,7 +398,7 @@ def __init__(self, data_dir, label_dir, tokenizer, train_file='train.csv', val_f
                 })
 
             test_dataset = self.get_dataset_from_examples(
-                test_examples, 'test', is_test=True)
+                test_examples, 'test', is_test=True, no_cache=self.no_cache)
 
             self.test_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu)
             test_sampler = SequentialSampler(test_dataset)
@@ -432,7 +436,7 @@ def get_dataset_from_examples(self, examples, set_type='train', is_test=False, n
         elif set_type == 'dev':
             file_name = self.val_file
         elif set_type == 'test':
-            file_name = self.test_data
+            file_name = 'test'  # test is not supposed to be a file - just a list of texts
 
         cached_features_file = os.path.join(self.cache_dir, 'cached_{}_{}_{}_{}_{}'.format(
             self.model_type,
@@ -464,7 +468,7 @@ def get_dataset_from_examples(self, examples, set_type='train', is_test=False, n
             logger=self.logger)
 
         # Create folder if it doesn't exist
-        if self.no_cache == False or no_cache == False:
+        if no_cache == False:
             self.cache_dir.mkdir(exist_ok=True)
             self.logger.info(
                 "Saving features into cached file %s", cached_features_file)
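For context, the MODEL_CLASSES registry maps a model_type string to its (config, model, tokenizer) class triple, so one key selects all three pieces of an architecture. A minimal sketch of that lookup pattern, using the transformers ALBERT classes imported above; the 'albert-base-v1' checkpoint name is an assumption:

# Hedged sketch of the registry lookup, not fast-bert's own code path.
from transformers import AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer

MODEL_CLASSES = {
    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
}

config_class, model_class, tokenizer_class = MODEL_CLASSES['albert']

# Build config, tokenizer, and model from one pretrained checkpoint name.
config = config_class.from_pretrained('albert-base-v1', num_labels=2)
tokenizer = tokenizer_class.from_pretrained('albert-base-v1')
model = model_class.from_pretrained('albert-base-v1', config=config)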