Merge branch 'master' into bug-examples-bcewithlogits

Lightning-AI · Apr 27, 2021 · 9df85f7 · 9df85f7
2 parents bed700b + 6be0a85
commit 9df85f7
Show file tree

Hide file tree

Showing 61 changed files with 759 additions and 538 deletions.
diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
@@ -83,10 +83,6 @@ jobs:
             req = open(fname).read().replace('>=', '==')
             open(fname, 'w').write(req)
 
-        # remove Fairscale from requirements
-        fname = 'requirements/extra.txt'
-        lines = [line for line in open(fname).readlines() if 'fairscale' not in line]
-        open(fname, 'w').writelines(lines)
       shell: python
 
     # Note: This uses an internal pip API and may not always work

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942))
 
 
+- Added utils for metrics to scalar conversions ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180))
+
+
 - Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/))
 
 
@@ -105,6 +108,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Added `max_time` Trainer argument to limit training time ([#6823](https://github.com/PyTorchLightning/pytorch-lightning/pull/6823))
 
 
+- Added `on_predict_{batch,epoch}_{start,end}` hooks ([#7141](https://github.com/PyTorchLightning/pytorch-lightning/pull/7141))
+
+
 - Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868))
 
 
@@ -143,6 +149,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Deprecated
 
+- Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066))
+
+
+- Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180))
+
+
 - Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834/))
 
 
@@ -178,6 +190,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Removed
 
+- Removed `automatic_optimization` as a property from the training loop in favor of `LightningModule.automatic_optimization` ([#7130](https://github.com/PyTorchLightning/pytorch-lightning/pull/7130))
+
+
 - Removed evaluation loop legacy returns for `*_epoch_end` hooks ([#6973](https://github.com/PyTorchLightning/pytorch-lightning/pull/6973))
 
 
@@ -218,6 +233,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 
+- Added a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814))
+
+
 - Fixed multi-node DDP sub-process launch by using `local_rank` instead of `global_rank` for main process assertion ([#7061](https://github.com/PyTorchLightning/pytorch-lightning/pull/7061))
 
 
@@ -301,6 +319,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed process rank not being available right away after `Trainer` instantiation ([#6941](https://github.com/PyTorchLightning/pytorch-lightning/pull/6941))
 
 
+- Fixed `lr_find` trying beyond `num_training` steps and suggesting a too high learning rate ([#7076](https://github.com/PyTorchLightning/pytorch-lightning/pull/7076))
+
+
 - Fixed logger creating incorrect version folder in DDP with repeated `Trainer.fit` calls ([#7077](https://github.com/PyTorchLightning/pytorch-lightning/pull/7077))
 
 
@@ -313,6 +334,15 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `CombinedLoader` in distributed settings for validation / testing ([#7102](https://github.com/PyTorchLightning/pytorch-lightning/pull/7102))
 
 
+- Fixed the save_dir in `WandbLogger` when the run was initiated externally ([#7106](https://github.com/PyTorchLightning/pytorch-lightning/pull/7106))
+
+
+- Fixed parsing for pre-release package versions ([#6999](https://github.com/PyTorchLightning/pytorch-lightning/pull/6999))
+
+
+- Fixed resetting device after `fitting/evaluating/predicting` ([#7188](https://github.com/PyTorchLightning/pytorch-lightning/pull/7188))
+
+
 ## [1.2.7] - 2021-04-06
 
 ### Fixed

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -36,7 +36,7 @@ jobs:
     #container: "pytorchlightning/pytorch_lightning:base-cuda-py$[ variables['python.version'] ]-torch1.6"
     container:
       # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.8-torch1.6"
       #endpoint: azureContainerRegistryConnection
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all"
 
@@ -59,9 +59,9 @@ jobs:
         #sudo apt-get install -y cmake
         # python -m pip install "pip==20.1"
         pip install --requirement requirements.txt
-        python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'fairscale' not in line] ; open(fname, 'w').writelines(lines)"
         python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed
+        pip install 'fairscale>=0.3.4' --upgrade-strategy only-if-needed
         pip list
       displayName: 'Install dependencies'
 
@@ -114,8 +114,10 @@ jobs:
 
     - script: |
         set -e
+        python setup.py install --user
+        rm -rf pytorch_lightning
+        pip list
         python -m pytest pl_examples -v --maxfail=2 --durations=0
-        pip install . --user --quiet
         bash pl_examples/run_examples-args.sh --trainer.gpus 1 --trainer.max_epochs 1 --data.batch_size 64 --trainer.limit_train_batches 5 --trainer.limit_val_batches 3
         bash pl_examples/run_ddp-examples.sh --trainer.max_epochs 1 --data.batch_size 32 --trainer.limit_train_batches 2 --trainer.limit_val_batches 2
         # cd pl_examples/basic_examples

diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
@@ -112,9 +112,14 @@ RUN \
     # TODO: later commits break CI when cpp extensions are compiling. Unset when fixed
     pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex@705cba9
 
+RUN \
+    # install FairScale
+    pip install 'fairscale>=0.3.4'
+
 RUN \
     # install DeepSpeed
-    pip install deepspeed>=0.3.14
+    # TODO(@SeanNaren): 0.3.15 is broken - skipping to unblock
+    pip install 'deepspeed>=0.3.14,!=0.3.15'
 
 RUN \
     # Show what we have

diff --git a/docs/source/advanced/amp.rst b/docs/source/advanced/amp.rst
@@ -48,9 +48,6 @@ To use 16-bit precision, do two things:
 
 .. code-block:: bash
 
-    $ git clone https://github.com/NVIDIA/apex
-    $ cd apex
-
     # ------------------------
     # OPTIONAL: on your cluster you might need to load CUDA 10 or 9
     # depending on how you installed PyTorch
@@ -65,7 +62,7 @@ To use 16-bit precision, do two things:
     # make sure you've loaded a cuda version > 4.0 and < 7.0
     module load gcc-6.1.0
 
-    $ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+    $ pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex
 
 .. warning:: NVIDIA Apex and DDP have instability problems. We recommend native 16-bit in PyTorch 1.6+
 

diff --git a/docs/source/common/lightning_module.rst b/docs/source/common/lightning_module.rst
@@ -710,6 +710,12 @@ print
 .. automethod:: pytorch_lightning.core.lightning.LightningModule.print
     :noindex:
 
+predict_step
+~~~~~~~~~~~~
+
+.. automethod:: pytorch_lightning.core.lightning.LightningModule.predict_step
+    :noindex:
+
 save_hyperparameters
 ~~~~~~~~~~~~~~~~~~~~
 

diff --git a/docs/source/common/trainer.rst b/docs/source/common/trainer.rst
@@ -1598,12 +1598,24 @@ fit
 .. automethod:: pytorch_lightning.trainer.Trainer.fit
    :noindex:
 
+validate
+********
+
+.. automethod:: pytorch_lightning.trainer.Trainer.validate
+   :noindex:
+
 test
 ****
 
 .. automethod:: pytorch_lightning.trainer.Trainer.test
    :noindex:
 
+predict
+*******
+
+.. automethod:: pytorch_lightning.trainer.Trainer.predict
+   :noindex:
+
 tune
 ****
 

diff --git a/docs/source/extensions/datamodules.rst b/docs/source/extensions/datamodules.rst
@@ -184,6 +184,8 @@ To define a DataModule define 5 methods:
 - val_dataloader(s)
 - test_dataloader(s)
 
+and optionally one or multiple predict_dataloader(s).
+
 
 prepare_data
 ^^^^^^^^^^^^
@@ -297,6 +299,21 @@ Use this method to generate the test dataloader. Usually you just wrap the datas
             return DataLoader(self.mnist_test, batch_size=64)
 
 
+predict_dataloader
+^^^^^^^^^^^^^^^^^^
+Returns a special dataloader for inference. This is the dataloader that the Trainer
+:meth:`~pytorch_lightning.trainer.trainer.Trainer.predict` method uses.
+
+.. code-block:: python
+
+    import pytorch_lightning as pl
+
+
+    class MNISTDataModule(pl.LightningDataModule):
+        def predict_dataloader(self):
+            return DataLoader(self.mnist_test, batch_size=64)
+
+
 transfer_batch_to_device
 ^^^^^^^^^^^^^^^^^^^^^^^^
 Override to define how you want to move an arbitrary batch to a device.

diff --git a/pl_examples/basic_examples/dali_image_classifier.py b/pl_examples/basic_examples/dali_image_classifier.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC
-from distutils.version import LooseVersion
 from random import shuffle
 from warnings import warn
 
 import numpy as np
 import torch
+from packaging.version import Version
 from torch.nn import functional as F
 from torch.utils.data import random_split
 
@@ -39,7 +39,7 @@
     from nvidia.dali.pipeline import Pipeline
     from nvidia.dali.plugin.pytorch import DALIClassificationIterator
 
-    NEW_DALI_API = LooseVersion(dali_version) >= LooseVersion('0.28.0')
+    NEW_DALI_API = Version(dali_version) >= Version('0.28.0')
     if NEW_DALI_API:
         from nvidia.dali.plugin.base_iterator import LastBatchPolicy
 else: