Commit

Merge branch 'master' into cli-load-from-checkpoint

mauvilsa authored Feb 21, 2024
2 parents a892efb + 39a86f8 commit c71406f
Showing 228 changed files with 2,196 additions and 1,254 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-examples-app.yml
@@ -67,7 +67,7 @@ jobs:
run: python .actions/assistant.py replace_oldest_ver

- name: pip wheels cache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: ${{ env.PYPI_CACHE_DIR }}
key: pypi_wheels
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests-app.yml
@@ -73,7 +73,7 @@ jobs:
run: python .actions/assistant.py replace_oldest_ver

- name: pip wheels cache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: ${{ env.PYPI_CACHE_DIR }}
key: pypi_wheels
6 changes: 3 additions & 3 deletions .github/workflows/ci-tests-fabric.yml
@@ -46,8 +46,8 @@ jobs:
- { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
- { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
- { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.1" }
- { os: "macOS-12", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
- { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
- { os: "macOS-11", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
- { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
- { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" }
# only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues
- { os: "macOS-12", pkg-name: "fabric", python-version: "3.11", pytorch-version: "2.0" }
@@ -114,7 +114,7 @@ jobs:
done
- name: pip wheels cache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: ${{ env.PYPI_CACHE_DIR }}
key: pypi_wheels
8 changes: 4 additions & 4 deletions .github/workflows/ci-tests-pytorch.yml
@@ -50,8 +50,8 @@ jobs:
- { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
- { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
- { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" }
- { os: "macOS-12", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
- { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
- { os: "macOS-11", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
- { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
- { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.2" }
# only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues
- { os: "macOS-12", pkg-name: "pytorch", python-version: "3.11", pytorch-version: "2.0" }
@@ -120,7 +120,7 @@ jobs:
cat requirements/pytorch/base.txt
- name: pip wheels cache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: ${{ env.PYPI_CACHE_DIR }}
key: pypi_wheels
@@ -161,7 +161,7 @@ jobs:
cache-key: "pypi_wheels"

- name: Cache datasets
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: Datasets
key: pl-dataset
2 changes: 1 addition & 1 deletion .github/workflows/code-checks.yml
@@ -34,7 +34,7 @@ jobs:
python-version: "3.10.6"

- name: Mypy cache
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: .mypy_cache
key: mypy-${{ hashFiles('requirements/typing.txt') }}
2 changes: 1 addition & 1 deletion .github/workflows/docs-build.yml
@@ -80,7 +80,7 @@ jobs:
pip install lai-sphinx-theme -U -f ${PYPI_LOCAL_DIR}
- name: pip wheels cache
uses: actions/cache/restore@v3
uses: actions/cache/restore@v4
with:
path: ${{ env.PYPI_CACHE_DIR }}
key: pypi_wheels
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
@@ -84,10 +84,12 @@ repos:
- flake8-return

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.1.15"
rev: "v0.2.0"
hooks:
- id: ruff
args: ["--fix", "--preview"]
- id: ruff-format
args: ["--preview"]

- repo: https://github.com/executablebooks/mdformat
rev: 0.7.17
4 changes: 3 additions & 1 deletion README.md
@@ -92,12 +92,14 @@ pip install -iU https://test.pypi.org/simple/ pytorch-lightning

______________________________________________________________________

## Lightning has 3 core packages
## Lightning has 4 core packages

[PyTorch Lightning: Train and deploy PyTorch at scale](#pytorch-lightning-train-and-deploy-pytorch-at-scale).
<br/>
[Lightning Fabric: Expert control](#lightning-fabric-expert-control).
<br/>
[Lightning Data: Blazing fast, distributed streaming of training data from cloud storage](https://github.com/Lightning-AI/pytorch-lightning/tree/master/src/lightning/data).
<br/>
[Lightning Apps: Build AI products and ML workflows](#lightning-apps-build-ai-products-and-ml-workflows).

Lightning gives you granular control over how much abstraction you want to add over PyTorch.
5 changes: 5 additions & 0 deletions docs/source-fabric/advanced/distributed_communication.rst
@@ -236,6 +236,11 @@ Full example:
result = fabric.all_gather(data)
print("Result of all-gather:", result) # tensor([ 0, 10, 20, 30])
.. warning::

For the special case where ``world_size`` is 1, no additional dimension is added to the tensor(s). This inconsistency
is kept for backward compatibility, and you may need to handle this special case in your code to make it agnostic to the number of processes.
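
For instance, here is a minimal sketch (not part of this commit) of how calling code can compensate for the missing dimension on a single process; the tensor contents are illustrative:

.. code-block:: python

    import torch
    from lightning.fabric import Fabric

    fabric = Fabric(accelerator="cpu", devices=1)
    fabric.launch()

    data = torch.tensor([fabric.global_rank * 10])
    result = fabric.all_gather(data)
    if fabric.world_size == 1:
        # all_gather returned the tensor without a leading world-size dimension,
        # so add it manually to keep downstream code shape-agnostic
        result = result.unsqueeze(0)
    assert result.shape[0] == fabric.world_size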


----

15 changes: 15 additions & 0 deletions docs/source-fabric/fundamentals/convert.rst
@@ -90,6 +90,21 @@ Check out our before-and-after example for `image classification <https://github
----


****************
Optional changes
****************

Here are a few optional upgrades you can make to your code, if applicable:

- Replace ``torch.save()`` and ``torch.load()`` with Fabric's :doc:`save and load methods <../guide/checkpoint/checkpoint>`.
- Replace collective operations from ``torch.distributed`` (barrier, broadcast, etc.) with Fabric's :doc:`collective methods <../advanced/distributed_communication>`.
- Use Fabric's :doc:`no_backward_sync() context manager <../advanced/gradient_accumulation>` if you implemented gradient accumulation.
- Initialize your model under the :doc:`init_module() <../advanced/model_init>` context manager.
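
A minimal, self-contained sketch (not part of this commit) of what these optional upgrades can look like; the toy model, the ``checkpoint.ckpt`` filename, and the two-step accumulation schedule are illustrative assumptions:

.. code-block:: python

    import torch
    from lightning.fabric import Fabric

    fabric = Fabric(accelerator="auto", devices=1)
    fabric.launch()

    # init_module() allocates the parameters directly on the target device/dtype
    with fabric.init_module():
        model = torch.nn.Linear(32, 2)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    model, optimizer = fabric.setup(model, optimizer)

    # Fabric's checkpoint helpers in place of torch.save() / torch.load()
    state = {"model": model, "optimizer": optimizer}
    fabric.save("checkpoint.ckpt", state)
    fabric.load("checkpoint.ckpt", state)

    # no_backward_sync() skips gradient synchronization on accumulation steps
    for step in range(4):
        is_accumulating = (step + 1) % 2 != 0
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            loss = model(torch.randn(8, 32)).sum()
            fabric.backward(loss)
        if not is_accumulating:
            optimizer.step()
            optimizer.zero_grad()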


----


**********
Next steps
**********
3 changes: 2 additions & 1 deletion docs/source-pytorch/conf.py
@@ -92,7 +92,7 @@ def _load_py_module(name: str, location: str) -> ModuleType:
assist_local.AssistantCLI.pull_docs_files(
gh_user_repo="Lightning-AI/lightning-Habana",
target_dir="docs/source-pytorch/integrations/hpu",
checkout="refs/tags/1.3.0",
checkout="refs/tags/1.4.0",
)

# Copy strategies docs as single pages
@@ -610,4 +610,5 @@ def package_list_from_file(file):
"https://deepgenerativemodels.github.io/assets/slides/cs236_lecture11.pdf",
"https://www.intel.com/content/www/us/en/products/docs/processors/what-is-a-gpu.html",
"https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/", # noqa: E501
"https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop",
]
20 changes: 10 additions & 10 deletions docs/source-pytorch/debug/debugging_basic.rst
@@ -114,11 +114,11 @@ this generate a table like:

.. code-block:: text
| Name | Type | Params
----------------------------------
0 | net | Sequential | 132 K
1 | net.0 | Linear | 131 K
2 | net.1 | BatchNorm1d | 1.0 K
| Name | Type | Params | Mode
-------------------------------------------
0 | net | Sequential | 132 K | train
1 | net.0 | Linear | 131 K | train
2 | net.1 | BatchNorm1d | 1.0 K | train
To add the child modules to the summary add a :class:`~lightning.pytorch.callbacks.model_summary.ModelSummary`:

@@ -162,10 +162,10 @@ With the input array, the summary table will include the input and output layer

.. code-block:: text
| Name | Type | Params | In sizes | Out sizes
--------------------------------------------------------------
0 | net | Sequential | 132 K | [10, 256] | [10, 512]
1 | net.0 | Linear | 131 K | [10, 256] | [10, 512]
2 | net.1 | BatchNorm1d | 1.0 K | [10, 512] | [10, 512]
| Name | Type | Params | Mode | In sizes | Out sizes
----------------------------------------------------------------------
0 | net | Sequential | 132 K | train | [10, 256] | [10, 512]
1 | net.0 | Linear | 131 K | train | [10, 256] | [10, 512]
2 | net.1 | BatchNorm1d | 1.0 K | train | [10, 512] | [10, 512]
when you call ``.fit()`` on the Trainer. This can help you find bugs in the composition of your layers.
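
As a minimal sketch (not part of this commit) of a module that would produce summaries like the ones above, with the ``Mode`` column and the ``In sizes``/``Out sizes`` columns enabled via ``example_input_array``; the class name and layer sizes are illustrative, chosen to match the 132 K-parameter example:

.. code-block:: python

    import torch
    from torch import nn
    from lightning.pytorch import LightningModule, Trainer
    from lightning.pytorch.callbacks import ModelSummary


    class LitModel(LightningModule):
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(nn.Linear(256, 512), nn.BatchNorm1d(512))
            # with example_input_array set, the summary also reports In sizes / Out sizes
            self.example_input_array = torch.zeros(10, 256)

        def forward(self, x):
            return self.net(x)


    # max_depth=-1 adds the child modules (net.0, net.1) to the summary table;
    # the table itself is printed when trainer.fit(...) starts
    trainer = Trainer(callbacks=[ModelSummary(max_depth=-1)])
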
6 changes: 3 additions & 3 deletions examples/app/dag/app.py
@@ -65,9 +65,9 @@ def __init__(self, models_paths: list):
)

# Step 3: Create the work to train the models_paths in parallel.
self.dict = Dict(
**{model_path.split(".")[-1]: ModelWork(model_path, parallel=True) for model_path in models_paths}
)
self.dict = Dict(**{
model_path.split(".")[-1]: ModelWork(model_path, parallel=True) for model_path in models_paths
})

# Step 4: Some element to track components progress.
self.has_completed = False
12 changes: 5 additions & 7 deletions examples/app/server/app.py
@@ -20,13 +20,11 @@ def setup(self):
def predict(self, request):
image = base64.b64decode(request.image.encode("utf-8"))
image = Image.open(io.BytesIO(image))
transforms = torchvision.transforms.Compose(
[
torchvision.transforms.Resize(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
transforms = torchvision.transforms.Compose([
torchvision.transforms.Resize(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
image = transforms(image)
image = image.to(self._device)
prediction = self._model(image.unsqueeze(0))
12 changes: 5 additions & 7 deletions examples/app/server_with_auto_scaler/app.py
@@ -34,13 +34,11 @@ def setup(self):
self._model = torchvision.models.resnet18(pretrained=True).to(self._device)

def predict(self, requests: BatchRequestModel):
transforms = torchvision.transforms.Compose(
[
torchvision.transforms.Resize(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
transforms = torchvision.transforms.Compose([
torchvision.transforms.Resize(224),
torchvision.transforms.ToTensor(),
torchvision.transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
images = []
for request in requests.inputs:
image = app.components.serve.types.image.Image.deserialize(request.image)
17 changes: 8 additions & 9 deletions examples/fabric/dcgan/train_fabric.py
@@ -4,6 +4,7 @@
Code adapted from the official PyTorch DCGAN tutorial:
https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
"""

import os
import time
from pathlib import Path
@@ -55,14 +56,12 @@ def main():
root=dataroot,
split="all",
download=True,
transform=transforms.Compose(
[
transforms.Resize(image_size),
transforms.CenterCrop(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
),
transform=transforms.Compose([
transforms.Resize(image_size),
transforms.CenterCrop(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]),
)

# Create the dataloader
@@ -227,7 +226,7 @@ def __init__(self):
nn.ReLU(True),
# state size. (ngf) x 32 x 32
nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
nn.Tanh()
nn.Tanh(),
# state size. (nc) x 64 x 64
)

17 changes: 8 additions & 9 deletions examples/fabric/dcgan/train_torch.py
@@ -4,6 +4,7 @@
Code adapted from the official PyTorch DCGAN tutorial:
https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
"""

import os
import random
import time
@@ -55,14 +56,12 @@ def main():
root=dataroot,
split="all",
download=True,
transform=transforms.Compose(
[
transforms.Resize(image_size),
transforms.CenterCrop(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]
),
transform=transforms.Compose([
transforms.Resize(image_size),
transforms.CenterCrop(image_size),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
]),
)

# Create the dataloader
@@ -236,7 +235,7 @@ def __init__(self):
nn.ReLU(True),
# state size. (ngf) x 32 x 32
nn.ConvTranspose2d(ngf, nc, 4, 2, 1, bias=False),
nn.Tanh()
nn.Tanh(),
# state size. (nc) x 64 x 64
)

1 change: 1 addition & 0 deletions examples/fabric/meta_learning/train_fabric.py
@@ -14,6 +14,7 @@
Run it with:
lightning run model train_fabric.py --accelerator=cuda --devices=2 --strategy=ddp
"""

import cherry
import learn2learn as l2l
import torch
1 change: 1 addition & 0 deletions examples/fabric/meta_learning/train_torch.py
@@ -15,6 +15,7 @@
Run it with:
torchrun --nproc_per_node=2 --standalone train_torch.py
"""

import os
import random

12 changes: 4 additions & 8 deletions examples/fabric/reinforcement_learning/train_fabric.py
@@ -84,14 +84,10 @@ def main(args: argparse.Namespace):
)

# Environment setup
envs = gym.vector.SyncVectorEnv(
[
make_env(
args.env_id, args.seed + rank * args.num_envs + i, rank, args.capture_video, logger.log_dir, "train"
)
for i in range(args.num_envs)
]
)
envs = gym.vector.SyncVectorEnv([
make_env(args.env_id, args.seed + rank * args.num_envs + i, rank, args.capture_video, logger.log_dir, "train")
for i in range(args.num_envs)
])
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

# Define the agent and the optimizer and setup them with Fabric
@@ -59,9 +59,9 @@ def player(args, world_collective: TorchCollective, player_trainer_collective: T
)

# Environment setup
envs = gym.vector.SyncVectorEnv(
[make_env(args.env_id, args.seed + i, 0, args.capture_video, log_dir, "train") for i in range(args.num_envs)]
)
envs = gym.vector.SyncVectorEnv([
make_env(args.env_id, args.seed + i, 0, args.capture_video, log_dir, "train") for i in range(args.num_envs)
])
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

# Define the agent