
Commit bfab911
use colossal for tp
cached npy
dujiangsu committed May 20, 2022
1 parent e796d7f commit bfab911
Showing 53 changed files with 76 additions and 2,540 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -25,6 +25,7 @@ For Bert, Google reports a [super-large Bert with 481B parameters](https://mlcom
 ### Installation
 ``` bash
 $ git clone https://github.com/hpcaitech/ColossalAI-Inference.git
+$ pip install -r requirements.txt
 $ pip install .
 ```

5 changes: 3 additions & 2 deletions energon/cli/service.py
@@ -2,7 +2,7 @@
 import torch
 import inspect
 import energon.server as server
-from multiprocessing import Process
+import multiprocessing as mp

 from energon.context import Config

@@ -53,8 +53,9 @@ def launches(model_class=None,
     worker_rank = 1  # start from 1

     process_list = []
+    mp.set_start_method('spawn')
     for i in range(num_worker):
-        p = Process(target=server.launch_worker,
+        p = mp.Process(target=server.launch_worker,
                     args=(host, port, tp_init_size, pp_init_size, "nccl", 1024, True, worker_rank + i, worker_rank + i,
                           server_host, worker_port + i, log_level))
         p.start()
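Note: the switch to `mp.set_start_method('spawn')` matters because workers created with the default fork start method cannot safely re-initialize CUDA. A minimal sketch of the pattern (not this repository's code; the `worker` function and process count are placeholders):

```python
import multiprocessing as mp

import torch


def worker(rank: int):
    # Each spawned worker starts a fresh interpreter, so CUDA can be
    # initialized here without inheriting state from the parent process.
    if torch.cuda.is_available():
        torch.cuda.set_device(rank % torch.cuda.device_count())
    print(f"worker {rank} ready")


if __name__ == "__main__":
    mp.set_start_method("spawn")  # must be called once, before any Process is created
    procs = [mp.Process(target=worker, args=(i,)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
```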
4 changes: 2 additions & 2 deletions energon/communication/collective.py
@@ -6,8 +6,8 @@
 from torch.distributed import ReduceOp
 from torch import Tensor

-from energon.context import ParallelMode
-from energon.core import global_context as gpc
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
 from energon.utils import get_current_device


4 changes: 2 additions & 2 deletions energon/communication/p2p.py
@@ -5,8 +5,8 @@
 import torch
 import torch.distributed as dist

-from energon.context.parallel_mode import ParallelMode
-from energon.core import global_context as gpc
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
 from energon.utils import get_current_device
 from functools import reduce
 import operator
4 changes: 2 additions & 2 deletions energon/communication/ring.py
@@ -3,8 +3,8 @@

 import torch

-from energon.context.parallel_mode import ParallelMode
-from energon.core import global_context as gpc
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
 from energon.utils import get_current_device, synchronize


4 changes: 2 additions & 2 deletions energon/communication/utils.py
@@ -1,8 +1,8 @@
 import torch
 import torch.distributed as dist

-from energon.context.parallel_mode import ParallelMode
-from energon.core import global_context as gpc
+from colossalai.core import global_context as gpc
+from colossalai.context import ParallelMode
 from energon.utils import get_current_device


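Note: all four communication modules now take their parallel state from colossalai rather than energon's own context. A hedged sketch of how a helper might consume the colossalai global context (assuming colossalai has already been launched and a tensor-parallel group exists; `all_reduce_tp` is an illustrative name, not a function from this repository):

```python
import torch
import torch.distributed as dist

from colossalai.core import global_context as gpc
from colossalai.context import ParallelMode


def all_reduce_tp(tensor: torch.Tensor) -> torch.Tensor:
    # Reduce in place across the tensor-parallel group managed by colossalai.
    if gpc.get_world_size(ParallelMode.TENSOR) > 1:
        dist.all_reduce(tensor, group=gpc.get_group(ParallelMode.TENSOR))
    return tensor
```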
4 changes: 0 additions & 4 deletions energon/context/__init__.py
@@ -1,5 +1 @@
 from .config import Config, ConfigException
-from .parallel_context import ParallelContext
-from .parallel_mode import ParallelMode
-from .process_group_initializer import *
-from .random import *
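Note: after this change `energon.context` keeps only the config utilities; the parallel context and modes are expected to come from colossalai, as the communication modules above now do. A sketch of the resulting import surface (assumed, based on the diffs in this commit):

```python
from energon.context import Config, ConfigException  # still exported here
from colossalai.core import global_context as gpc    # parallel state now lives in colossalai
from colossalai.context import ParallelMode
```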