Merge branch 'master' into triton-windows
loadams authored Nov 25, 2024
2 parents fd76a9e + d6410f9 commit 1f9f203
Showing 37 changed files with 386 additions and 88 deletions.
85 changes: 85 additions & 0 deletions .github/workflows/hpu-gaudi2-nightly.yml
@@ -0,0 +1,85 @@
name: hpu-gaudi2-nightly

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - ".github/workflows/hpu-gaudi2-nightly.yml"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    # The type of runner that the job will run on
    runs-on: [self-hosted, intel, gaudi2]
    container:
      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
      ports:
        - 80
      options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

    env:
      PT_HPU_LAZY_MODE: 0
      TORCHINDUCTOR_COMPILE_THREADS: 1
      TEST_LIST: |
        test_adamw.py
        test_bf16.py
        test_ds_config_dict.py
        test_dynamic_loss_scale.py
        test_latest_checkpoint.py
        test_moe_checkpoint.py
        test_multi_output_model.py
        test_other_optimizer.py
        test_pipe.py
        test_pipeline.py
        test_universal_checkpoint.py
        test_zero_context_return.py
        test_zero_leaf_module.py
        test_zero_offloadpp.py
        test_zero_tiled.py

    # Steps represent a sequence of tasks that will be executed as part of the job
    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
      - uses: actions/checkout@v4

      - name: Check container state
        run: |
          ldd --version
          hl-smi -L
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install transformers
        run: |
          git clone https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          pip install .

      - name: Install deepspeed
        run: |
          pip install .[dev,autotuning]
          ds_report

      - name: Python environment
        run: |
          pip list

      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE}
          export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS}
          TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}')
          echo "TEST_LIST ${TEST_LIST}"
          pytest --verbose unit/ -k "${TEST_LIST}"
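
For reference, the awk one-liner in the `Unit tests` step above collapses the newline-separated `TEST_LIST` into a single pytest `-k` expression. A rough Python sketch of the same transformation (the shortened file list here is illustrative):

```python
# Rough equivalent of:
#   awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}'
# Non-empty lines are joined with " or " so pytest -k selects exactly those files.
test_list = """
test_adamw.py
test_bf16.py
test_zero_tiled.py
"""

k_expr = " or ".join(line.strip() for line in test_list.splitlines() if line.strip())
print(k_expr)  # test_adamw.py or test_bf16.py or test_zero_tiled.py
# Then: pytest --verbose unit/ -k "test_adamw.py or test_bf16.py or test_zero_tiled.py"
```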
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -21,7 +21,7 @@ jobs:
  unit-tests:
    strategy:
      matrix:
-       pyVersion: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+       pyVersion: ["3.8", "3.9", "3.10"]
      fail-fast: false

    runs-on: ubuntu-24.04
2 changes: 1 addition & 1 deletion README.md
@@ -121,7 +121,7 @@ DeepSpeed has been integrated with several different popular open-source DL frameworks such as:

| | Documentation |
| ---------------------------------------------------------------------------------------------- | -------------------------------------------- |
<img src="docs/assets/images/transformers-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/transformers-dark.png#gh-dark-mode-only" width="250px"> | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) |
<img src="docs/assets/images/transformers-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/transformers-dark.png#gh-dark-mode-only" width="250px"> | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/deepspeed) |
| <img src="docs/assets/images/accelerate-light.png#gh-light-mode-only" width="250px"><img src="docs/assets/images/accelerate-dark.png#gh-dark-mode-only" width="250px"> | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) |
| <img src="docs/assets/images/lightning-light.svg#gh-light-mode-only" width="200px"><img src="docs/assets/images/lightning-dark.svg#gh-dark-mode-only" width="200px"> | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) |
| <img src="docs/assets/images/mosaicml.svg" width="200px"> | [MosaicML with DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) |
2 changes: 1 addition & 1 deletion blogs/deepspeed-chat/japanese/README.md
@@ -332,7 +332,7 @@ DeepSpeed provides features such as the following:

DeepSpeed is part of Microsoft's [AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/), which is advancing the large-scale realization of next-generation AI capabilities; see [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale) for details. DeepSpeed has been used to train a wide range of large models, including [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), and [YaLM (100B)](https://github.com/yandex/YaLM-100B).

-DeepSpeed is also used as a backend by many prominent open-source deep learning frameworks, including [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), and [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html).
+DeepSpeed is also used as a backend by many prominent open-source deep learning frameworks, including [Hugging Face Transformers](https://huggingface.co/docs/transformers/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), and [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html).

For more information about DeepSpeed, see the following:

6 changes: 6 additions & 0 deletions csrc/xpu/adam/multi_tensor_apply.dp.hpp
@@ -109,6 +109,12 @@ class multi_tensor_apply_kernel {
std::tuple<ArgTypes...> args;
};

// Mark multi_tensor_apply_kernel as device-copyable so it can be stored in a sycl::buffer
namespace sycl {
template <typename T, typename U, typename... ArgTypes>
struct is_device_copyable<multi_tensor_apply_kernel<T, U, ArgTypes...>> : std::true_type {};
} // namespace sycl

template <int depth, typename T, typename... ArgTypes>
void multi_tensor_apply(int block_size,
int chunk_size,
2 changes: 1 addition & 1 deletion deepspeed/autotuning/README.md
@@ -336,7 +336,7 @@ The Autotuner stops exploring the space when any of the following conditions meet.

## Using Autotuning with Hugging Face

-Hugging Face users can set some configuration values to ["auto"](https://huggingface.co/transformers/main_classes/deepspeed.html?highlight=gradient_accumulation_steps#shared-configuration).
+Hugging Face users can set some configuration values to ["auto"](https://huggingface.co/docs/transformers/deepspeed#deepspeed-and-trainer-parameters).
`"auto"` means the value will be set to the Hugging Face default or overwritten by values supplied as command-line arguments.
In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has "auto" keywords, they are treated as the value "auto".
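
As a hedged illustration of the `"auto"` mechanism (the keys below are common DeepSpeed config entries; the exact mapping depends on your Trainer setup), the Hugging Face Trainer resolves these placeholders from its own `TrainingArguments` before handing the config to DeepSpeed:

```python
# Sketch of a DeepSpeed config using "auto" placeholders. The values are
# resolved by the Hugging Face Trainer at runtime, not by this dict itself.
ds_config = {
    "train_micro_batch_size_per_gpu": "auto",  # from --per_device_train_batch_size
    "gradient_accumulation_steps": "auto",     # from --gradient_accumulation_steps
    "zero_optimization": {"stage": 2},
    "optimizer": {
        "type": "AdamW",
        "params": {"lr": "auto", "weight_decay": "auto"},  # from TrainingArguments
    },
}
```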

17 changes: 10 additions & 7 deletions deepspeed/checkpoint/deepspeed_checkpoint.py
@@ -116,7 +116,7 @@ def show_transformer_file_map(self):
self._dump_mapping(self.transformer_file_map, 'rank_to_transformer_files')

def _build_global_state(self):
-sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)
self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None)

@@ -137,14 +137,17 @@ def get_final_norm_layer_id(self):

def get_iteration(self):
if not ITERATION_KEY in self.global_state:
-sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0)

return self.global_state[ITERATION_KEY]

def get_embedding_state(self, tp_index: int) -> Dict:
assert tp_index in self.tp_to_embedding_map.keys()
-sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]]
+sd_list = [
+    torch.load(fname, map_location=torch.device('cpu'), weights_only=False)
+    for fname in self.tp_to_embedding_map[tp_index]
+]
sd = self._merge_state_dicts(sd_list)
return sd

@@ -154,7 +157,7 @@ def get_embedding_files(self, tp_index: int) -> list:

def _get_checkpoint_value(self, key):
if not key in self.global_state:
-sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'))
+sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
self.global_state[key] = sd.get(key, None)

return self.global_state[key]
@@ -169,7 +172,7 @@ def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict:
assert tp_index < self.tp_degree
assert pp_index < self.pp_degree
fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index)
-sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
+sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list]

merged_sd = None
for sd in sd_list:
@@ -185,7 +188,7 @@ def get_transformer_state(self, tp_index: int, pp_index: int) -> list:
assert pp_index < self.pp_degree
t_list = []
for fname_list in self.transformer_file_map[(tp_index, pp_index)]:
-sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list]
+sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list]
sd = self._merge_state_dicts(sd_list)
t_list.append(sd)
return t_list
@@ -196,7 +199,7 @@ def get_pp_transformer_map(self, pp_index: int) -> list:

def get_final_norm_state(self, tp_index: int) -> Dict:
assert tp_index in self.tp_to_final_norm_map.keys()
-sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'))
+sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'), weights_only=False)
return sd

def get_final_norm_files(self, tp_index: int) -> list:
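
The recurring `weights_only=False` additions in this file (and in several files below) track a PyTorch behavior change: recent releases default `torch.load` to `weights_only=True`, which restricts unpickling to plain tensors and containers, while DeepSpeed checkpoint shards also pickle richer Python objects such as the saved training arguments. A minimal sketch of the pattern (the file name is hypothetical, and the checkpoint is assumed trusted):

```python
import torch

# With weights_only=True (the newer default), this load would reject the
# non-tensor objects DeepSpeed stores alongside the weights; opting out
# restores the old behavior for trusted checkpoints.
sd = torch.load(
    "mp_rank_00_model_states.pt",         # hypothetical checkpoint shard
    map_location=torch.device("cpu"),     # merge on CPU regardless of origin
    weights_only=False,                   # allow full pickle payloads
)
iteration = sd.get("iteration", 0)        # non-tensor entries now load fine
```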
14 changes: 7 additions & 7 deletions deepspeed/checkpoint/ds_to_universal.py
@@ -150,7 +150,7 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D):


def extract_zero_shards_stage3(optim_files, param_shapes, dp_degree, temp_dir, dp_index):
-state_dict = torch.load(optim_files[dp_index], map_location='cpu')
+state_dict = torch.load(optim_files[dp_index], map_location='cpu', weights_only=False)

flat_state = dict(
exp_avg=state_dict[OPTIMIZER_STATE_DICT]['optimizer_state_dict']['state'][0]["exp_avg"],
@@ -214,7 +214,7 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None):
raise ValueError(f"Cannot parse dp_rank from {p}")

paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))]
-shards = [torch.load(p) for p in paths]
+shards = [torch.load(p, weights_only=False) for p in paths]

if state == "step":
assert all(v == shards[0] for v in shards), "All shards must have the same step value"
@@ -404,7 +404,7 @@ def _zero_partitioned_param_info(unpartitioned_numel, world_size):


def _parse_model_states_stage3(files):
-return torch.load(files[0], map_location=torch.device('cpu'))[PARAM_SHAPES]
+return torch.load(files[0], map_location=torch.device('cpu'), weights_only=False)[PARAM_SHAPES]


def _save_optimizer_state(args, ds_checkpoint):
@@ -420,7 +420,7 @@ def _save_optimizer_state(args, ds_checkpoint):


def _save_optimizer_state_stage3(args, optim_files):
-sd = torch.load(optim_files[0], map_location=torch.device('cpu'))
+sd = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False)
output_sd = sd[OPTIMIZER_STATE_DICT]
output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS]
zero_output_folder = os.path.join(args.output_folder, "zero")
@@ -446,15 +446,15 @@ def _get_checkpoint_files(checkpoint_dir, glob_pattern):


def _get_zero_stage(optim_files):
-state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'))
+state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False)
optimizer_state = state_dict[OPTIMIZER_STATE_DICT]
zero_stage = optimizer_state.get(ZERO_STAGE, 1)
return zero_stage


def _inject_missing_state(ds_checkpoint):
if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state:
-sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'))
+sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False)
if UNIVERSAL_CHECKPOINT_INFO not in sd:
ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {}
ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][
@@ -488,7 +488,7 @@ def main(args):

slice_shapes = []
for mp_rank_file in ds_checkpoint.mp_rank_files:
-mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'))
+mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'), weights_only=False)
slice_shapes += mp_sd[PARAM_SHAPES]

# fix back to normal flat dict, merge duplicates for tp>1
2 changes: 1 addition & 1 deletion deepspeed/checkpoint/universal_checkpoint.py
@@ -34,7 +34,7 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size):
step = None
for key in hp_keys:
ckpt_file = os.path.join(folder, f"{key}.pt")
-ckpt_dict = torch.load(ckpt_file)
+ckpt_dict = torch.load(ckpt_file, weights_only=False)

if key == "step":
step = ckpt_dict
7 changes: 6 additions & 1 deletion deepspeed/checkpoint/utils.py
@@ -51,7 +51,12 @@ def clone_tensors_for_torch_save(item, device=torch.device('cpu')):
- copy of ``item`` with cloned tensors on target device
"""
if torch.is_tensor(item):
-    return item.detach().clone().to(device)
+    if type(device) is str:
+        device = torch.device(device)
+    if device == item.device:
+        return item.detach().clone()
+    else:
+        return item.detach().to(device)
elif isinstance(item, list):
return [clone_tensors_for_torch_save(v, device) for v in item]
elif isinstance(item, tuple):
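
The `clone_tensors_for_torch_save` change above skips one redundant copy: a tensor already on the target device is detached and cloned, while a cross-device tensor relies on `.to(device)` already returning a fresh tensor. A small usage sketch, assuming the import path from this diff:

```python
import torch
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save

# Same-device case: each tensor is detached and cloned exactly once.
state = {"weight": torch.ones(4), "step": 10}
safe_state = clone_tensors_for_torch_save(state, device=torch.device("cpu"))
torch.save(safe_state, "checkpoint.pt")

# Cross-device case (illustrative): for a CUDA tensor, .detach().to("cpu")
# already materializes a new CPU copy, so no extra .clone() is needed.
```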
2 changes: 1 addition & 1 deletion deepspeed/checkpoint/zero_checkpoint.py
@@ -54,7 +54,7 @@ def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], st
state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index)
merged_sd = None
for state_file in state_file_list:
-sd = torch.load(state_file, map_location=torch.device('cpu'))
+sd = torch.load(state_file, map_location=torch.device('cpu'), weights_only=False)
for key in keys_to_ignore:
sd.pop(key, None)

4 changes: 2 additions & 2 deletions deepspeed/inference/engine.py
@@ -452,15 +452,15 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
checkpoint = sd_loader['checkpoints']

if type(checkpoint) is list:
-self.sd = torch.load(checkpoint[0], map_location='cpu')
+self.sd = torch.load(checkpoint[0], map_location='cpu', weights_only=False)
self.key_list = list(self.sd.keys())

self.load_model_with_checkpoint(self.module)

for i in range(1, len(checkpoint)):
if not dist.is_initialized() or dist.get_rank() == 0:
print(f"loading checkpoint ({i})")
-self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name())
+self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name(), weights_only=False)
self.key_list = list(self.sd.keys())
self.load_model_with_checkpoint(self.module)
else:
2 changes: 1 addition & 1 deletion deepspeed/inference/v2/checkpoint/huggingface_engine.py
@@ -80,7 +80,7 @@ def model_has_safetensors(model_name_or_path: str) -> bool:
else:
model_param_json_fname = "pytorch_model.bin.index.json"
model_file_fname = "pytorch_model.bin"
-self._checkpoint_load_fn = partial(torch.load, map_location="cpu")
+self._checkpoint_load_fn = partial(torch.load, map_location="cpu", weights_only=False)

model_param_json = os.path.join(self._local_checkpoint_dir, model_param_json_fname)

@@ -205,7 +205,7 @@ def populate_model_parameters(self) -> None:
buffer_path = make_param_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size)
metadata_path = make_metadata_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size)

-buffer = torch.load(buffer_path)
+buffer = torch.load(buffer_path, weights_only=False)
metadata = json.load(open(metadata_path, "r"))
metadata = ModelMetadata.parse_raw(metadata)

(The remaining changed files are not shown.)