diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 3f11fa876..728f3b6bf 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -20,6 +20,8 @@ on: - 'r**' - 'dev' types: [labeled] + merge_group: + types: [checks_requested] workflow_dispatch: inputs: test_to_run: @@ -41,6 +43,7 @@ jobs: outputs: test_to_run: ${{ steps.test_to_run.outputs.main }} all: ${{ steps.all.outputs.main }} + run_ci: ${{ steps.evaluate.outputs.run_ci }} steps: - name: Parse test_to_run id: test_to_run @@ -51,9 +54,44 @@ jobs: id: all run: | echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT" - + + - name: Get changed files + id: changed-files + if: github.event_name == 'pull_request' + uses: tj-actions/changed-files@v44 + with: + files_yaml: | + doc: + - '**.md' + - docs/** + src: + - '!**.md' + - '!docs/**' + + - name: Evaluate conditions + id: evaluate + env: + DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }} + CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }} + CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }} + IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }} + LABEL: ${{ github.event.label.name == 'Run CICD' }} + MERGE_GROUP: ${{ github.event_name == 'merge_group' }} + run: | + # Some output that's helpful for debugging + echo "Docs changed: $CHANGED_DOCS" + echo "Src changed: $CHANGED_SRC" + + echo "DOCS_ONLY: $DOCS_ONLY" + echo "LABEL: $LABEL" + echo "IS_PULLREQUEST: $IS_PULLREQUEST" + + # Run CI only (on main or if label is attached) and if it's not only docs + echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT" + build-container: - if: ${{ github.event.label.name == 'Run CICD' || github.ref == 'refs/heads/main' }} 
+ if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} + needs: [pre-flight] uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.1.0 with: image-name: nemo_aligner_container @@ -61,13 +99,13 @@ jobs: image-label: nemo-aligner build-args: | MAX_JOBS=32 - ALIGNER_COMMIT=${{ github.event.pull_request.head.sha || github.sha }} + ALIGNER_COMMIT=${{ github.sha }} Unit_Tests: name: ${{ matrix.test_case }} needs: [build-container, pre-flight] uses: ./.github/workflows/_run_test.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'unit') || needs.pre-flight.outputs.all == 'true' + if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} strategy: matrix: test_case: @@ -85,7 +123,7 @@ jobs: name: ${{ matrix.test_case }} needs: [build-container, pre-flight] uses: ./.github/workflows/_run_test.yml - if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'functional') || needs.pre-flight.outputs.all == 'true' + if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} strategy: matrix: test_case: @@ -102,3 +140,24 @@ jobs: TIMEOUT: 8 SCRIPT: | bash /opt/NeMo-Aligner/tests/functional/test_cases/${{ matrix.test_case }} + + CI_QA_Gate: + name: CI quality check + if: always() + runs-on: ubuntu-latest + needs: + - Unit_Tests + - Functional_Tests + steps: + - name: main + env: + JOB_RESULTS: ${{ toJSON(needs) }} + ALL_SUCCESS: ${{ !contains(needs.*.result, 'failure') && !contains(needs.*.result, 'cancelled') && !contains(needs.*.result, 'skipped') }} + CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }} + run: | + + SUMMARY=$(echo $JOB_RESULTS | jq 'to_entries[] | .key + ": " + .value.result' | tr -d '"') + echo '🤖: CICD Result' >> $GITHUB_STEP_SUMMARY + echo "$SUMMARY" >> $GITHUB_STEP_SUMMARY + + test "$ALL_SUCCESS" = "true" || test "$CI_SKIP" = "true" \ No newline at end of file diff --git a/.github/workflows/release-freeze.yml b/.github/workflows/release-freeze.yml index 2513928c9..10ae3386e 100644 --- a/.github/workflows/release-freeze.yml 
+++ b/.github/workflows/release-freeze.yml @@ -3,20 +3,25 @@ name: "Code freeze" on: workflow_dispatch: inputs: - type_of_release: + release-type: type: choice description: Type of release options: - major - minor - + freeze-commit: + type: string + description: Commit SHA to use for cut-off + required: false + default: main jobs: code-freeze: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.8.0 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_code_freeze.yml@v0.17.3 with: - name_of_library: NeMo-Aligner - type_of_release: ${{ inputs.type_of_release }} - python_package: nemo_aligner + library-name: NeMo-Aligner + python-package: nemo_aligner + release-type: ${{ inputs.release-type }} + freeze-commit: ${{ inputs.freeze-commit }} secrets: SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 6991a5cfb..9678d8db5 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -25,10 +25,14 @@ on: required: true default: true type: boolean - + version-bump-branch: + description: Branch for version bump + required: true + type: string + jobs: release: - uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.15.0 + uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.18.4 with: release-ref: ${{ inputs.release-ref }} image-name: nemo_aligner_container @@ -42,9 +46,10 @@ jobs: container-workdir: /opt/NeMo-Aligner library-name: NeMo-Aligner dry-run: ${{ inputs.dry-run }} + version-bump-branch: ${{ inputs.version-bump-branch }} secrets: TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }} - PAT: ${{ secrets.PAT }} SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} + PAT: ${{ secrets.PAT }} diff --git a/Dockerfile b/Dockerfile index 
44a9f8651..6eb6aad40 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ git pull --rebase || true pip install --no-cache-dir --no-deps -e . EOF -FROM ${BASE_IMAGE} as final +FROM ${BASE_IMAGE} AS final LABEL "nemo.library"="nemo-aligner" WORKDIR /opt # needed in case git complains that it can't detect a valid email, this email is fake but works @@ -70,6 +70,10 @@ RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \ pip install -e . ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ +# TODO: This pinning of pynvml is only needed while on TRTLLM v0.13.0 since pynvml>=11.5.0 but pynvml==12.0.0 contains a +# breaking change. The last known working version is 11.5.3 +RUN pip install pynvml==11.5.3 + # install TransformerEngine ARG MAX_JOBS ARG TE_TAG @@ -103,6 +107,11 @@ RUN git clone https://github.com/NVIDIA/NeMo.git && \ pip install -e ".[nlp]" && \ cd nemo/collections/nlp/data/language_modeling/megatron && make +# TODO: While we are on PyTorch 24.07, we need to downgrade triton since 3.2.0 introduced a breaking change +# This un-pinned requirement comes from mamba-ssm, and this pin can be removed once PyTorch base image is +# updated. 
+RUN pip install triton==3.1.0 + # MLM ARG MLM_TAG RUN pip uninstall -y megatron-core && \ diff --git a/tests/conftest.py b/conftest.py similarity index 98% rename from tests/conftest.py rename to conftest.py index 8ac1c2af7..c57b01f43 100644 --- a/tests/conftest.py +++ b/conftest.py @@ -22,8 +22,8 @@ from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy from nemo_aligner.models.nlp.gpt.megatron_gpt_ppo_actor import MegatronGPTActorModel +from nemo_aligner.testing.utils import Utils from nemo_aligner.utils.train_script_utils import init_distributed, resolve_and_create_trainer -from tests.test_mcore_utilities import Utils dir_path = os.path.dirname(os.path.abspath(__file__)) # TODO: This file exists because in cases where TRTLLM MPI communicators are involved, @@ -67,7 +67,7 @@ def run_only_on_device_fixture(request, device): @pytest.fixture def init_model_parallel(): - from tests.test_mcore_utilities import Utils + from nemo_aligner.testing.utils import Utils def initialize(*args, **kwargs): Utils.initialize_model_parallel(*args, **kwargs) @@ -401,7 +401,9 @@ def pytest_collection_modifyitems(config, items): def pytest_sessionstart(session): # Remove the file at the start of the session, if it exists - if os.path.exists(SUCCESS_FILE) and os.environ["LOCAL_RANK"] == "0": + if os.path.exists(SUCCESS_FILE) and ( + os.environ.get("LOCAL_RANK", None) == "0" or os.environ.get("OMPI_COMM_WORLD_LOCAL_RANK", None) == "0" + ): os.remove(SUCCESS_FILE) diff --git a/docs/user-guide-experimental/README.md b/docs/user-guide-experimental/README.md new file mode 100644 index 000000000..4801e725c --- /dev/null +++ b/docs/user-guide-experimental/README.md @@ -0,0 +1,5 @@ +# Experimental Docs + +This directory contains documentation for features that are still experimental or under development and not yet ready for general use. 
+ +More context can be found in the [experimental/README.md](../../nemo_aligner/experimental/README.md) file. \ No newline at end of file diff --git a/docs/user-guide/dpo.rst b/docs/user-guide/dpo.rst index fa75941d0..d227df6f5 100644 --- a/docs/user-guide/dpo.rst +++ b/docs/user-guide/dpo.rst @@ -46,10 +46,14 @@ To start, we must first get a pretrained model to align. There are two models we --in-folder ./model_checkpoint \ --out-file ./mcore_gpt.nemo - .. tab-item:: LLaMa3 7B + .. tab-item:: LLaMa3 8B :sync: key2 - #. Download the `Llama 3 8B LLM model and tokenizer `__ into the models folder. + #. Download the `Llama 3 8B LLM model and tokenizer `__ into the models folder. You can use the Hugging Face CLI for this: + .. code-block:: bash + + huggingface-cli download meta-llama/Meta-Llama-3-8B --local-dir /path/to/llama + #. Convert the LLaMa3 LLM into ``.nemo`` format. .. code-block:: bash diff --git a/docs/user-guide/index.rst b/docs/user-guide/index.rst index bf80fb618..e467c335c 100644 --- a/docs/user-guide/index.rst +++ b/docs/user-guide/index.rst @@ -7,6 +7,7 @@ sft.rst knowledge-distillation.rst + reinforce.rst dpo.rst rlhf.rst steerlm.rst @@ -25,6 +26,9 @@ :ref:`Supervised Fine-Tuning (SFT) with Knowledge Distillation ` In this section, we walk through a variation of SFT using Knowledge Distillation where we train a smaller "student" model using a larger "teacher" model. +:ref:`Model Alignment by REINFORCE ` + In this tutorial, we will guide you through the process of aligning a NeMo Framework model using REINFORCE. This method can be applied to various models, including LLaMa2 and Mistral, with our scripts functioning consistently across different models. + :ref:`Model Alignment by DPO, RPO and IPO ` DPO, RPO, and IPO are simpler alignment methods compared to RLHF. DPO introduces a novel parameterization of the reward model in RLHF, which allows us to extract the corresponding optimal policy. 
Similarly, RPO and IPO provide alternative parameterizations or optimization strategies, each contributing unique approaches to refining model alignment. @@ -75,6 +79,14 @@ - Yes - Yes - + * - :ref:`REINFORCE ` + - Yes + - Yes + - Yes + - Yes (✓) + - Yes + - Yes + - * - :ref:`DPO ` - - Yes (✓) diff --git a/docs/user-guide/reinforce.rst b/docs/user-guide/reinforce.rst index cc3005db1..6d7897281 100644 --- a/docs/user-guide/reinforce.rst +++ b/docs/user-guide/reinforce.rst @@ -1,16 +1,16 @@ .. include:: /content/nemo.rsts -.. _model-aligner-reinforce: +.. _nemo-aligner-reinforce: Model Alignment by REINFORCE -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@ In this tutorial, we will guide you through the process of aligning a NeMo Framework model using REINFORCE. This method can be applied to various models, including LLaMa2 and Mistral, with our scripts functioning consistently across different models. REINFORCE is usually preceded by a Supervised Fine-Tuning (SFT). We should first follow the :ref:`Prerequisite guide ` and the :ref:`SFT guide `. After obtaining the SFT model, we will also need to train a reward model as in :ref:`PPO guide `. We will use the REINFORCE algorithm on the `Anthropic-HH-RLHF `__ dataset. REINFORCE Training -############ +################## After you have fine-tuned a GPT model using Supervised Fine-Tuning (SFT), and trained a reward model as explained in the preceding section, you can start aligning the policy using REINFORCE. @@ -48,7 +48,7 @@ To launch the server: The above example launches the reward model server on eight GPUs and one node. Make sure to change trainer.devices, trainer.num_nodes depending on your model size and scale. Aligner will work on any scale. Also, make sure to tune the trainer.reinforce.inference_micro_batch_size argument. This argument sets the size of the batch the REINFORCE actor is allowed to send to the reward per DP rank. 
Launch the Initial Policy and REINFORCE Actor Training -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% The REINFORCE Actor training job contains the master controller that makes the HTTP calls to all servers when needed. To launch the REINFORCE Actor and Initial Policy server: @@ -58,7 +58,7 @@ The REINFORCE Actor training job contains the master controller that makes the H TRAIN_DATA_PATH="/path/to/train_prompts.jsonl" VALID_DATA_PATH="/path/to/test_prompts.jsonl" - PRETRAINED_ACTOR_NEMO_FILE="/path/to/sft_checkpoint.nemo" + ACTOR_NEMO_FILE="/path/to/sft_checkpoint.nemo" RESULTS_DIR="/path/to/actor_results_dir" USE_FLASK=False @@ -73,7 +73,7 @@ The REINFORCE Actor training job contains the master controller that makes the H cd ${GPFS} export PYTHONPATH="${GPFS}:${PYTHONPATH}" \ && export HYDRA_FULL_ERROR=1 \ - && python -u examples/nlp/gpt/train_gpt_reinforce_actor.py \ + && mpirun -n 8 --allow-run-as-root python -u examples/nlp/gpt/train_gpt_reinforce_actor.py \ "model.data.data_prefix={train: [${TRAIN_DATA_PATH}], validation: [${VALID_DATA_PATH}], test: [${VALID_DATA_PATH}]}" \ pretrained_checkpoint.restore_from_path=\"${ACTOR_NEMO_FILE}\" \ exp_manager.checkpoint_callback_params.save_top_k=1 \ @@ -114,7 +114,7 @@ The REINFORCE Actor training job contains the master controller that makes the H The above command launches the initial and actor server on one node with eight GPUs. 
Launching Both Servers for REINFORCE training -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% You can use slurm to launch the two jobs and get them to coordinate together in a full REINFORCE job through the following: @@ -239,7 +239,7 @@ You can use slurm to launch the two jobs and get them to coordinate together in trainer.reinforce.rollout_batch_seq_length=4096 EOF - srun --het-group=1 -o $PPO_OUTFILE -e $PPO_ERRFILE --container-image=${CONTAINER} $MOUNTS bash -c "${cmd_reinforce}" & + srun --mpi=pmix --het-group=1 -o $PPO_OUTFILE -e $PPO_ERRFILE --container-image=${CONTAINER} $MOUNTS bash -c "${cmd_reinforce}" & wait @@ -251,6 +251,6 @@ It is important to launch all jobs with ``&`` after the srun command to ensure t Make sure to change the reward model arg ``trainer.reinforce.inference_micro_batch_size`` such that ``trainer.reinforce.inference_micro_batch_size * DP size <= model.reinforce.rollout_micro_batch_size``. REINFORCE Results -%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%% -After you've completed reinforce training, you can serve your model using the `megatron_gpt_eval.py `__ script from the NeMo codebase to run more rigorous evaluation of your trained model. \ No newline at end of file +After you've completed reinforce training, you can serve your model using the `megatron_gpt_eval.py `__ script from the NeMo codebase to run more rigorous evaluation of your trained model. diff --git a/docs/user-guide/rlhf.rst b/docs/user-guide/rlhf.rst index 5c68edb60..3e98e7fe2 100644 --- a/docs/user-guide/rlhf.rst +++ b/docs/user-guide/rlhf.rst @@ -383,6 +383,30 @@ NeMo-Aligner has support for accelerating RLHF with `TensorRT-LLM `__. +.. note:: + If you are running ``train_gpt_ppo_actor.py`` interactively (outside of SLURM) with TensorRT-LLM acceleration, + you must prepend ``mpirun -n 8 --allow-run-as-root`` to the python run command: + + .. 
code-block:: bash + + mpirun -n 8 --allow-run-as-root python -u ${GPFS}/examples/nlp/gpt/train_gpt_ppo_actor.py ... + + If you are using SLURM, you do not need to prepend ``mpirun`` since this will be handled automatically + if you run ``srun`` with ``--mpi=pmix``: + + .. code-block:: bash + + read -r -d '' cmd_ppo </ +│ ├── dataset.py <----- experimental dataset +│ ├── new_algo.py <----- experimental algo +│ ├── model.py <----- experimental model +│ └── tests/ +│ └── model_test.py <----- experimental model test +└── tests/ + └── functional/ + └── dpo.sh + └── test_cases/ + └── dpo-llama3 + └── functional_experimental/ <----- experimental functional tests (mirrors functional/ structure) + ├── new_algo.sh + └── test_cases/ + └── new_algo-llama3 +``` + +The directories below exist to organize experimental projects (source code), tests, and documentation. + +- [nemo_aligner/experimental/](../../nemo_aligner/experimental/): Main experimental sub-package containing projects under development +- [tests/functional_experimental/](../../tests/functional_experimental/): Functional tests for experimental projects +- [docs/user-guide-experimental/](../../docs/user-guide-experimental/): Documentation directory for experimental features and algorithms + +The `experimental` sub-package follows a modular structure where each project has its own directory (sub-package) containing implementation and tests. + +## Guidelines for "experimental/" Projects + +- **Scope**: Projects can include new model definitions, training loops, utilities, or unit tests. +- **Independence**: Projects should ideally be independent. Dependence on other projects signals it might benefit from being added to core with tests (and documentation if applicable). +- **Testing**: Must include at least one functional test [example](../../tests/functional/test_cases/dpo-llama3). 
diff --git a/nemo_aligner/experimental/__init__.py b/nemo_aligner/experimental/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nemo_aligner/package_info.py b/nemo_aligner/package_info.py index 98a8850de..e219e397d 100644 --- a/nemo_aligner/package_info.py +++ b/nemo_aligner/package_info.py @@ -23,7 +23,13 @@ VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE, DEV) __shortversion__ = ".".join(map(str, VERSION[:3])) -__version__ = __shortversion__ + VERSION[3] + "." + ".".join(VERSION[4:]) +__version__ = __shortversion__ + +if VERSION[3] != "": + __version__ = __version__ + VERSION[3] + +if VERSION[4] != "": + __version__ = __version__ + "." + ".".join(VERSION[4:]) __package_name__ = "nemo_aligner" __contact_names__ = "NVIDIA" diff --git a/tests/test_mcore_utilities.py b/nemo_aligner/testing/utils.py similarity index 100% rename from tests/test_mcore_utilities.py rename to nemo_aligner/testing/utils.py diff --git a/nemo_aligner/utils/tests/__init__.py b/nemo_aligner/utils/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_distributed.py b/nemo_aligner/utils/tests/distributed_test.py similarity index 100% rename from tests/test_distributed.py rename to nemo_aligner/utils/tests/distributed_test.py diff --git a/tests/test_ppo_utils.py b/nemo_aligner/utils/tests/ppo_utils_test.py similarity index 100% rename from tests/test_ppo_utils.py rename to nemo_aligner/utils/tests/ppo_utils_test.py diff --git a/tests/test_text_generation_utils.py b/nemo_aligner/utils/tests/text_generation_utils_test.py similarity index 100% rename from tests/test_text_generation_utils.py rename to nemo_aligner/utils/tests/text_generation_utils_test.py diff --git a/tests/test_trainer_utils.py b/nemo_aligner/utils/tests/trainer_utils_test.py similarity index 100% rename from tests/test_trainer_utils.py rename to nemo_aligner/utils/tests/trainer_utils_test.py diff --git a/tests/test_trt_llm.py b/nemo_aligner/utils/tests/trt_llm_test.py 
similarity index 100% rename from tests/test_trt_llm.py rename to nemo_aligner/utils/tests/trt_llm_test.py diff --git a/tests/test_utils.py b/nemo_aligner/utils/tests/utils_test.py similarity index 100% rename from tests/test_utils.py rename to nemo_aligner/utils/tests/utils_test.py diff --git a/setup/requirements.txt b/setup/requirements.txt index d074f3672..4aa22afa1 100644 --- a/setup/requirements.txt +++ b/setup/requirements.txt @@ -3,3 +3,6 @@ jsonlines megatron_core>=0.8 nemo_toolkit[nlp] nvidia-pytriton +# pynvml pin is needed for TRTLLM v0.13.0 since 12.0.0 contains a breaking change. +pynvml==11.5.3 +tensorrt-llm==0.13.0 diff --git a/tests/functional_experimental/README.md b/tests/functional_experimental/README.md new file mode 100644 index 000000000..69694a82d --- /dev/null +++ b/tests/functional_experimental/README.md @@ -0,0 +1,3 @@ +# Experimental Functional Tests + +More context can be found in the [experimental/README.md](../../nemo_aligner/experimental/README.md) file. \ No newline at end of file diff --git a/tests/functional_experimental/test_cases/.gitkeep b/tests/functional_experimental/test_cases/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/tests/functional_experimental/test_data/.gitkeep b/tests/functional_experimental/test_data/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/tests/run_mpi_unit.sh b/tests/run_mpi_unit.sh index e11e5cf10..d905d09cd 100755 --- a/tests/run_mpi_unit.sh +++ b/tests/run_mpi_unit.sh @@ -24,9 +24,9 @@ if [[ $NUM_GPUS_AVAILABLE -lt 2 ]]; then fi export PYTHONPATH=$(realpath ..):${PYTHONPATH:-} -CUDA_VISIBLE_DEVICES=0,1 mpirun -np 2 --allow-run-as-root pytest .. 
-rA -s -x -vv --mpi $@ || true +CUDA_VISIBLE_DEVICES=0,1 mpirun -np 2 --allow-run-as-root pytest ../nemo_aligner -rA -s -x -vv --mpi $@ || true -if [[ -f PYTEST_SUCCESS ]]; then +if [[ -f ../PYTEST_SUCCESS ]]; then echo SUCCESS else echo FAILURE diff --git a/tests/run_unit.sh b/tests/run_unit.sh index 41216da52..619379dcc 100755 --- a/tests/run_unit.sh +++ b/tests/run_unit.sh @@ -24,9 +24,9 @@ if [[ $NUM_GPUS_AVAILABLE -lt 2 ]]; then fi export PYTHONPATH=$(realpath ..):${PYTHONPATH:-} -CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 -m pytest .. -rA -s -x -vv $@ || true +CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 -m pytest ../nemo_aligner -rA -s -x -vv $@ || true -if [[ -f PYTEST_SUCCESS ]]; then +if [[ -f ../PYTEST_SUCCESS ]]; then echo SUCCESS else echo FAILURE