Evo2 #694 (Open)

Wants to merge 149 commits into base: main.

Changes shown from 76 of 149 commits.

Commits
50db0ca
[cye/evo2-llm-dev] Private internal development branch for Evo2 in Bi…
cspades Nov 16, 2024
737f16c
[cye/evo2-llm-dev] Add rough draft of data preprocessing for Evo2.
cspades Dec 4, 2024
a142109
Add manual data test for evo2
jstjohn Dec 4, 2024
0ad0bee
Change remotes for submodules for now
jstjohn Dec 5, 2024
82c832f
Cye/nemo2 fixes
cspades Dec 5, 2024
945506f
Write model checkpoint context and set Evo2Dataset in the pre-training.
cspades Dec 10, 2024
4fc1d84
Fix inference script to make sense, i.e. no seq parallelism for decod…
cspades Dec 11, 2024
f5adde5
Cye/fix Hyena species biases
cspades Dec 16, 2024
b9dfd5c
Hyena golden value test
jstjohn Dec 19, 2024
e6278d9
[cye/blended-training] Expose blended weights for training Hyena.
cspades Dec 21, 2024
dd0aab1
Changes for 256 node training run
jstjohn Dec 23, 2024
0560ee4
Integrate BioNeMo Noodles into Hyena data preprocessing.
cspades Dec 24, 2024
5511fe7
[cye/lineage-str] Clean up interface for taxonomic lineage tokens in …
cspades Jan 3, 2025
92d0352
Changes made on 256 node branch
jstjohn Jan 3, 2025
923cbdf
Cye/hyena flops
cspades Jan 3, 2025
6460ea3
Fix broken import of blended training config.
cspades Jan 3, 2025
7e72f48
Cye/import fix
cspades Jan 3, 2025
45923c6
Add improved nsys profiling support
jstjohn Jan 6, 2025
c805984
[cye/hyena-doc-update] Add data preprocessing documentation, fix tech…
cspades Jan 7, 2025
f5b15f3
[cye/transcript-readme] Add main documentation snippets for Hyena, an…
cspades Jan 8, 2025
9ba9e07
Bump nemo version to the new context length insensitive code, and upd…
jstjohn Jan 10, 2025
854951f
added flag for tflops callback
dorotat-nv Jan 13, 2025
ada349e
[cye/evo2-ckpt-utils] Add Evo2 ZeRO-1/3 to NeMo checkpointing utils.
cspades Jan 13, 2025
652dfe0
Add test for evo2 tokenizer.
jwilber Jan 14, 2025
265a0be
Fix nemo-savanna repo build in CI
dorotat-nv Jan 14, 2025
fb09377
fixing format issues on evo2-dev
dorotat-nv Jan 14, 2025
9cacf1b
Add tests for parallel hyena operators used in evo2
jwilber Jan 14, 2025
9ac11eb
Rebase on OSS.
cspades Jan 14, 2025
5631b93
[cye/tp-comm-fix] Fix TP communication overlap inconsistency.
cspades Jan 15, 2025
9ae9af0
Add temporary fix for shard-tensor bug in Megatron-LM
dorotat-nv Jan 16, 2025
c032408
Add initial test for preprocess.py
jwilber Jan 17, 2025
b6d238f
Bump NeMo to pick up FLOPS calculations.
cspades Jan 17, 2025
7822c04
[cye/z3-log-fix] Fix parameter count log.
cspades Jan 21, 2025
9378223
[cye/docker-patch-fix] Move Megatron patch to BioNeMo base image in D…
cspades Jan 22, 2025
9b9176a
shipping hotfix for dockers built locally - fix from main 17c6b20513…
dorotat-nv Jan 24, 2025
329548a
[cye/1m-ckpt-config] Add HyenaConfig options for 1M context length di…
cspades Jan 24, 2025
2ca40b0
[cye/fix-tp-comm-overlap] Fix default tp_comm_overlap=True being used…
cspades Jan 24, 2025
a494478
reducing scope of tested folders for evo2-dev
dorotat-nv Jan 27, 2025
72a311e
Adds basic inference test
jomitchellnv Jan 28, 2025
d4cd785
[cye/deactivate-infer-tpcomm] Deactivate TP communication during infe…
cspades Jan 28, 2025
3ba8946
fix: ensure test looks in test file dir for required data
jwilber Jan 28, 2025
34938fc
m2.5 accuracy 7b runs
jstjohn Jan 29, 2025
5fe2576
Fixes `test_evo2.py` unit test and adds enhancements to existing unit…
jomitchellnv Jan 30, 2025
30e71e9
Fix bug in wandb logger argparse.
jstjohn Jan 30, 2025
635a5df
[cye/pad-loss-mask] Fixes TP comm overlap bug with sequence parallel …
cspades Feb 3, 2025
f141830
Add longphase dataset config to repo
jwilber Feb 3, 2025
493d444
bump Megatron-LM, nemo-savanna and rebase to main OSS
dorotat-nv Feb 5, 2025
1df0176
CI hotfix
dorotat-nv Feb 7, 2025
624e797
test: Create tests for Evo2Dataset mask_phylogenetic_tags
jwilber Feb 7, 2025
75205b0
[cye/torch_dist_fix] Remove torch_dist patch and bump Megatron, reorg…
cspades Feb 11, 2025
0f6efeb
Changes related to accuracy and perf with new nemo2 changes
jstjohn Feb 11, 2025
0af5f9e
[cye/tp-comm-fp8-wgrad-fix] Require --fp8-wgrad when using TP communic…
Feb 12, 2025
0917616
Adding evo2 to JET
dorotat-nv Feb 13, 2025
3d1e19e
Remove sample data from evo2-dev branch
dorotat-nv Feb 13, 2025
9811ae4
[BUGFIX] evo2-dev CI
dorotat-nv Feb 13, 2025
f9133f5
Remove test_mask_phylogenetic tags (moving to nemo repo)
jwilber Feb 14, 2025
e83bd26
attempt at merge -- nemo matches github main. Dockerfile has major bu…
skothenhill-nv Feb 14, 2025
a175d5b
Bump nemo to fix forward bug
jstjohn Feb 14, 2025
ea70cde
Add required changes to work with NeMo upstream
jstjohn Feb 14, 2025
dddf9a4
Add back new context manager for parallel state cleanup
jstjohn Feb 14, 2025
e192982
Move test_config into nemo where the code is
jstjohn Feb 14, 2025
d355729
Fix arg name mismatch
jstjohn Feb 15, 2025
d90c10d
add new license
jwilber Feb 15, 2025
a8432a2
remove tab from license
jwilber Feb 15, 2025
4f2ade5
Bump nemo to fix bug in dataset
jstjohn Feb 15, 2025
3965502
Bump NeMo commit for perf improved loss mask
jstjohn Feb 18, 2025
f09aa36
Adding options for controlling dropout to train.py
jstjohn Feb 18, 2025
955978d
Bump nemo and remove nograd decorator
jstjohn Feb 18, 2025
3e14262
Bump nemo with latest tag masking
jstjohn Feb 18, 2025
46baa5f
Cover non-DNA case due to bug in preprocessing, never have non-dna un…
jstjohn Feb 18, 2025
ef3f55e
Try reverting some of the recent fixes related to TP
jstjohn Feb 18, 2025
af9016e
Bump nemo version with better tested
jstjohn Feb 18, 2025
aafb7a3
Revert loss mask updates
jstjohn Feb 18, 2025
a966b8b
handle 0 token case more gracefully
jstjohn Feb 18, 2025
c4ef1f1
bump NeMo with proper handling of control character containing sequen…
jstjohn Feb 19, 2025
0976fac
Update remote pointers to new public NeMo branches
jstjohn Feb 19, 2025
04982ae
Remove unused Megatron torch_dist sizing patch.
cspades Feb 19, 2025
242f3fe
Remove fasta from test and replace with synthetic sequence
jstjohn Feb 19, 2025
22ada77
Move fasta creation utility into testing sub-package
jstjohn Feb 19, 2025
b5bdec8
Add a test that verifies that the new phylo tag masking code is faste…
jstjohn Feb 19, 2025
ac1bd1f
Move phylo tag benchmark to NeMo testing
jstjohn Feb 19, 2025
bfaebd1
Merge in main
jstjohn Feb 20, 2025
0ae0c50
Update Megatron-LM submodule to commit 62529f1d (has 1M context fix) …
jwilber Feb 20, 2025
2ba5da3
fix config typo in test
jstjohn Feb 21, 2025
253a7f2
bump NeMo to latest PR version
jstjohn Feb 21, 2025
82e9c47
Fix issue causing gh-docs-deploy failure (#698)
jwilber Feb 21, 2025
fa73a00
Update nemo pointer with PR updates
jstjohn Feb 21, 2025
f466774
Add new license to new files (failing ci) (#699)
jwilber Feb 21, 2025
39290e4
Change kingdom to domain in tag description
jstjohn Feb 21, 2025
b688975
Merge in upstream
jstjohn Feb 21, 2025
94c4283
Merge branch 'main' of github.com:NVIDIA/bionemo-framework into evo2
jstjohn Feb 21, 2025
15c7dca
Make new versions of the files available freshly converted from HF
jstjohn Feb 22, 2025
3324bd4
bump nemo version to fix broken import
jstjohn Feb 22, 2025
ce133d2
bump nemo to top of tree
jstjohn Feb 22, 2025
78f92b5
Adding in the predict method and test
jstjohn Feb 24, 2025
bb5f5a1
Merge branch 'main' of github.com:NVIDIA/bionemo-framework into evo2
jstjohn Feb 24, 2025
d9e4952
bump NeMo commit
jstjohn Feb 24, 2025
b148750
Fix multipart download naming in nemo
jstjohn Feb 24, 2025
ba1d9bf
Update docs for checkpoint conversion
jstjohn Feb 24, 2025
0af3e0a
shrink tests down to 1b case
jstjohn Feb 24, 2025
c5e42d8
add end to end fine-tuning tutorial
jstjohn Feb 25, 2025
544b7a8
ignore object hashes in precommit
jstjohn Feb 25, 2025
d7a8ea7
Bump nemo pointer to latest PR pointer
jstjohn Feb 25, 2025
07c48b8
Update ci/benchmarks/partial-conv/evo2_pretrain.yaml
jstjohn Feb 25, 2025
e779f60
Update ci/benchmarks/perf/evo2_pretrain.yaml
jstjohn Feb 25, 2025
a1c8048
Slightly smaller test_train.py
jstjohn Feb 25, 2025
46edcb6
Add missing main function for inference cli
jstjohn Feb 25, 2025
e81eef3
Add --batch-size option to predict
jstjohn Feb 25, 2025
4e5acda
Fixing the description of the 1b model
jstjohn Feb 25, 2025
5bd0e2c
remove hard-coded PBSS
jstjohn Feb 26, 2025
ca16c2a
Remove comment block from code
jstjohn Feb 26, 2025
5248e5d
evo2 train unit test (#704)
dorotat-nv Feb 27, 2025
1e7323b
Updates to benchmarks: evo2 (#705)
dorotat-nv Feb 28, 2025
24f1db0
Add brca1 zeroshot example + predict and scoring updates to evo2.
jwilber Mar 4, 2025
e012146
Add vortex style fp8 support to predict
jstjohn Mar 4, 2025
ec662e4
Update the brca notebook with a run on an fp8 supporting machine
jstjohn Mar 4, 2025
aabd6a4
Merge in upstream changes to bionemo
jstjohn Mar 4, 2025
66ead75
add missing/new NGC urls
jstjohn Mar 4, 2025
0c67976
Remove fasta from pre commit
jstjohn Mar 4, 2025
ae81e4d
Remove TODOs related to PBSS
jstjohn Mar 4, 2025
b15cc82
Moved test config into the tests/config dir with the other configs
jstjohn Mar 4, 2025
a752309
Address yaml location feedback
jstjohn Mar 4, 2025
b1bb99d
Add new test covering padding and seq dims
jstjohn Mar 4, 2025
4f67795
Address comments on documentation
jstjohn Mar 4, 2025
97d3845
Run pre-commit on docs
jstjohn Mar 4, 2025
7473e14
Address PR feedback on test naming
jstjohn Mar 4, 2025
6be9801
Refactor out fasta dataset, add tests for it (#716)
jwilber Mar 4, 2025
0e13ad0
Bump nemo commit with predict changes
jstjohn Mar 4, 2025
bf08649
no longer needed since we do not have committed fastas
jstjohn Mar 4, 2025
04914d2
Reformat to pass pre-commit
jstjohn Mar 4, 2025
48cab0a
update readme to mention predict (#717)
jwilber Mar 4, 2025
1d19941
Fix parallel short hyena operator test
jwilber Mar 4, 2025
14ef1ea
Add slow tests for 7b
jstjohn Mar 4, 2025
4f83438
Update faster 1b test with lower precision so it passes in CI
jstjohn Mar 4, 2025
f235b09
Merge branch 'evo2' of github.com:NVIDIA/bionemo-framework into evo2
jstjohn Mar 4, 2025
25fefc8
Address formatting issues
jstjohn Mar 4, 2025
e34c44d
Leave megatron-lm as is and add more stringent slow test along with l…
jstjohn Mar 4, 2025
51a8c7a
Bump nemo as well
jstjohn Mar 4, 2025
19b2289
Merge branch 'main' of github.com:NVIDIA/bionemo-framework into evo2
jstjohn Mar 4, 2025
3204869
Update pointer to evo2 test file
jstjohn Mar 5, 2025
70b266c
only run most stringent comparison with h100
jstjohn Mar 5, 2025
9c7e3f7
add missing ngc link for new 7b-8k checkpoint
jstjohn Mar 5, 2025
01f8f05
Fixing shape issue in parallel short hyena test
jstjohn Mar 5, 2025
88f2c48
Address issue with pycache when there are tests with the same name in…
jstjohn Mar 5, 2025
825879d
Move to per-package tests for slow as well as fast tests
jstjohn Mar 5, 2025
66d99f8
Handle no tests found case
jstjohn Mar 5, 2025
a6b2a4a
Add option for allowing no slow tests for a submodule
jstjohn Mar 5, 2025
70bfee4
Handle exit code capturing within the context of a pipefail script pr…
jstjohn Mar 5, 2025
46fee2c
Merge branch 'main' into evo2
jstjohn Mar 5, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -187,6 +187,7 @@ dist/
 coverage.xml

 # Jupyter Notebook
+notebooks/
 .ipynb_checkpoints

 # System files
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
   hooks:
     - id: detect-secrets
       name: detect-secrets (everything but notebooks)
-      args: ['--baseline', '.secrets.baseline', '--exclude-files', '(.*\.ipynb|.*\.baseline)$', ]
+      args: ['--baseline', '.secrets.baseline', '--exclude-files', '(.*\.ipynb|.*\.baseline|.*\.fasta)$', ]
       exclude: package.lock.json
     - id: detect-secrets
       name: detect-secrets (notebooks only)
6 changes: 3 additions & 3 deletions .secrets.baseline
@@ -128,7 +128,7 @@
       {
         "path": "detect_secrets.filters.regex.should_exclude_file",
         "pattern": [
-          "(.*\\.ipynb|.*\\.baseline)$"
+          "(.*\\.ipynb|.*\\.baseline|.*\\.fasta)$"
         ]
       }
     ],
@@ -139,9 +139,9 @@
         "filename": "pyproject.toml",
         "hashed_secret": "79670e9c9d1c7ea5b81a96a2053d81437712c78e",
         "is_verified": false,
-        "line_number": 44
+        "line_number": 45
       }
     ]
   },
-  "generated_at": "2025-01-15T19:06:19Z"
+  "generated_at": "2025-01-30T14:18:42Z"
}
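Both of these changes widen the same detect-secrets exclude pattern so that FASTA files are no longer scanned for secrets. A quick, illustrative check of the new regex (not how detect-secrets invokes it internally):

```python
import re

# The updated exclude pattern shared by .pre-commit-config.yaml and .secrets.baseline.
pattern = re.compile(r"(.*\.ipynb|.*\.baseline|.*\.fasta)$")

for name in ["analysis.ipynb", ".secrets.baseline", "chr20.fasta", "train.py"]:
    print(f"{name}: excluded={bool(pattern.match(name))}")
# Only train.py remains subject to secret scanning.
```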
2 changes: 1 addition & 1 deletion 3rdparty/Megatron-LM
Submodule Megatron-LM updated 253 files
2 changes: 1 addition & 1 deletion 3rdparty/NeMo
Submodule NeMo updated 396 files
63 changes: 63 additions & 0 deletions ci/benchmarks/partial-conv/evo2_pretrain.yaml
@@ -0,0 +1,63 @@
scope: partial-conv
time_limit: 14400
script_args:
# All arguments referenced in the script string must be specified here.
# Arguments not referenced in the script string must have the 'arg' field specified.
# See jet/core/configs.py for the specification of the configuration class
workspace:
value: /workspace/bionemo2
key_segment: False
data_path:
value: /data/evo2
key_segment: False
model:
value: evo2
variant:
value: train
config_name:
value: 7b
precision:
value: fp8
nodes:
value: 4
gpus:
value: 8
batch_size:
value: 2
pp:
value: 1
tp:
value: 8
cp:
value: 1
acc_grad:
value: 1
max_steps:
value: 20000
script: |-
WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py \
-d ${workspace}/ci/benchmarks/test_dataset_config.yaml \
--dataset-path ${data_path} \
--grad-acc-batches ${acc_grad} \
--fp8 \
--enable-preemption \
--ckpt-async-save \
--seq-length=8192 \
--tensor-parallel-size=${tp} \
--context-parallel-size=${cp} \
--pipeline-model-parallel-size=${pp} \
--workers 8 \
--num-nodes=${nodes} \
--devices=${gpus} \
--micro-batch-size=${batch_size} \
--model-size=${config_name} \
--max-steps=${max_steps} \
--limit-val-batches=20 \
--log-every-n-steps=50 \
--val-check-interval=500 \
--tflops-callback \
--experiment-dir=${tensorboard_dir}/${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
--wandb-project=${wandb_project_name} \
--wandb-group=${model}_${variant}_${config_name}__${target} \
--wandb-job-type=${pipeline_label} \
--disable-checkpointing;
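For context on how this config becomes a command: the `${...}` placeholders in `script` are filled from the `script_args` values above, plus pipeline-supplied variables such as `tensorboard_dir`, `wandb_project_name`, `target`, and `pipeline_label`. A minimal sketch of that substitution, assuming a plain `string.Template`-style expansion rather than JET's actual resolver (see jet/core/configs.py):

```python
# Illustrative sketch only: resolve ${...} placeholders in the benchmark
# script string. JET's real configuration class handles this for us.
from string import Template

import yaml

with open("ci/benchmarks/partial-conv/evo2_pretrain.yaml") as f:
    cfg = yaml.safe_load(f)

# Flatten {name: {value: ..., ...}} into {name: value}.
args = {name: spec["value"] for name, spec in cfg["script_args"].items()}

# These come from the pipeline, not script_args; placeholder values here.
args.update(tensorboard_dir="/results/tb", wandb_project_name="evo2-demo",
            target="demo", pipeline_label="dev")

# safe_substitute leaves unresolved names (e.g. $BIONEMO_WANDB_API_KEY) as-is.
print(Template(cfg["script"]).safe_substitute(args))
```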
67 changes: 67 additions & 0 deletions ci/benchmarks/perf/evo2_pretrain.yaml
@@ -0,0 +1,67 @@
scope: perf
time_limit: 1800
script_args:
# All arguments referenced in the script string must be specified here.
# Arguments not referenced in the script string must have the 'arg' field specified.
# See jet/core/configs.py for the specification of the configuration class
workspace:
value: /workspace/bionemo2
key_segment: False
data_path:
value: /data/evo2
key_segment: False
model:
value: evo2
variant:
value: train
precision:
value: fp8
gpus:
value: 8
batch_size:
value: 2
max_steps:
value: 100
tp:
value: 8
cp:
value: 1
pp:
value: 1
acc_grad:
value: 1
products:
- nodes: 1
config_name: 7b
- nodes: 2
config_name: 7b
- nodes: 8
config_name: 40b
script: |-
WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/${variant}.py \
-d ${workspace}/ci/benchmarks/test_dataset_config.yaml \
--dataset-path ${data_path} \
--grad-acc-batches ${acc_grad} \
--fp8 \
--enable-preemption \
--ckpt-async-save \
--use-megatron-comm-overlap-llama3-8k \
--seq-length=8192 \
--tensor-parallel-size=${tp} \
--context-parallel-size=${cp} \
--pipeline-model-parallel-size=${pp} \
--workers 8 \
--num-nodes=${nodes} \
--devices=${gpus} \
--micro-batch-size=${batch_size} \
--model-size=${config_name} \
--max-steps=${max_steps} \
--limit-val-batches=20 \
--log-every-n-steps=50 \
--val-check-interval=${max_steps} \
--tflops-callback \
--experiment-dir=${tensorboard_dir}/${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
--wandb-project=${wandb_project_name} \
--wandb-group=${model}_${variant}_${config_name}__${target} \
--wandb-job-type=${pipeline_label} \
--disable-checkpointing;
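Unlike the partial-conv config, this one defines a `products` matrix: each entry overrides `nodes` and `config_name` while inheriting the shared arguments, yielding three runs (7b on 1 and 2 nodes, 40b on 8 nodes). A hedged sketch of that expansion, assuming `products` sits alongside the other `script_args` entries as shown:

```python
# Illustrative expansion of the perf-benchmark product matrix into one
# flat argument dict per run; the real JET scheduler does this itself.
import yaml

with open("ci/benchmarks/perf/evo2_pretrain.yaml") as f:
    cfg = yaml.safe_load(f)

script_args = cfg["script_args"]
base = {name: spec["value"] for name, spec in script_args.items()
        if name != "products"}

runs = [{**base, **product} for product in script_args["products"]]
for run in runs:
    print(f"{run['config_name']} on {run['nodes']} node(s), "
          f"tp={run['tp']}, mbs={run['batch_size']}")
# 7b on 1 node(s) ... / 7b on 2 node(s) ... / 40b on 8 node(s) ...
```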
81 changes: 81 additions & 0 deletions ci/benchmarks/test_dataset_config.yaml
@@ -0,0 +1,81 @@
- dataset_prefix: metagenomics/pretraining_data_metagenomics/data_metagenomics_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.18
- dataset_prefix: gtdb_v220/gtdb_v220_imgpr_merged_data/data_gtdb_imgpr_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.24
- dataset_prefix: imgvr/pretraining_data_imgvr/data_imgvr_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.03
- dataset_prefix: ncrna/pretraining_data_ncrna/data_ncrna_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.02
- dataset_prefix: mrna/pretraining_data_mrna/data_mrna_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.09
- dataset_prefix: euk_windows/stitched_transcripts/pretraining_data_stiched_mrna/data_mrna_stitch_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.09
- dataset_prefix: euk_windows/windows_split/5kb_windows_lowercase/5kb_windows_lowercase_pretraining_data/windows_5kb_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.35
- dataset_prefix: promoters/pretraining_data_promoters/data_promoters_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.0003
- dataset_prefix: organelle/pretraining_data_organelle/data_organelle_train_text_CharLevelTokenizer_document
dataset_split: train
dataset_weight: 0.005
- dataset_prefix: metagenomics/pretraining_data_metagenomics/data_metagenomics_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.18
- dataset_prefix: gtdb_v220/gtdb_v220_imgpr_merged_data/data_gtdb_imgpr_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.24
- dataset_prefix: imgvr/pretraining_data_imgvr/data_imgvr_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.03
- dataset_prefix: ncrna/pretraining_data_ncrna/data_ncrna_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.02
- dataset_prefix: mrna/pretraining_data_mrna/data_mrna_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.09
- dataset_prefix: euk_windows/stitched_transcripts/pretraining_data_stiched_mrna/data_mrna_stitch_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.09
- dataset_prefix: euk_windows/windows_split/5kb_windows_lowercase/5kb_windows_lowercase_pretraining_data/windows_5kb_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.35
- dataset_prefix: promoters/pretraining_data_promoters/data_promoters_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.0003
- dataset_prefix: organelle/pretraining_data_organelle/data_organelle_valid_text_CharLevelTokenizer_document
dataset_split: validation
dataset_weight: 0.005
- dataset_prefix: metagenomics/pretraining_data_metagenomics/data_metagenomics_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.18
- dataset_prefix: gtdb_v220/gtdb_v220_imgpr_merged_data/data_gtdb_imgpr_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.24
- dataset_prefix: imgvr/pretraining_data_imgvr/data_imgvr_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.03
- dataset_prefix: ncrna/pretraining_data_ncrna/data_ncrna_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.02
- dataset_prefix: mrna/pretraining_data_mrna/data_mrna_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.09
- dataset_prefix: euk_windows/stitched_transcripts/pretraining_data_stiched_mrna/data_mrna_stitch_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.09
- dataset_prefix: euk_windows/windows_split/5kb_windows_lowercase/5kb_windows_lowercase_pretraining_data/windows_5kb_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.35
- dataset_prefix: promoters/pretraining_data_promoters/data_promoters_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.0003
- dataset_prefix: organelle/pretraining_data_organelle/data_organelle_test_text_CharLevelTokenizer_document
dataset_split: test
dataset_weight: 0.005
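Each entry's `dataset_weight` is a relative sampling proportion for the blended dataset; within a split the weights here sum to roughly 1.005 rather than exactly 1.0, which is fine since blended-dataset weights are typically renormalized downstream. A small sanity-check sketch (assumes only the flat list layout shown above):

```python
# Sanity check: total blended-dataset weight per split.
from collections import defaultdict

import yaml

with open("ci/benchmarks/test_dataset_config.yaml") as f:
    entries = yaml.safe_load(f)

totals = defaultdict(float)
for entry in entries:
    totals[entry["dataset_split"]] += entry["dataset_weight"]

for split, total in sorted(totals.items()):
    print(f"{split}: {total:.4f}")  # each split sums to ~1.0053
```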
32 changes: 32 additions & 0 deletions ci/scripts/megatron-lm-mr2604-torch-dist-ckpt-size.patch
@@ -0,0 +1,32 @@
diff --git a/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/core/dist_checkpointing/strategies/filesystem_async.py
index 47ab4d112..48de3218b 100644
--- a/megatron/core/dist_checkpointing/strategies/filesystem_async.py
+++ b/megatron/core/dist_checkpointing/strategies/filesystem_async.py
@@ -113,6 +113,18 @@ class FileSystemWriterAsync(FileSystemWriter):
file_count += 1
return file_name

+ def _copy_to_cpu(ten: torch.Tensor):
+ """Pinned D2H copy (or a simple clone() if already on the CPU).
+
+ Makes sure we perform a `clone` only if we detect incontiguous storage,
+ so that we don't blow up host memory unnecessarily.
+ """
+ ten = ten.detach()
+ if ten.device.type != "cpu":
+ return ten.to("cpu", non_blocking=True)
+ is_view = ten.untyped_storage().size() != ten.numel() * ten.itemsize
+ return ten.clone() if is_view else ten
+
# Prepare bytes / tensor data in each bucket, which will be assigned to each writer process
self.write_buckets = []
for group_name, group_buckets in _split_by_separation_hint(
@@ -125,7 +137,7 @@ class FileSystemWriterAsync(FileSystemWriter):
if item.type == WriteItemType.BYTE_IO
]
tensor_data = [
- (item, planner.resolve_data(item).detach().to("cpu", non_blocking=True))
+ (item, _copy_to_cpu(planner.resolve_data(item)))
for item in bucket
if item.type != WriteItemType.BYTE_IO
]
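The heart of this patch is `_copy_to_cpu`: when a tensor is already on the CPU and merely views a larger storage, a naive save would serialize the entire backing storage and inflate checkpoint size, so the `untyped_storage().size() != numel() * itemsize` test detects exactly that case and clones only the views. A standalone demonstration of the predicate (illustrative, outside the Megatron writer):

```python
# Demonstrates the view-detection predicate used in the patch above.
import torch

def is_storage_view(ten: torch.Tensor) -> bool:
    """True when the tensor does not own its full backing storage."""
    return ten.untyped_storage().size() != ten.numel() * ten.itemsize

full = torch.zeros(1024, 1024)
sliced = full[:1]  # shares storage with `full`

print(is_storage_view(full))    # False: storage size matches numel * itemsize
print(is_storage_view(sliced))  # True: 1024 elements over a 1024*1024 storage
```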
1 change: 0 additions & 1 deletion ci/scripts/run_pytest.sh
@@ -97,7 +97,6 @@ echo "Test directories: ${TEST_DIRS[*]}"
 # Run tests with coverage
 for dir in "${TEST_DIRS[@]}"; do
   echo "Running pytest in $dir"
-
   if ! pytest "${PYTEST_OPTIONS[@]}" --junitxml=$(basename $dir).junit.xml -o junit_family=legacy "$dir"; then
     error=true
   fi
5 changes: 3 additions & 2 deletions ci/scripts/utils.sh
@@ -20,10 +20,11 @@ check_git_repository() {
   if ! git diff-index --quiet HEAD --; then
     if [ $? -eq 128 ]; then
       echo "ERROR: Not in a git repository!" >&2
+      return 1
     else
-      echo "ERROR: Repository is dirty! Commit all changes before building the image!" >&2
+      echo "Warning: Repository is dirty! Commit all changes before building the image!" >&2
+      return 0
     fi
-    return 1
   fi
}

65 changes: 61 additions & 4 deletions internal/infra-bionemo/src/infra_bionemo/license_check.py
@@ -44,8 +44,10 @@
     "main",
 )

-LICENSE_HEADER: str = """
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+NVIDIA_COPYRIGHT: str = (
+    "# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved."
+)
+APACHE_BLOCK: str = """
 # SPDX-License-Identifier: LicenseRef-Apache2
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -61,6 +63,9 @@
 # limitations under the License.
 """.strip()

+# default header (split to allow for intermediate copyright headers)
+LICENSE_HEADER = f"{NVIDIA_COPYRIGHT}\n{APACHE_BLOCK}"
+

 @dataclass(frozen=True)
 class HeaderNotFound(ValueError):
@@ -134,8 +139,60 @@ def is_valid_python(pyfile_contents: str) -> Optional[SyntaxError]:


 def has_header(pyfile_contents: str, *, license_header: str = LICENSE_HEADER) -> bool:
-    """True if the :param:`pyfile_contents` starts with the :param:`license_header`. False otherwise."""
-    return pyfile_contents.startswith(license_header)
+    """Check if file has valid license header.
+
+    First checks if file has multiple copyright lines - if so, validates structure only.
+    If not, and custom license_header provided, does exact string match.
+    Otherwise validates basic structure.
+    """
+    lines = pyfile_contents.split("\n")
+
+    # Count copyright lines at start of file
+    copyright_count = 0
+    for line in lines:
+        if line.strip().startswith("# SPDX-FileCopyrightText: Copyright"):
+            copyright_count += 1
+        else:
+            break
+
+    # If file has multiple copyrights, only validate structure
+    if copyright_count > 1:
+        # Must start with NVIDIA copyright
+        if not lines or not lines[0].strip() == NVIDIA_COPYRIGHT:
+            return False
+
+        # Find where Apache block starts
+        apache_start = None
+        for i, line in enumerate(lines):
+            if line.strip().startswith("# SPDX-License-Identifier: LicenseRef-Apache2"):
+                apache_start = i
+                break
+
+        if apache_start is None:
+            return False
+
+        # All lines between NVIDIA copyright and Apache block must be valid SPDX copyright lines
+        for line in lines[1:apache_start]:
+            if line.strip() and not line.strip().startswith("# SPDX-FileCopyrightText: Copyright"):
+                return False
+
+        # Check Apache block matches exactly
+        apache_lines = APACHE_BLOCK.split("\n")
+        if len(lines[apache_start:]) < len(apache_lines):
+            return False
+
+        for actual, expected in zip(lines[apache_start : apache_start + len(apache_lines)], apache_lines):
+            if actual.strip() != expected.strip():
+                return False
+
+        return True
+
+    # Otherwise, if custom header provided, use exact match
+    if license_header != LICENSE_HEADER:
+        return pyfile_contents.startswith(license_header)
+
+    # Otherwise do basic structure validation
+    return lines[0].strip() == NVIDIA_COPYRIGHT and pyfile_contents.startswith(LICENSE_HEADER)


def append_license_header(pyfile_contents: str, *, license_header: str = LICENSE_HEADER, n_sep_lines: int = 2) -> str:
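To make the new multi-copyright behavior concrete, here is a hedged usage sketch; the import path follows this file's location under internal/infra-bionemo/src/, and the second copyright holder is purely hypothetical:

```python
# Usage sketch for the updated has_header; the extra copyright holder
# below is hypothetical, purely to exercise the multi-copyright path.
from infra_bionemo.license_check import APACHE_BLOCK, NVIDIA_COPYRIGHT, has_header

single = f"{NVIDIA_COPYRIGHT}\n{APACHE_BLOCK}\n\nprint('hi')\n"
multi = (
    f"{NVIDIA_COPYRIGHT}\n"
    "# SPDX-FileCopyrightText: Copyright (c) 2024 Example Labs. All rights reserved.\n"
    f"{APACHE_BLOCK}\n\nprint('hi')\n"
)

print(has_header(single))  # True: exact default header
print(has_header(multi))   # True: structure-only validation kicks in
```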