NVIDIA · jwilber · Mar 4, 2025 · Mar 4, 2025
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import torch
+from pathlib import Path
+from bionemo.noodles.nvfaidx import NvFaidx
+
+class SimpleFastaDataset(torch.utils.data.Dataset):
+    """A simple dataset for Evo2 prediction; items follow the sorted order of the FASTA sequence ids."""
+
+    def __init__(self, fasta_path: Path, tokenizer, prepend_bos: bool = True):
+        """Load ``fasta_path`` through NvFaidx; ``tokenizer`` must provide ``text_to_ids`` and ``eod``."""
+        super().__init__()
+        self.fasta = NvFaidx(fasta_path)
+        # Sorting fixes the index <-> sequence id mapping that ``write_idx_map`` persists.
+        self.seqids = sorted(self.fasta.keys())
+        self.tokenizer = tokenizer
+        self.prepend_bos = prepend_bos  # needed for getting predictions for the requested set of tokens.
+
+    def write_idx_map(self, output_dir: Path):
+        """Write the ``{seqid: dataset index}`` map to ``seq_idx_map.json`` inside ``output_dir``."""
+        with open(Path(output_dir) / "seq_idx_map.json", "w") as f:  # Path() also accepts a plain str
+            json.dump({seqid: idx for idx, seqid in enumerate(self.seqids)}, f)
+
+    def __len__(self):
+        """Get the number of sequences in the dataset."""
+        return len(self.seqids)
+
+    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
+        """Tokenize record ``idx`` and return its tokens, position ids, index and loss mask as long tensors."""
+        sequence = self.fasta[self.seqids[idx]].sequence().upper()
+        token_ids: list[int] = self.tokenizer.text_to_ids(sequence)
+        if self.prepend_bos:  # in pretraining we use EOS to start new sequences.
+            token_ids = [self.tokenizer.eod, *token_ids]
+        tokens = torch.tensor(token_ids, dtype=torch.long)  # built once and shared with the mask below
+        # All positions count towards the loss by default; shape and dtype come from ``tokens``.
+        loss_mask = torch.ones_like(tokens)
+        if self.prepend_bos:
+            # Mask the EOS token which we use for causal offsetting. Later in predict we take the output
+            # for the first [:-1] tokens which align with the sequence starting after the EOS.
+            loss_mask[0] = 0
+        return {
+            "tokens": tokens,
+            "position_ids": torch.arange(len(token_ids), dtype=torch.long),
+            "seq_idx": torch.tensor(idx, dtype=torch.long),
+            "loss_mask": loss_mask,
+        }
@@ -18,7 +18,6 @@
 
 
 import argparse
-import json
 import tempfile
 from pathlib import Path
 from typing import Literal, Optional
@@ -35,9 +34,9 @@
 from nemo.lightning.data import WrappedDataLoader
 from torch import Tensor
 
+from bionemo.evo2.data.fasta_dataset import SimpleFastaDataset
 from bionemo.llm.lightning import LightningPassthroughPredictionMixin
 from bionemo.llm.utils.callbacks import PredictionWriter
-from bionemo.noodles.nvfaidx import NvFaidx
 
 
 CheckpointFormats = Literal["torch_dist", "zarr"]
@@ -179,48 +178,6 @@ def predict_step(self, batch, batch_idx: Optional[int] = None) -> Tensor:
             }
 
 
-class SimpleFastaDataset(torch.utils.data.Dataset):
-    """A simple dataset for Evo2 prediction."""
-
-    def __init__(self, fasta_path: Path, tokenizer, prepend_bos: bool = True):
-        """Initialize the dataset."""
-        super().__init__()
-        self.fasta = NvFaidx(fasta_path)
-        self.seqids = sorted(self.fasta.keys())
-        self.tokenizer = tokenizer
-        self.prepend_bos = prepend_bos  # needed for getting predictions for the requested set of tokens.
-
-    def write_idx_map(self, output_dir: Path):
-        """Write the index map to the output directory."""
-        with open(output_dir / "seq_idx_map.json", "w") as f:
-            json.dump({seqid: idx for idx, seqid in enumerate(self.seqids)}, f)
-
-    def __len__(self):
-        """Get the length of the dataset."""
-        return len(self.seqids)
-
-    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
-        """Get an item from the dataset."""
-        sequence = self.fasta[self.seqids[idx]].sequence().upper()
-        tokenized_seq = self.tokenizer.text_to_ids(sequence)
-        if self.prepend_bos:  # in pretraining we use EOS to start new sequences.
-            tokens: list[int] = [self.tokenizer.eod] + tokenized_seq
-        else:
-            tokens: list[int] = tokenized_seq
-        loss_mask = torch.ones_like(torch.tensor(tokens, dtype=torch.long), dtype=torch.long)
-        if self.prepend_bos:
-            loss_mask[0] = (
-                0  # mask the eos token which we use for causal offsetting. Later in predict we take the output
-            )
-            #  for the first [:-1] tokens which align with the sequence starting after the EOS.
-        return {
-            "tokens": torch.tensor(tokens, dtype=torch.long),
-            "position_ids": torch.arange(len(tokens), dtype=torch.long),
-            "seq_idx": torch.tensor(idx, dtype=torch.long),
-            "loss_mask": loss_mask,
-        }
-
-
 def hyena_predict_forward_step(model, batch) -> torch.Tensor:
     """Performs a forward step for the Hyena model.
 

@@ -0,0 +1,87 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Arc Institute. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Michael Poli. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024 Stanford University. All rights reserved
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import pytest
+from pathlib import Path
+import torch
+
+from bionemo.testing.data.fasta import create_fasta_file
+from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+from bionemo.evo2.data.fasta_dataset import SimpleFastaDataset
+
+
+@pytest.fixture
+def fasta_dataset(tmp_path: Path) -> SimpleFastaDataset:
+    """Fixture returning a SimpleFastaDataset over a temporary FASTA file with a byte-level tokenizer."""
+    test_fasta_file_path = create_fasta_file(tmp_path / "test.fasta", num_sequences=10, sequence_length=100)
+    tokenizer = get_nmt_tokenizer("byte-level")
+    return SimpleFastaDataset(test_fasta_file_path, tokenizer)
+
+
+def test_simple_fasta_dataset_initialization(fasta_dataset: SimpleFastaDataset) -> None:
+    """Verify the dataset exposes one entry per sequence requested by the fixture."""
+    expected_num_sequences = 10
+
+    # The public length and the cached id list must both agree with the fixture.
+    assert len(fasta_dataset) == expected_num_sequences, "Dataset length should match number of sequences"
+    assert len(fasta_dataset.seqids) == expected_num_sequences, "Seqids should match number of sequences"
+
+
+def test_simple_fasta_dataset_getitem(fasta_dataset: SimpleFastaDataset) -> None:
+    """Test that __getitem__ returns consistent long tensors for tokens, positions, index and loss mask."""
+    item = fasta_dataset[0]
+    assert set(item.keys()) == {"tokens", "position_ids", "seq_idx", "loss_mask"}, "Item should have correct keys"
+
+    # Every per-token field is a long tensor.
+    tokens, position_ids, loss_mask = item["tokens"], item["position_ids"], item["loss_mask"]
+    for name, value in (("Tokens", tokens), ("Position IDs", position_ids), ("Loss mask", loss_mask)):
+        assert isinstance(value, torch.Tensor), f"{name} should be a torch.Tensor"
+        assert value.dtype == torch.long, f"{name} should be long dtype"
+
+    # Position ids count up from zero; the fixture keeps prepend_bos=True, so only slot 0 is masked.
+    assert torch.equal(position_ids, torch.arange(len(tokens))), "Position IDs should be 0..len(tokens)-1"
+    assert loss_mask.shape == tokens.shape, "Loss mask should have one entry per token"
+    assert loss_mask[0].item() == 0, "The prepended EOD token should be masked"
+    assert bool(loss_mask[1:].all()), "Sequence tokens should not be masked"
+
+    # Validate sequence index
+    assert isinstance(item["seq_idx"], torch.Tensor), "Seq_idx should be a torch.Tensor"
+    assert item["seq_idx"].item() == 0, "First item should have seq_idx 0"
+
+
+def test_simple_fasta_dataset_write_idx_map(fasta_dataset: SimpleFastaDataset, tmp_path: Path) -> None:
+    """Check that write_idx_map emits a JSON file mapping every sequence id to its dataset index."""
+    import json
+
+    # Prepare the directory the map is written into.
+    target_dir = tmp_path / "output"
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    fasta_dataset.write_idx_map(target_dir)
+
+    # The map is expected under a fixed file name inside the target directory.
+    written_file = target_dir / "seq_idx_map.json"
+    assert written_file.exists(), "seq_idx_map.json should be created"
+    with written_file.open("r") as handle:
+        loaded_map = json.load(handle)
+
+    # One entry per sequence, each pointing at the position of its id in ``seqids``.
+    assert len(loaded_map) == 10, "Index map should have an entry for each sequence"
+    for position, seqid in enumerate(fasta_dataset.seqids):
+        assert loaded_map[seqid] == position, f"Index for {seqid} should match"