Implement SpecBinner

pluskal-lab · May 30, 2024 · 9c43645 · 9c43645
1 parent a0fb3fa
commit 9c43645
Show file tree

Hide file tree

Showing 3 changed files with 130 additions and 48 deletions.
diff --git a/massspecgym/transforms.py b/massspecgym/transforms.py
@@ -1,10 +1,13 @@
 import numpy as np
+import pandas as pd
 import torch
 import matchms
 import matchms.filtering as ms_filters
 import massspecgym.utils as utils
+from tokenizers import ByteLevelBPETokenizer
 from rdkit.Chem import AllChem as Chem
-from typing import Optional
+from pathlib import Path
+from typing import Optional, List, Union
 from abc import ABC, abstractmethod
 
 
@@ -42,7 +45,8 @@ def default_matchms_transforms(
     mz_to: float = 1000,
 ) -> matchms.Spectrum:
     spec = ms_filters.select_by_mz(spec, mz_from=mz_from, mz_to=mz_to)
-    spec = ms_filters.reduce_to_number_of_peaks(spec, n_max=n_max_peaks)
+    if n_max_peaks is not None:
+        spec = ms_filters.reduce_to_number_of_peaks(spec, n_max=n_max_peaks)
     spec = ms_filters.normalize_intensities(spec)
     return spec
 
@@ -70,8 +74,64 @@ def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
 
 
 class SpecBinner(SpecTransform):
-    # TODO
-    pass
+    def __init__(
+        self,
+        max_mz: float = 1000,
+        bin_width: float = 1,
+        to_rel_intensities: bool = True,
+    ) -> None:
+        self.max_mz = max_mz
+        self.bin_width = bin_width
+        self.to_rel_intensities = to_rel_intensities
+        if max_mz % bin_width != 0:
+            raise ValueError("`max_mz` must be divisible by `bin_width`.")
+
+    def matchms_transforms(self, spec: matchms.Spectrum) -> matchms.Spectrum:
+        return default_matchms_transforms(spec, mz_to=self.max_mz, n_max_peaks=None)
+
+    def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
+        """
+        Bin the spectrum into a fixed number of bins.
+        """
+        binned_spec = self._bin_mass_spectrum(
+            mzs=spec.peaks.mz,
+            intensities=spec.peaks.intensities,
+            max_mz=self.max_mz,
+            bin_width=self.bin_width,
+            to_rel_intensities=self.to_rel_intensities,
+        )
+        return torch.from_numpy(binned_spec)
+
+    def _bin_mass_spectrum(
+        self, mzs, intensities, max_mz, bin_width, to_rel_intensities=True
+    ):
+        # Calculate the number of bins
+        num_bins = int(np.ceil(max_mz / bin_width))
+
+        # Calculate the bin indices for each mass
+        bin_indices = np.floor(mzs / bin_width).astype(int)
+
+        # Filter out mzs that exceed max_mz
+        valid_indices = bin_indices[mzs <= max_mz]
+        valid_intensities = intensities[mzs <= max_mz]
+
+        # Clip bin indices to ensure they are within the valid range
+        valid_indices = np.clip(valid_indices, 0, num_bins - 1)
+
+        # Initialize an array to store the binned intensities
+        binned_intensities = np.zeros(num_bins)
+
+        # Use np.add.at to sum intensities in the appropriate bins
+        np.add.at(binned_intensities, valid_indices, valid_intensities)
+
+        # Generate the bin edges for reference
+        # bin_edges = np.arange(0, max_mz + bin_width, bin_width)
+
+        # Normalize the intensities to relative intensities
+        if to_rel_intensities:
+            binned_intensities /= np.max(binned_intensities)
+
+        return binned_intensities  # , bin_edges
 
 
 class MolTransform(ABC):

diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
diff --git a/tests/test_transforms.py b/tests/test_transforms.py
@@ -0,0 +1,66 @@
+import numpy as np
+import torch
+import matchms
+from massspecgym.transforms import SpecTokenizer, SpecBinner
+
+
+def test_spec_tokenizer():
+    spec = matchms.Spectrum(
+        mz=np.array(
+            [
+                45.134823,
+                45.13699,
+                110.096245,
+                130.064972,
+                136.111862,
+                277.16571,
+                289.165924,
+                307.177856,
+                406.223083,
+            ]
+        ),
+        intensities=np.array(
+            [0.01082, 0.01064, 0.17184, 0.1397, 0.00874, 1.0, 0.52842, 0.00793, 0.43696]
+        ),
+        metadata={"precursor_mz": 406.22},
+    )
+
+    tokenizer = SpecTokenizer(n_peaks=60)
+    spec_t = tokenizer(spec)
+    assert spec_t.shape == (60, 2)
+    print(spec_t[: spec.peaks.mz.shape[0], 0])
+    print(torch.from_numpy(spec.peaks.mz))
+    assert (
+        spec_t[: spec.peaks.mz.shape[0], 0] == torch.from_numpy(spec.peaks.mz)
+    ).all()
+    assert (
+        spec_t[: spec.peaks.intensities.shape[0], 1]
+        == torch.from_numpy(spec.peaks.intensities)
+    ).all()
+
+
+def test_spec_binner():
+    spec = matchms.Spectrum(
+        mz=np.array(
+            [
+                45.134823,
+                45.13699,
+                110.096245,
+                130.064972,
+                136.111862,
+                277.16571,
+                289.165924,
+                307.177856,
+                406.223083,
+            ]
+        ),
+        intensities=np.array([0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.1, 0.2]),
+        metadata={"precursor_mz": 406.22},
+    )
+    binner = SpecBinner(max_mz=1000, bin_width=100, to_rel_intensities=False)
+    spec_t = binner(spec)
+    assert spec_t.shape == (1000 // 100,)
+    assert torch.allclose(
+        spec_t,
+        torch.tensor([0.2, 0.3, 1.2, 0.1, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0]).double(),
+    )