Skip to content

Commit

Permalink
Implement SpecBinner
Browse files Browse the repository at this point in the history
  • Loading branch information
roman-bushuiev committed May 30, 2024
1 parent a0fb3fa commit 9c43645
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 48 deletions.
68 changes: 64 additions & 4 deletions massspecgym/transforms.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import numpy as np
import pandas as pd
import torch
import matchms
import matchms.filtering as ms_filters
import massspecgym.utils as utils
from tokenizers import ByteLevelBPETokenizer
from rdkit.Chem import AllChem as Chem
from typing import Optional
from pathlib import Path
from typing import Optional, List, Union
from abc import ABC, abstractmethod


Expand Down Expand Up @@ -42,7 +45,8 @@ def default_matchms_transforms(
mz_to: float = 1000,
) -> matchms.Spectrum:
spec = ms_filters.select_by_mz(spec, mz_from=mz_from, mz_to=mz_to)
spec = ms_filters.reduce_to_number_of_peaks(spec, n_max=n_max_peaks)
if n_max_peaks is not None:
spec = ms_filters.reduce_to_number_of_peaks(spec, n_max=n_max_peaks)
spec = ms_filters.normalize_intensities(spec)
return spec

Expand Down Expand Up @@ -70,8 +74,64 @@ def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:


class SpecBinner(SpecTransform):
# TODO
pass
def __init__(
self,
max_mz: float = 1000,
bin_width: float = 1,
to_rel_intensities: bool = True,
) -> None:
self.max_mz = max_mz
self.bin_width = bin_width
self.to_rel_intensities = to_rel_intensities
if max_mz % bin_width != 0:
raise ValueError("`max_mz` must be divisible by `bin_width`.")

def matchms_transforms(self, spec: matchms.Spectrum) -> matchms.Spectrum:
return default_matchms_transforms(spec, mz_to=self.max_mz, n_max_peaks=None)

def matchms_to_torch(self, spec: matchms.Spectrum) -> torch.Tensor:
"""
Bin the spectrum into a fixed number of bins.
"""
binned_spec = self._bin_mass_spectrum(
mzs=spec.peaks.mz,
intensities=spec.peaks.intensities,
max_mz=self.max_mz,
bin_width=self.bin_width,
to_rel_intensities=self.to_rel_intensities,
)
return torch.from_numpy(binned_spec)

def _bin_mass_spectrum(
self, mzs, intensities, max_mz, bin_width, to_rel_intensities=True
):
# Calculate the number of bins
num_bins = int(np.ceil(max_mz / bin_width))

# Calculate the bin indices for each mass
bin_indices = np.floor(mzs / bin_width).astype(int)

# Filter out mzs that exceed max_mz
valid_indices = bin_indices[mzs <= max_mz]
valid_intensities = intensities[mzs <= max_mz]

# Clip bin indices to ensure they are within the valid range
valid_indices = np.clip(valid_indices, 0, num_bins - 1)

# Initialize an array to store the binned intensities
binned_intensities = np.zeros(num_bins)

# Use np.add.at to sum intensities in the appropriate bins
np.add.at(binned_intensities, valid_indices, valid_intensities)

# Generate the bin edges for reference
# bin_edges = np.arange(0, max_mz + bin_width, bin_width)

# Normalize the intensities to relative intensities
if to_rel_intensities:
binned_intensities /= np.max(binned_intensities)

return binned_intensities # , bin_edges


class MolTransform(ABC):
Expand Down
44 changes: 0 additions & 44 deletions tests/test_tokenizer.py

This file was deleted.

66 changes: 66 additions & 0 deletions tests/test_transforms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import numpy as np
import torch
import matchms
from massspecgym.transforms import SpecTokenizer, SpecBinner


def test_spec_tokenizer():
spec = matchms.Spectrum(
mz=np.array(
[
45.134823,
45.13699,
110.096245,
130.064972,
136.111862,
277.16571,
289.165924,
307.177856,
406.223083,
]
),
intensities=np.array(
[0.01082, 0.01064, 0.17184, 0.1397, 0.00874, 1.0, 0.52842, 0.00793, 0.43696]
),
metadata={"precursor_mz": 406.22},
)

tokenizer = SpecTokenizer(n_peaks=60)
spec_t = tokenizer(spec)
assert spec_t.shape == (60, 2)
print(spec_t[: spec.peaks.mz.shape[0], 0])
print(torch.from_numpy(spec.peaks.mz))
assert (
spec_t[: spec.peaks.mz.shape[0], 0] == torch.from_numpy(spec.peaks.mz)
).all()
assert (
spec_t[: spec.peaks.intensities.shape[0], 1]
== torch.from_numpy(spec.peaks.intensities)
).all()


def test_spec_binner():
spec = matchms.Spectrum(
mz=np.array(
[
45.134823,
45.13699,
110.096245,
130.064972,
136.111862,
277.16571,
289.165924,
307.177856,
406.223083,
]
),
intensities=np.array([0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.2, 0.1, 0.2]),
metadata={"precursor_mz": 406.22},
)
binner = SpecBinner(max_mz=1000, bin_width=100, to_rel_intensities=False)
spec_t = binner(spec)
assert spec_t.shape == (1000 // 100,)
assert torch.allclose(
spec_t,
torch.tensor([0.2, 0.3, 1.2, 0.1, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0]).double(),
)

0 comments on commit 9c43645

Please sign in to comment.