From c3e52fee8b44dd3ab749f7c8cf384efe4d646604 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 14 Jan 2025 14:33:40 -0500 Subject: [PATCH 1/9] WIP: Better sklearn compliance --- doc/changes/devel/bugfix.rst | 1 + mne/decoding/tests/test_transformer.py | 35 +++++++++++++++++++++- mne/decoding/transformer.py | 41 ++++++++++---------------- 3 files changed, 51 insertions(+), 26 deletions(-) create mode 100644 doc/changes/devel/bugfix.rst diff --git a/doc/changes/devel/bugfix.rst b/doc/changes/devel/bugfix.rst new file mode 100644 index 00000000000..e054bcd7e9c --- /dev/null +++ b/doc/changes/devel/bugfix.rst @@ -0,0 +1 @@ +Improved sklearn class compatibility and compliance, which resulted in some parameters of classes like :class:`mne.decoding.FilterEstimator` having an underscore appended to their name (e.g., ``picks`` passed to the initializer is set as ``est.picks_`` during the ``fit`` phase so the original can be preserved in ``est.picks``) by `Eric Larson`_. diff --git a/mne/decoding/tests/test_transformer.py b/mne/decoding/tests/test_transformer.py index 8dcc3ad74c7..d7b974fe932 100644 --- a/mne/decoding/tests/test_transformer.py +++ b/mne/decoding/tests/test_transformer.py @@ -17,8 +17,9 @@ from sklearn.decomposition import PCA from sklearn.kernel_ridge import KernelRidge +from sklearn.utils.estimator_checks import parametrize_with_checks -from mne import Epochs, io, pick_types, read_events +from mne import Epochs, create_info, io, pick_types, read_events from mne.decoding import ( FilterEstimator, PSDEstimator, @@ -36,6 +37,7 @@ data_dir = Path(__file__).parents[2] / "io" / "tests" / "data" raw_fname = data_dir / "test_raw.fif" event_name = data_dir / "test-eve.fif" +info = create_info(3, 1000.0, "eeg") @pytest.mark.parametrize( @@ -299,3 +301,34 @@ def test_bad_triage(): # Used to fail with "ValueError: Effective band-stop frequency (135.0) is # too high (maximum based on Nyquist is 80.0)" filt.fit_transform(np.zeros((1, 1, 481))) + + 
+@pytest.mark.filterwarnings("ignore:.*filter_length.*") +@parametrize_with_checks( + [ + FilterEstimator(info, l_freq=1, h_freq=10), + # PSDEstimator(), + # Scaler(info=info), + # TemporalFilter(), + # UnsupervisedSpatialFilter(PCA()), + # Vectorizer(), + ] +) +def test_sklearn_compliance(estimator, check): + """Test LinearModel compliance with sklearn.""" + ignores = ( + # TODO: Should probably add these + "check_transformer_general", + "check_estimators_overwrite_params", + "check_fit_idempotent", + "check_fit1d", + "check_fit2d_predict1d", + "check_n_features_in", # maybe we should add this someday? + "check_estimator_sparse_data", # we densify + "check_estimators_overwrite_params", # self.model changes! + "check_parameters_default_constructible", + "check_methods_sample_order_invariance", + ) + if any(ignore in str(check) for ignore in ignores): + return + check(estimator) diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index e475cd22161..f34f049fdf2 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -3,14 +3,13 @@ # Copyright the MNE-Python contributors. import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator, TransformerMixin, check_array from .._fiff.pick import ( _pick_data_channels, _picks_by_type, _picks_to_idx, pick_info, - pick_types, ) from ..cov import _check_scalings_user from ..filter import filter_data @@ -397,6 +396,7 @@ def __init__( self.low_bias = low_bias self.n_jobs = n_jobs self.normalization = normalization + self.verbose = verbose def fit(self, epochs_data, y): """Compute power spectral density (PSD) using a multi-taper method. 
@@ -520,7 +520,7 @@ def __init__( self.info = info self.l_freq = l_freq self.h_freq = h_freq - self.picks = _picks_to_idx(info, picks) + self.picks = picks self.filter_length = filter_length self.l_trans_bandwidth = l_trans_bandwidth self.h_trans_bandwidth = h_trans_bandwidth @@ -528,6 +528,14 @@ def __init__( self.method = method self.iir_params = iir_params self.fir_design = fir_design + self.verbose = verbose + + def _check_data(self, epochs_data): + epochs_data = check_array( + epochs_data, dtype=np.float64, ensure_2d=False, order="C" + ) + epochs_data = np.atleast_3d(epochs_data) + return epochs_data def fit(self, epochs_data, y): """Filter data. @@ -544,24 +552,11 @@ def fit(self, epochs_data, y): self : instance of FilterEstimator The modified instance. """ - if not isinstance(epochs_data, np.ndarray): - raise ValueError( - f"epochs_data should be of type ndarray (got {type(epochs_data)})." - ) - - if self.picks is None: - self.picks = pick_types( - self.info, meg=True, eeg=True, ref_meg=False, exclude=[] - ) + self.picks_ = _picks_to_idx(self.info, self.picks) + self._check_data(epochs_data) if self.l_freq == 0: self.l_freq = None - if self.h_freq is not None and self.h_freq > (self.info["sfreq"] / 2.0): - self.h_freq = None - if self.l_freq is not None and not isinstance(self.l_freq, float): - self.l_freq = float(self.l_freq) - if self.h_freq is not None and not isinstance(self.h_freq, float): - self.h_freq = float(self.h_freq) if self.info["lowpass"] is None or ( self.h_freq is not None @@ -594,17 +589,12 @@ def transform(self, epochs_data): X : array, shape (n_epochs, n_channels, n_times) The data after filtering. """ - if not isinstance(epochs_data, np.ndarray): - raise ValueError( - f"epochs_data should be of type ndarray (got {type(epochs_data)})." 
- ) - epochs_data = np.atleast_3d(epochs_data) return filter_data( - epochs_data, + self._check_data(epochs_data), self.info["sfreq"], self.l_freq, self.h_freq, - self.picks, + self.picks_, self.filter_length, self.l_trans_bandwidth, self.h_trans_bandwidth, @@ -854,6 +844,7 @@ def __init__( self.iir_params = iir_params self.fir_window = fir_window self.fir_design = fir_design + self.verbose = verbose if not isinstance(self.n_jobs, int) and self.n_jobs == "cuda": raise ValueError( From 650864f5124579d70eea795e9f727ae763c21834 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Tue, 14 Jan 2025 19:53:33 +0000 Subject: [PATCH 2/9] [autofix.ci] apply automated fixes --- doc/changes/devel/{bugfix.rst => 13065.bugfix.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/changes/devel/{bugfix.rst => 13065.bugfix.rst} (100%) diff --git a/doc/changes/devel/bugfix.rst b/doc/changes/devel/13065.bugfix.rst similarity index 100% rename from doc/changes/devel/bugfix.rst rename to doc/changes/devel/13065.bugfix.rst From e6270f9554109d09a61cddb8e6c0c752e7e82276 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Fri, 17 Jan 2025 14:49:45 -0500 Subject: [PATCH 3/9] FIX: Fixes --- mne/cov.py | 4 +- mne/decoding/base.py | 11 +- mne/decoding/csp.py | 99 +++++------ mne/decoding/ems.py | 23 ++- mne/decoding/search_light.py | 51 +++--- mne/decoding/ssd.py | 138 ++++++++------- mne/decoding/tests/test_base.py | 7 +- mne/decoding/tests/test_csp.py | 44 +++-- mne/decoding/tests/test_ems.py | 7 + mne/decoding/tests/test_search_light.py | 30 ++-- mne/decoding/tests/test_ssd.py | 53 ++++-- mne/decoding/tests/test_time_frequency.py | 21 ++- mne/decoding/tests/test_transformer.py | 75 ++++---- mne/decoding/time_frequency.py | 33 +++- mne/decoding/transformer.py | 207 +++++++++++++--------- mne/time_frequency/multitaper.py | 7 +- mne/time_frequency/tfr.py | 3 +- mne/utils/numerics.py | 5 +- 18 files changed, 489 insertions(+), 
329 deletions(-) diff --git a/mne/cov.py b/mne/cov.py index 8b86119c1d1..19f70af2bd6 100644 --- a/mne/cov.py +++ b/mne/cov.py @@ -1226,7 +1226,7 @@ def _compute_rank_raw_array( from .io import RawArray return _compute_rank( - RawArray(data, info, copy=None, verbose=_verbose_safe_false()), + RawArray(data, info, copy="auto", verbose=_verbose_safe_false()), rank, scalings, info, @@ -1405,7 +1405,7 @@ def _compute_covariance_auto( # project back cov = np.dot(eigvec.T, np.dot(cov, eigvec)) # undo bias - cov *= data.shape[0] / (data.shape[0] - 1) + cov *= data.shape[0] / max(data.shape[0] - 1, 1) # undo scaling _undo_scaling_cov(cov, picks_list, scalings) method_ = method[ei] diff --git a/mne/decoding/base.py b/mne/decoding/base.py index a291416bb17..f73cd976fe3 100644 --- a/mne/decoding/base.py +++ b/mne/decoding/base.py @@ -19,7 +19,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.metrics import check_scoring from sklearn.model_selection import KFold, StratifiedKFold, check_cv -from sklearn.utils import check_array, indexable +from sklearn.utils import check_array, check_X_y, indexable from ..parallel import parallel_func from ..utils import _pl, logger, verbose, warn @@ -76,9 +76,9 @@ class LinearModel(MetaEstimatorMixin, BaseEstimator): ) def __init__(self, model=None): + # TODO: We need to set this to get our tag checking to work properly if model is None: model = LogisticRegression(solver="liblinear") - self.model = model def __sklearn_tags__(self): @@ -122,7 +122,11 @@ def fit(self, X, y, **fit_params): self : instance of LinearModel Returns the modified instance. 
""" - X = check_array(X, input_name="X") + if y is not None: + X = check_array(X) + else: + X, y = check_X_y(X, y) + self.n_features_in_ = X.shape[1] if y is not None: y = check_array(y, dtype=None, ensure_2d=False, input_name="y") if y.ndim > 2: @@ -133,6 +137,7 @@ def fit(self, X, y, **fit_params): # fit the Model self.model.fit(X, y, **fit_params) + self.model_ = self.model # for better sklearn compat # Computes patterns using Haufe's trick: A = Cov_X . W . Precision_Y diff --git a/mne/decoding/csp.py b/mne/decoding/csp.py index 9e12335cdbe..ea38fd58ca3 100644 --- a/mne/decoding/csp.py +++ b/mne/decoding/csp.py @@ -6,7 +6,8 @@ import numpy as np from scipy.linalg import eigh -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_is_fitted from .._fiff.meas_info import create_info from ..cov import _compute_rank_raw_array, _regularized_covariance, _smart_eigh @@ -19,10 +20,11 @@ fill_doc, pinv, ) +from .transformer import MNETransformerMixin @fill_doc -class CSP(TransformerMixin, BaseEstimator): +class CSP(MNETransformerMixin, BaseEstimator): """M/EEG signal decomposition using the Common Spatial Patterns (CSP). 
This class can be used as a supervised decomposition to estimate spatial @@ -112,49 +114,44 @@ def __init__( component_order="mutual_info", ): # Init default CSP - if not isinstance(n_components, int): - raise ValueError("n_components must be an integer.") self.n_components = n_components self.rank = rank self.reg = reg - - # Init default cov_est - if not (cov_est == "concat" or cov_est == "epoch"): - raise ValueError("unknown covariance estimation method") self.cov_est = cov_est - - # Init default transform_into - self.transform_into = _check_option( - "transform_into", transform_into, ["average_power", "csp_space"] - ) - - # Init default log - if transform_into == "average_power": - if log is not None and not isinstance(log, bool): - raise ValueError( - 'log must be a boolean if transform_into == "average_power".' - ) - else: - if log is not None: - raise ValueError('log must be a None if transform_into == "csp_space".') + self.transform_into = transform_into self.log = log - - _validate_type(norm_trace, bool, "norm_trace") self.norm_trace = norm_trace self.cov_method_params = cov_method_params - self.component_order = _check_option( - "component_order", component_order, ("mutual_info", "alternate") + self.component_order = component_order + + def _validate_params(self, *, y): + _validate_type(self.n_components, int, "n_components") + if hasattr(self, "cov_est"): + _validate_type(self.cov_est, str, "cov_est") + _check_option("cov_est", self.cov_est, ("concat", "epoch")) + if hasattr(self, "norm_trace"): + _validate_type(self.norm_trace, bool, "norm_trace") + _check_option( + "transform_into", self.transform_into, ["average_power", "csp_space"] ) - - def _check_Xy(self, X, y=None): - """Check input data.""" - if not isinstance(X, np.ndarray): - raise ValueError(f"X should be of type ndarray (got {type(X)}).") - if y is not None: - if len(X) != len(y) or len(y) < 1: - raise ValueError("X and y must have the same length.") - if X.ndim < 3: - raise ValueError("X must 
have at least 3 dimensions.") + if self.transform_into == "average_power": + _validate_type( + self.log, + (bool, None), + "log", + extra="when transform_into is 'average_power'", + ) + else: + _validate_type( + self.log, None, "log", extra="when transform_into is 'csp_space'" + ) + _check_option( + "component_order", self.component_order, ("mutual_info", "alternate") + ) + self.classes_ = np.unique(y) + n_classes = len(self.classes_) + if n_classes < 2: + raise ValueError(f"n_classes must be >= 2, but got {n_classes} class") def fit(self, X, y): """Estimate the CSP decomposition on epochs. @@ -171,12 +168,9 @@ def fit(self, X, y): self : instance of CSP Returns the modified instance. """ - self._check_Xy(X, y) - - self._classes = np.unique(y) - n_classes = len(self._classes) - if n_classes < 2: - raise ValueError("n_classes must be >= 2.") + X, y = self._check_data(X, y=y, fit=True, return_y=True) + self._validate_params(y=y) + n_classes = len(self.classes_) if n_classes > 2 and self.component_order == "alternate": raise ValueError( "component_order='alternate' requires two classes, but data contains " @@ -225,13 +219,8 @@ def transform(self, X): If self.transform_into == 'csp_space' then returns the data in CSP space and shape is (n_epochs, n_components, n_times). """ - if not isinstance(X, np.ndarray): - raise ValueError(f"X should be of type ndarray (got {type(X)}).") - if self.filters_ is None: - raise RuntimeError( - "No filters available. Please first fit CSP decomposition." 
- ) - + check_is_fitted(self, "filters_") + X = self._check_data(X) pick_filters = self.filters_[: self.n_components] X = np.asarray([np.dot(pick_filters, epoch) for epoch in X]) @@ -577,7 +566,7 @@ def _compute_covariance_matrices(self, X, y): covs = [] sample_weights = [] - for ci, this_class in enumerate(self._classes): + for ci, this_class in enumerate(self.classes_): cov, weight = cov_estimator( X[y == this_class], cov_kind=f"class={this_class}", @@ -689,7 +678,7 @@ def _normalize_eigenvectors(self, eigen_vectors, covs, sample_weights): def _order_components( self, covs, sample_weights, eigen_vectors, eigen_values, component_order ): - n_classes = len(self._classes) + n_classes = len(self.classes_) if component_order == "mutual_info" and n_classes > 2: mutual_info = self._compute_mutual_info(covs, sample_weights, eigen_vectors) ix = np.argsort(mutual_info)[::-1] @@ -889,10 +878,8 @@ def fit(self, X, y): self : instance of SPoC Returns the modified instance. """ - self._check_Xy(X, y) - - if len(np.unique(y)) < 2: - raise ValueError("y must have at least two distinct values.") + X, y = self._check_data(X, y=y, fit=True, return_y=True) + self._validate_params(y=y) # The following code is directly copied from pyRiemann diff --git a/mne/decoding/ems.py b/mne/decoding/ems.py index b3e72a30e21..4bc64499281 100644 --- a/mne/decoding/ems.py +++ b/mne/decoding/ems.py @@ -5,15 +5,16 @@ from collections import Counter import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator from .._fiff.pick import _picks_to_idx, pick_info, pick_types from ..parallel import parallel_func from ..utils import logger, verbose from .base import _set_cv +from .transformer import MNETransformerMixin -class EMS(TransformerMixin, BaseEstimator): +class EMS(MNETransformerMixin, BaseEstimator): """Transformer to compute event-matched spatial filters. 
This version of EMS :footcite:`SchurgerEtAl2013` operates on the entire @@ -37,6 +38,16 @@ class EMS(TransformerMixin, BaseEstimator): .. footbibliography:: """ + def __sklearn_tags__(self): + """Return sklearn tags.""" + from sklearn.utils import ClassifierTags + + tags = super().__sklearn_tags__() + if tags.classifier_tags is None: + tags.classifier_tags = ClassifierTags() + tags.classifier_tags.multi_class = False + return tags + def __repr__(self): # noqa: D105 if hasattr(self, "filters_"): return ( @@ -64,11 +75,12 @@ def fit(self, X, y): self : instance of EMS Returns self. """ - classes = np.unique(y) - if len(classes) != 2: + X, y = self._check_data(X, y=y, fit=True, return_y=True) + classes, y = np.unique(y, return_inverse=True) + if len(classes) > 2: raise ValueError("EMS only works for binary classification.") self.classes_ = classes - filters = X[y == classes[0]].mean(0) - X[y == classes[1]].mean(0) + filters = X[y == 0].mean(0) - X[y == 1].mean(0) filters /= np.linalg.norm(filters, axis=0)[None, :] self.filters_ = filters return self @@ -86,6 +98,7 @@ def transform(self, X): X : array, shape (n_epochs, n_times) The input data transformed by the spatial filters. 
""" + X = self._check_data(X) Xt = np.sum(X * self.filters_, axis=1) return Xt diff --git a/mne/decoding/search_light.py b/mne/decoding/search_light.py index e3059a3e959..8bf51a64574 100644 --- a/mne/decoding/search_light.py +++ b/mne/decoding/search_light.py @@ -5,18 +5,19 @@ import logging import numpy as np -from sklearn.base import BaseEstimator, MetaEstimatorMixin, TransformerMixin, clone +from sklearn.base import BaseEstimator, MetaEstimatorMixin, clone from sklearn.metrics import check_scoring from sklearn.preprocessing import LabelEncoder -from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted from ..parallel import parallel_func -from ..utils import ProgressBar, _parse_verbose, array_split_idx, fill_doc, verbose +from ..utils import ProgressBar, _parse_verbose, array_split_idx, fill_doc from .base import _check_estimator +from .transformer import MNETransformerMixin @fill_doc -class SlidingEstimator(MetaEstimatorMixin, TransformerMixin, BaseEstimator): +class SlidingEstimator(MetaEstimatorMixin, MNETransformerMixin, BaseEstimator): """Search Light. Fit, predict and score a series of models to each subset of the dataset @@ -38,7 +39,7 @@ class SlidingEstimator(MetaEstimatorMixin, TransformerMixin, BaseEstimator): List of fitted scikit-learn estimators (one per task). """ - @verbose + @fill_doc # sklearn compliance def __init__( self, base_estimator, @@ -49,7 +50,6 @@ def __init__( allow_2d=False, verbose=None, ): - _check_estimator(base_estimator) self.base_estimator = base_estimator self.n_jobs = n_jobs self.scoring = scoring @@ -102,7 +102,8 @@ def fit(self, X, y, **fit_params): self : object Return self. """ - X = self._check_Xy(X, y) + _check_estimator(self.base_estimator) + X, _ = self._check_Xy(X, y, fit=True) parallel, p_func, n_jobs = parallel_func( _sl_fit, self.n_jobs, max_jobs=X.shape[-1], verbose=False ) @@ -153,7 +154,9 @@ def fit_transform(self, X, y, **fit_params): def _transform(self, X, method): """Aux. 
function to make parallel predictions/transformation.""" - X = self._check_Xy(X) + X, is_nd = self._check_Xy(X) + orig_method = method + check_is_fitted(self) method = _check_method(self.base_estimator, method) if X.shape[-1] != len(self.estimators_): raise ValueError("The number of estimators does not match X.shape[-1]") @@ -174,6 +177,10 @@ def _transform(self, X, method): ) y_pred = np.concatenate(y_pred, axis=1) + if orig_method == "transform": + y_pred = y_pred.astype(X.dtype) + if orig_method == "predict_proba" and not is_nd: + y_pred = y_pred[:, 0, :] return y_pred def transform(self, X): @@ -196,7 +203,7 @@ def transform(self, X): Xt : array, shape (n_samples, n_estimators) The transformed values generated by each estimator. """ # noqa: E501 - return self._transform(X, "transform").astype(X.dtype) + return self._transform(X, "transform") def predict(self, X): """Predict each data slice/task with a series of independent estimators. @@ -265,15 +272,12 @@ def decision_function(self, X): """ # noqa: E501 return self._transform(X, "decision_function") - def _check_Xy(self, X, y=None): + def _check_Xy(self, X, y=None, fit=False): """Aux. function to check input data.""" # Once we require sklearn 1.1+ we should do something like: - X = check_array(X, ensure_2d=False, allow_nd=True, input_name="X") - if y is not None: - y = check_array(y, dtype=None, ensure_2d=False, input_name="y") - if len(X) != len(y) or len(y) < 1: - raise ValueError("X and y must have the same length.") - if X.ndim < 3: + X = self._check_data(X, y=y, atleast_3d=False, fit=fit) + is_nd = X.ndim >= 3 + if not is_nd: err = None if not self.allow_2d: err = 3 @@ -282,7 +286,7 @@ def _check_Xy(self, X, y=None): if err: raise ValueError(f"X must have at least {err} dimensions.") X = X[..., np.newaxis] - return X + return X, is_nd def score(self, X, y): """Score each estimator on each task. 
@@ -307,7 +311,7 @@ def score(self, X, y): score : array, shape (n_samples, n_estimators) Score for each estimator/task. """ # noqa: E501 - X = self._check_Xy(X, y) + X, _ = self._check_Xy(X, y) if X.shape[-1] != len(self.estimators_): raise ValueError("The number of estimators does not match X.shape[-1]") @@ -483,7 +487,9 @@ def __repr__(self): # noqa: D105 def _transform(self, X, method): """Aux. function to make parallel predictions/transformation.""" - X = self._check_Xy(X) + X, is_nd = self._check_Xy(X) + check_is_fitted(self) + orig_method = method method = _check_method(self.base_estimator, method) parallel, p_func, n_jobs = parallel_func( @@ -500,6 +506,10 @@ def _transform(self, X, method): ) y_pred = np.concatenate(y_pred, axis=2) + if orig_method == "transform": + y_pred = y_pred.astype(X.dtype) + if orig_method == "predict_proba" and not is_nd: + y_pred = y_pred[:, 0, 0, :] return y_pred def transform(self, X): @@ -518,6 +528,7 @@ def transform(self, X): Xt : array, shape (n_samples, n_estimators, n_slices) The transformed values generated by each estimator. """ + check_is_fitted(self) return self._transform(X, "transform") def predict(self, X): @@ -603,7 +614,7 @@ def score(self, X, y): score : array, shape (n_samples, n_estimators, n_slices) Score for each estimator / data slice couple. """ # noqa: E501 - X = self._check_Xy(X, y) + X, _ = self._check_Xy(X, y) # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. 
parallel, p_func, n_jobs = parallel_func( diff --git a/mne/decoding/ssd.py b/mne/decoding/ssd.py index 8bc0036d315..111ded9f274 100644 --- a/mne/decoding/ssd.py +++ b/mne/decoding/ssd.py @@ -4,8 +4,10 @@ import numpy as np from scipy.linalg import eigh -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_is_fitted +from .._fiff.meas_info import Info, create_info from .._fiff.pick import _picks_to_idx from ..cov import Covariance, _regularized_covariance from ..defaults import _handle_default @@ -13,17 +15,17 @@ from ..rank import compute_rank from ..time_frequency import psd_array_welch from ..utils import ( - _check_option, _time_mask, _validate_type, _verbose_safe_false, fill_doc, logger, ) +from .transformer import MNETransformerMixin @fill_doc -class SSD(TransformerMixin, BaseEstimator): +class SSD(MNETransformerMixin, BaseEstimator): """ Signal decomposition using the Spatio-Spectral Decomposition (SSD). @@ -64,7 +66,7 @@ class SSD(TransformerMixin, BaseEstimator): If sort_by_spectral_ratio is set to True, then the SSD sources will be sorted according to their spectral ratio which is calculated based on :func:`mne.time_frequency.psd_array_welch`. The n_fft parameter sets the - length of FFT used. + length of FFT used. The default (None) will use 1 second of data. See :func:`mne.time_frequency.psd_array_welch` for more information. 
cov_method_params : dict | None (default None) As in :class:`mne.decoding.SPoC` @@ -104,7 +106,25 @@ def __init__( rank=None, ): """Initialize instance.""" - dicts = {"signal": filt_params_signal, "noise": filt_params_noise} + self.info = info + self.filt_params_signal = filt_params_signal + self.filt_params_noise = filt_params_noise + self.reg = reg + self.n_components = n_components + self.picks = picks + self.sort_by_spectral_ratio = sort_by_spectral_ratio + self.return_filtered = return_filtered + self.n_fft = n_fft + self.cov_method_params = cov_method_params + self.rank = rank + + def _validate_params(self, X): + if isinstance(self.info, float): # special case, mostly for testing + self.sfreq_ = self.info + else: + _validate_type(self.info, Info, "info") + self.sfreq_ = self.info["sfreq"] + dicts = {"signal": self.filt_params_signal, "noise": self.filt_params_noise} for param, dd in [("l", 0), ("h", 0), ("l", 1), ("h", 1)]: key = ("signal", "noise")[dd] if param + "_freq" not in dicts[key]: @@ -116,48 +136,47 @@ def __init__( _validate_type(val, ("numeric",), f"{key} {param}_freq") # check freq bands if ( - filt_params_noise["l_freq"] > filt_params_signal["l_freq"] - or filt_params_signal["h_freq"] > filt_params_noise["h_freq"] + self.filt_params_noise["l_freq"] > self.filt_params_signal["l_freq"] + or self.filt_params_signal["h_freq"] > self.filt_params_noise["h_freq"] ): raise ValueError( "Wrongly specified frequency bands!\n" "The signal band-pass must be within the noise " "band-pass!" 
) - self.picks = picks - del picks - self.info = info - self.freqs_signal = (filt_params_signal["l_freq"], filt_params_signal["h_freq"]) - self.freqs_noise = (filt_params_noise["l_freq"], filt_params_noise["h_freq"]) - self.filt_params_signal = filt_params_signal - self.filt_params_noise = filt_params_noise - # check if boolean - if not isinstance(sort_by_spectral_ratio, (bool)): - raise ValueError("sort_by_spectral_ratio must be boolean") - self.sort_by_spectral_ratio = sort_by_spectral_ratio - if n_fft is None: - self.n_fft = int(self.info["sfreq"]) - else: - self.n_fft = int(n_fft) - # check if boolean - if not isinstance(return_filtered, (bool)): - raise ValueError("return_filtered must be boolean") - self.return_filtered = return_filtered - self.reg = reg - self.n_components = n_components - self.rank = rank - self.cov_method_params = cov_method_params + self.freqs_signal_ = ( + self.filt_params_signal["l_freq"], + self.filt_params_signal["h_freq"], + ) + self.freqs_noise_ = ( + self.filt_params_noise["l_freq"], + self.filt_params_noise["h_freq"], + ) + _validate_type(self.sort_by_spectral_ratio, (bool,), "sort_by_spectral_ratio") + _validate_type(self.n_fft, ("numeric", None), "n_fft") + self.n_fft_ = min( + int(self.n_fft if self.n_fft is not None else self.sfreq_), + X.shape[-1], + ) + _validate_type(self.return_filtered, (bool,), "return_filtered") + if isinstance(self.info, Info): + ch_types = self.info.get_channel_types(picks=self.picks, unique=True) + if len(ch_types) > 1: + raise ValueError( + "At this point SSD only supports fitting " + f"single channel types. Your info has {len(ch_types)} types." 
+ ) - def _check_X(self, X): + def _check_X(self, X, *, y=None, fit=False): """Check input data.""" - _validate_type(X, np.ndarray, "X") - _check_option("X.ndim", X.ndim, (2, 3)) + X = self._check_data(X, y=y, fit=fit, atleast_3d=False) n_chan = X.shape[-2] - if n_chan != self.info["nchan"]: + if isinstance(self.info, Info) and n_chan != self.info["nchan"]: raise ValueError( "Info must match the input data." f"Found {n_chan} channels but expected {self.info['nchan']}." ) + return X def fit(self, X, y=None): """Estimate the SSD decomposition on raw or epoched data. @@ -176,18 +195,17 @@ def fit(self, X, y=None): self : instance of SSD Returns the modified instance. """ - ch_types = self.info.get_channel_types(picks=self.picks, unique=True) - if len(ch_types) > 1: - raise ValueError( - "At this point SSD only supports fitting " - f"single channel types. Your info has {len(ch_types)} types." - ) - self.picks_ = _picks_to_idx(self.info, self.picks, none="data", exclude="bads") - self._check_X(X) + X = self._check_X(X, y=y, fit=True) + self._validate_params(X) + if isinstance(self.info, Info): + info = self.info + else: + info = create_info(X.shape[-2], self.sfreq_, ch_types="eeg") + self.picks_ = _picks_to_idx(info, self.picks, none="data", exclude="bads") X_aux = X[..., self.picks_, :] - X_signal = filter_data(X_aux, self.info["sfreq"], **self.filt_params_signal) - X_noise = filter_data(X_aux, self.info["sfreq"], **self.filt_params_noise) + X_signal = filter_data(X_aux, self.sfreq_, **self.filt_params_signal) + X_noise = filter_data(X_aux, self.sfreq_, **self.filt_params_noise) X_noise -= X_signal if X.ndim == 3: X_signal = np.hstack(X_signal) @@ -199,19 +217,19 @@ def fit(self, X, y=None): reg=self.reg, method_params=self.cov_method_params, rank="full", - info=self.info, + info=info, ) cov_noise = _regularized_covariance( X_noise, reg=self.reg, method_params=self.cov_method_params, rank="full", - info=self.info, + info=info, ) # project cov to rank subspace 
cov_signal, cov_noise, rank_proj = _dimensionality_reduction( - cov_signal, cov_noise, self.info, self.rank + cov_signal, cov_noise, info, self.rank ) eigvals_, eigvects_ = eigh(cov_signal, cov_noise) @@ -226,10 +244,10 @@ def fit(self, X, y=None): # than the initial ordering. This ordering should be also learned when # fitting. X_ssd = self.filters_.T @ X[..., self.picks_, :] - sorter_spec = Ellipsis + sorter_spec = slice(None) if self.sort_by_spectral_ratio: _, sorter_spec = self.get_spectral_ratio(ssd_sources=X_ssd) - self.sorter_spec = sorter_spec + self.sorter_spec_ = sorter_spec logger.info("Done.") return self @@ -248,17 +266,13 @@ def transform(self, X): X_ssd : array, shape ([n_epochs, ]n_components, n_times) The processed data. """ - self._check_X(X) - if self.filters_ is None: - raise RuntimeError("No filters available. Please first call fit") + check_is_fitted(self, "filters_") + X = self._check_X(X) if self.return_filtered: X_aux = X[..., self.picks_, :] - X = filter_data(X_aux, self.info["sfreq"], **self.filt_params_signal) + X = filter_data(X_aux, self.sfreq_, **self.filt_params_signal) X_ssd = self.filters_.T @ X[..., self.picks_, :] - if X.ndim == 2: - X_ssd = X_ssd[self.sorter_spec][: self.n_components] - else: - X_ssd = X_ssd[:, self.sorter_spec, :][:, : self.n_components, :] + X_ssd = X_ssd[..., self.sorter_spec_, :][..., : self.n_components, :] return X_ssd def fit_transform(self, X, y=None, **fit_params): @@ -308,11 +322,9 @@ def get_spectral_ratio(self, ssd_sources): ---------- .. 
footbibliography:: """ - psd, freqs = psd_array_welch( - ssd_sources, sfreq=self.info["sfreq"], n_fft=self.n_fft - ) - sig_idx = _time_mask(freqs, *self.freqs_signal) - noise_idx = _time_mask(freqs, *self.freqs_noise) + psd, freqs = psd_array_welch(ssd_sources, sfreq=self.sfreq_, n_fft=self.n_fft_) + sig_idx = _time_mask(freqs, *self.freqs_signal_) + noise_idx = _time_mask(freqs, *self.freqs_noise_) if psd.ndim == 3: mean_sig = psd[:, :, sig_idx].mean(axis=2).mean(axis=0) mean_noise = psd[:, :, noise_idx].mean(axis=2).mean(axis=0) @@ -352,7 +364,7 @@ def apply(self, X): The processed data. """ X_ssd = self.transform(X) - pick_patterns = self.patterns_[self.sorter_spec][: self.n_components].T + pick_patterns = self.patterns_[self.sorter_spec_][: self.n_components].T X = pick_patterns @ X_ssd return X diff --git a/mne/decoding/tests/test_base.py b/mne/decoding/tests/test_base.py index 6d915dd24f9..02ca392e590 100644 --- a/mne/decoding/tests/test_base.py +++ b/mne/decoding/tests/test_base.py @@ -86,6 +86,8 @@ def _make_data(n_samples=1000, n_features=5, n_targets=3): X = Y.dot(A.T) X += np.random.randn(n_samples, n_features) # add noise X += np.random.rand(n_features) # Put an offset + if n_targets == 1: + Y = Y[:, 0] return X, Y, A @@ -95,7 +97,7 @@ def test_get_coef(): """Test getting linear coefficients (filters/patterns) from estimators.""" lm_classification = LinearModel() assert hasattr(lm_classification, "__sklearn_tags__") - print(lm_classification.__sklearn_tags__) + print(lm_classification.__sklearn_tags__()) assert is_classifier(lm_classification.model) assert is_classifier(lm_classification) assert not is_regressor(lm_classification.model) @@ -473,9 +475,8 @@ def test_cross_val_multiscore(): def test_sklearn_compliance(estimator, check): """Test LinearModel compliance with sklearn.""" ignores = ( - "check_n_features_in", # maybe we should add this someday? 
- "check_estimator_sparse_data", # we densify "check_estimators_overwrite_params", # self.model changes! + "check_dont_overwrite_parameters", "check_parameters_default_constructible", ) if any(ignore in str(check) for ignore in ignores): diff --git a/mne/decoding/tests/test_csp.py b/mne/decoding/tests/test_csp.py index 7a1a83feeaf..d6374304d28 100644 --- a/mne/decoding/tests/test_csp.py +++ b/mne/decoding/tests/test_csp.py @@ -19,6 +19,7 @@ from sklearn.model_selection import StratifiedKFold, cross_val_score from sklearn.pipeline import Pipeline, make_pipeline from sklearn.svm import SVC +from sklearn.utils.estimator_checks import parametrize_with_checks from mne import Epochs, compute_proj_raw, io, pick_types, read_events from mne.decoding import CSP, LinearModel, Scaler, SPoC, get_coef @@ -139,18 +140,22 @@ def test_csp(): y = epochs.events[:, -1] # Init - pytest.raises(ValueError, CSP, n_components="foo", norm_trace=False) + csp = CSP(n_components="foo") + with pytest.raises(TypeError, match="must be an instance"): + csp.fit(epochs_data, y) for reg in ["foo", -0.1, 1.1]: csp = CSP(reg=reg, norm_trace=False) pytest.raises(ValueError, csp.fit, epochs_data, epochs.events[:, -1]) for reg in ["oas", "ledoit_wolf", 0, 0.5, 1.0]: CSP(reg=reg, norm_trace=False) - for cov_est in ["foo", None]: - pytest.raises(ValueError, CSP, cov_est=cov_est, norm_trace=False) + csp = CSP(cov_est="foo", norm_trace=False) + with pytest.raises(ValueError, match="Invalid value"): + csp.fit(epochs_data, y) + csp = CSP(norm_trace="foo") with pytest.raises(TypeError, match="instance of bool"): - CSP(norm_trace="foo") + csp.fit(epochs_data, y) for cov_est in ["concat", "epoch"]: - CSP(cov_est=cov_est, norm_trace=False) + CSP(cov_est=cov_est, norm_trace=False).fit(epochs_data, y) n_components = 3 # Fit @@ -171,8 +176,8 @@ def test_csp(): # Test data exception pytest.raises(ValueError, csp.fit, epochs_data, np.zeros_like(epochs.events)) - pytest.raises(ValueError, csp.fit, epochs, y) - 
pytest.raises(ValueError, csp.transform, epochs) + pytest.raises(ValueError, csp.fit, "foo", y) + pytest.raises(ValueError, csp.transform, "foo") # Test plots epochs.pick(picks="mag") @@ -200,7 +205,7 @@ def test_csp(): for cov_est in ["concat", "epoch"]: csp = CSP(n_components=n_components, cov_est=cov_est, norm_trace=False) csp.fit(epochs_data, epochs.events[:, 2]).transform(epochs_data) - assert_equal(len(csp._classes), 4) + assert_equal(len(csp.classes_), 4) assert_array_equal(csp.filters_.shape, [n_channels, n_channels]) assert_array_equal(csp.patterns_.shape, [n_channels, n_channels]) @@ -220,15 +225,17 @@ def test_csp(): # Different normalization return different transform assert np.sum((X_trans["True"] - X_trans["False"]) ** 2) > 1.0 # Check wrong inputs - pytest.raises(ValueError, CSP, transform_into="average_power", log="foo") + csp = CSP(transform_into="average_power", log="foo") + with pytest.raises(TypeError, match="must be an instance of bool"): + csp.fit(epochs_data, epochs.events[:, 2]) # Test csp space transform csp = CSP(transform_into="csp_space", norm_trace=False) assert csp.transform_into == "csp_space" for log in ("foo", True, False): - pytest.raises( - ValueError, CSP, transform_into="csp_space", log=log, norm_trace=False - ) + csp = CSP(transform_into="csp_space", log=log, norm_trace=False) + with pytest.raises(TypeError, match="must be an instance"): + csp.fit(epochs_data, epochs.events[:, 2]) n_components = 2 csp = CSP(n_components=n_components, transform_into="csp_space", norm_trace=False) Xt = csp.fit(epochs_data, epochs.events[:, 2]).transform(epochs_data) @@ -343,7 +350,7 @@ def test_regularized_csp(ch_type, rank, reg): # test init exception pytest.raises(ValueError, csp.fit, epochs_data, np.zeros_like(epochs.events)) - pytest.raises(ValueError, csp.fit, epochs, y) + pytest.raises(ValueError, csp.fit, "foo", y) pytest.raises(ValueError, csp.transform, epochs) csp.n_components = n_components @@ -465,7 +472,9 @@ def 
test_csp_component_ordering(): """Test that CSP component ordering works as expected.""" x, y = deterministic_toy_data(["class_a", "class_b"]) - pytest.raises(ValueError, CSP, component_order="invalid") + csp = CSP(component_order="invalid") + with pytest.raises(ValueError, match="Invalid value"): + csp.fit(x, y) # component_order='alternate' only works with two classes csp = CSP(component_order="alternate") @@ -480,3 +489,10 @@ def test_csp_component_ordering(): # p_alt arranges them to [0.8, 0.06, 0.5, 0.1] # p_mut arranges them to [0.06, 0.1, 0.8, 0.5] assert_array_almost_equal(p_alt, p_mut[[2, 0, 3, 1]]) + + +@pytest.mark.filterwarnings("ignore:.*Only one sample available.*") +@parametrize_with_checks([CSP(), SPoC()]) +def test_sklearn_compliance(estimator, check): + """Test compliance with sklearn.""" + check(estimator) diff --git a/mne/decoding/tests/test_ems.py b/mne/decoding/tests/test_ems.py index 10774c0681a..dc54303a541 100644 --- a/mne/decoding/tests/test_ems.py +++ b/mne/decoding/tests/test_ems.py @@ -11,6 +11,7 @@ pytest.importorskip("sklearn") from sklearn.model_selection import StratifiedKFold +from sklearn.utils.estimator_checks import parametrize_with_checks from mne import Epochs, io, pick_types, read_events from mne.decoding import EMS, compute_ems @@ -91,3 +92,9 @@ def test_ems(): assert_equal(ems.__repr__(), "") assert_array_almost_equal(filters, np.mean(coefs, axis=0)) assert_array_almost_equal(surrogates, np.vstack(Xt)) + + +@parametrize_with_checks([EMS()]) +def test_sklearn_compliance(estimator, check): + """Test compliance with sklearn.""" + check(estimator) diff --git a/mne/decoding/tests/test_search_light.py b/mne/decoding/tests/test_search_light.py index 7cb3a66dd81..e7abfd9209e 100644 --- a/mne/decoding/tests/test_search_light.py +++ b/mne/decoding/tests/test_search_light.py @@ -41,7 +41,7 @@ def make_data(): return X, y -def test_search_light(): +def test_search_light_basic(): """Test SlidingEstimator.""" # 
https://github.com/scikit-learn/scikit-learn/issues/27711 if platform.system() == "Windows" and check_version("numpy", "2.0.0.dev0"): @@ -52,7 +52,9 @@ def test_search_light(): X, y = make_data() n_epochs, _, n_time = X.shape # init - pytest.raises(ValueError, SlidingEstimator, "foo") + sl = SlidingEstimator("foo") + with pytest.raises(ValueError, match="must be"): + sl.fit(X, y) sl = SlidingEstimator(Ridge()) assert not is_classifier(sl) sl = SlidingEstimator(LogisticRegression(solver="liblinear")) @@ -69,7 +71,8 @@ def test_search_light(): # transforms pytest.raises(ValueError, sl.predict, X[:, :, :2]) y_trans = sl.transform(X) - assert X.dtype == y_trans.dtype == np.dtype(float) + assert X.dtype == float + assert y_trans.dtype == float y_pred = sl.predict(X) assert y_pred.dtype == np.dtype(int) assert_array_equal(y_pred.shape, [n_epochs, n_time]) @@ -344,22 +347,19 @@ def predict_proba(self, X): @pytest.mark.slowtest -@parametrize_with_checks([SlidingEstimator(LogisticRegression(), allow_2d=True)]) +@parametrize_with_checks( + [ + SlidingEstimator(LogisticRegression(), allow_2d=True), + GeneralizingEstimator(LogisticRegression(), allow_2d=True), + ] +) def test_sklearn_compliance(estimator, check): """Test LinearModel compliance with sklearn.""" ignores = ( - "check_estimator_sparse_data", # we densify - "check_classifiers_one_label_sample_weights", # don't handle singleton - "check_classifiers_classes", # dim mismatch + # TODO: we don't handle singleton right (probably) + "check_classifiers_one_label_sample_weights", + "check_classifiers_classes", "check_classifiers_train", - "check_decision_proba_consistency", - "check_parameters_default_constructible", - # Should probably fix these? 
- "check_estimators_unfitted", - "check_transformer_data_not_an_array", - "check_n_features_in", - "check_fit2d_predict1d", - "check_do_not_raise_errors_in_init_or_set_params", ) if any(ignore in str(check) for ignore in ignores): return diff --git a/mne/decoding/tests/test_ssd.py b/mne/decoding/tests/test_ssd.py index 8f4d2472803..d1ac793e65b 100644 --- a/mne/decoding/tests/test_ssd.py +++ b/mne/decoding/tests/test_ssd.py @@ -11,6 +11,7 @@ pytest.importorskip("sklearn") from sklearn.pipeline import Pipeline +from sklearn.utils.estimator_checks import parametrize_with_checks from mne import create_info, io from mne.decoding import CSP @@ -101,8 +102,9 @@ def test_ssd(): l_trans_bandwidth=1, h_trans_bandwidth=1, ) + ssd = SSD(info, filt_params_signal, filt_params_noise) with pytest.raises(TypeError, match="must be an instance "): - ssd = SSD(info, filt_params_signal, filt_params_noise) + ssd.fit(X) # Wrongly specified noise band freq = 2 @@ -115,14 +117,16 @@ def test_ssd(): l_trans_bandwidth=1, h_trans_bandwidth=1, ) + ssd = SSD(info, filt_params_signal, filt_params_noise) with pytest.raises(ValueError, match="Wrongly specified "): - ssd = SSD(info, filt_params_signal, filt_params_noise) + ssd.fit(X) # filt param no dict filt_params_signal = freqs_sig filt_params_noise = freqs_noise + ssd = SSD(info, filt_params_signal, filt_params_noise) with pytest.raises(ValueError, match="must be defined"): - ssd = SSD(info, filt_params_signal, filt_params_noise) + ssd.fit(X) # Data type filt_params_signal = dict( @@ -140,15 +144,18 @@ def test_ssd(): ssd = SSD(info, filt_params_signal, filt_params_noise) raw = io.RawArray(X, info) - pytest.raises(TypeError, ssd.fit, raw) + with pytest.raises(ValueError): + ssd.fit(raw) # check non-boolean return_filtered - with pytest.raises(ValueError, match="return_filtered"): - ssd = SSD(info, filt_params_signal, filt_params_noise, return_filtered=0) + ssd = SSD(info, filt_params_signal, filt_params_noise, return_filtered=0) + with 
pytest.raises(TypeError, match="return_filtered"): + ssd.fit(X) # check non-boolean sort_by_spectral_ratio - with pytest.raises(ValueError, match="sort_by_spectral_ratio"): - ssd = SSD(info, filt_params_signal, filt_params_noise, sort_by_spectral_ratio=0) + ssd = SSD(info, filt_params_signal, filt_params_noise, sort_by_spectral_ratio=0) + with pytest.raises(TypeError, match="sort_by_spectral_ratio"): + ssd.fit(X) # More than 1 channel type ch_types = np.reshape([["mag"] * 10, ["eeg"] * 10], n_channels) @@ -161,7 +168,8 @@ def test_ssd(): # Number of channels info_3 = create_info(ch_names=n_channels + 1, sfreq=sf, ch_types="eeg") ssd = SSD(info_3, filt_params_signal, filt_params_noise) - pytest.raises(ValueError, ssd.fit, X) + with pytest.raises(ValueError, match="channels but expected"): + ssd.fit(X) # Fit n_components = 10 @@ -381,7 +389,7 @@ def test_sorting(): ssd.fit(Xtr) # check sorters - sorter_in = ssd.sorter_spec + sorter_in = ssd.sorter_spec_ ssd = SSD( info, filt_params_signal, @@ -476,3 +484,28 @@ def test_non_full_rank_data(): if sys.platform == "darwin": pytest.xfail("Unknown linalg bug (Accelerate?)") ssd.fit(X) + + +@pytest.mark.filterwarnings("ignore:.*is longer than.*") +@parametrize_with_checks( + [ + SSD( + 100.0, + dict(l_freq=0.0, h_freq=30.0), + dict(l_freq=0.0, h_freq=40.0), + ) + ] +) +def test_sklearn_compliance(estimator, check): + """Test LinearModel compliance with sklearn.""" + ignores = ( + "check_methods_sample_order_invariance", + # Shape stuff + "check_fit_idempotent", + "check_methods_subset_invariance", + "check_transformer_general", + "check_transformer_data_not_an_array", + ) + if any(ignore in str(check) for ignore in ignores): + return + check(estimator) diff --git a/mne/decoding/tests/test_time_frequency.py b/mne/decoding/tests/test_time_frequency.py index 37e7d7d8dc2..29ba5e7597a 100644 --- a/mne/decoding/tests/test_time_frequency.py +++ b/mne/decoding/tests/test_time_frequency.py @@ -10,18 +10,23 @@ 
pytest.importorskip("sklearn") from sklearn.base import clone +from sklearn.utils.estimator_checks import parametrize_with_checks from mne.decoding.time_frequency import TimeFrequency -def test_timefrequency(): +def test_timefrequency_basic(): """Test TimeFrequency.""" # Init n_freqs = 3 freqs = [20, 21, 22] tf = TimeFrequency(freqs, sfreq=100) + n_epochs, n_chans, n_times = 10, 2, 100 + X = np.random.rand(n_epochs, n_chans, n_times) for output in ["avg_power", "foo", None]: - pytest.raises(ValueError, TimeFrequency, freqs, output=output) + tf = TimeFrequency(freqs, output=output) + with pytest.raises(ValueError, match="Invalid value"): + tf.fit(X) tf = clone(tf) # Clone estimator @@ -30,8 +35,6 @@ def test_timefrequency(): clone(tf) # Fit - n_epochs, n_chans, n_times = 10, 2, 100 - X = np.random.rand(n_epochs, n_chans, n_times) tf.fit(X, None) # Transform @@ -41,9 +44,15 @@ def test_timefrequency(): Xt = tf.transform(X) assert_array_equal(Xt.shape, [n_epochs, n_chans, n_freqs, n_times]) # 2-D X - Xt = tf.transform(X[:, 0, :]) + Xt = tf.fit_transform(X[:, 0, :]) assert_array_equal(Xt.shape, [n_epochs, n_freqs, n_times]) # 3-D with decim tf = TimeFrequency(freqs, sfreq=100, decim=2) - Xt = tf.transform(X) + Xt = tf.fit_transform(X) assert_array_equal(Xt.shape, [n_epochs, n_chans, n_freqs, n_times // 2]) + + +@parametrize_with_checks([TimeFrequency([300, 400], 1000.0, n_cycles=0.25)]) +def test_sklearn_compliance(estimator, check): + """Test LinearModel compliance with sklearn.""" + check(estimator) diff --git a/mne/decoding/tests/test_transformer.py b/mne/decoding/tests/test_transformer.py index d7b974fe932..49beffef788 100644 --- a/mne/decoding/tests/test_transformer.py +++ b/mne/decoding/tests/test_transformer.py @@ -37,7 +37,7 @@ data_dir = Path(__file__).parents[2] / "io" / "tests" / "data" raw_fname = data_dir / "test_raw.fif" event_name = data_dir / "test-eve.fif" -info = create_info(3, 1000.0, "eeg") +info = create_info(2, 1000.0, "eeg") 
@pytest.mark.parametrize( @@ -103,9 +103,11 @@ def test_scaler(info, method): assert_array_almost_equal(epochs_data, Xi) # Test init exception - pytest.raises(ValueError, Scaler, None, None) - pytest.raises(TypeError, scaler.fit, epochs, y) - pytest.raises(TypeError, scaler.transform, epochs) + x = Scaler(None, None) + with pytest.raises(ValueError): + x.fit(epochs_data, y) + pytest.raises(ValueError, scaler.fit, "foo", y) + pytest.raises(ValueError, scaler.transform, "foo") epochs_bad = Epochs( raw, events, @@ -166,8 +168,8 @@ def test_filterestimator(): X = filt.fit_transform(epochs_data, y) # Test init exception - pytest.raises(ValueError, filt.fit, epochs, y) - pytest.raises(ValueError, filt.transform, epochs) + pytest.raises(ValueError, filt.fit, "foo", y) + pytest.raises(ValueError, filt.transform, "foo") def test_psdestimator(): @@ -190,8 +192,10 @@ def test_psdestimator(): assert_array_equal(psd.fit(epochs_data, y).transform(epochs_data), X) # Test init exception - pytest.raises(ValueError, psd.fit, epochs, y) - pytest.raises(ValueError, psd.transform, epochs) + with pytest.raises(ValueError): + psd.fit("foo", y) + with pytest.raises(ValueError): + psd.transform("foo") def test_vectorizer(): @@ -237,11 +241,13 @@ def test_unsupervised_spatial_filter(): verbose=False, ) - # Test estimator - pytest.raises(ValueError, UnsupervisedSpatialFilter, KernelRidge(2)) + # Test estimator (must be a transformer) + X = epochs.get_data(copy=False) + usf = UnsupervisedSpatialFilter(KernelRidge(2)) + with pytest.raises(ValueError, match="transform"): + usf.fit(X) # Test fit - X = epochs.get_data(copy=False) n_components = 4 usf = UnsupervisedSpatialFilter(PCA(n_components)) usf.fit(X) @@ -257,7 +263,9 @@ def test_unsupervised_spatial_filter(): # Test with average param usf = UnsupervisedSpatialFilter(PCA(4), average=True) usf.fit_transform(X) - pytest.raises(ValueError, UnsupervisedSpatialFilter, PCA(4), 2) + usf = UnsupervisedSpatialFilter(PCA(4), 2) + with 
pytest.raises(TypeError, match="average must be"): + usf.fit(X) def test_temporal_filter(): @@ -283,8 +291,8 @@ def test_temporal_filter(): assert X.shape == Xt.shape # Test fit and transform numpy type check - with pytest.raises(ValueError, match="Data to be filtered must be"): - filt.transform([1, 2]) + with pytest.raises(ValueError): + filt.transform("foo") # Test with 2 dimensional data array X = np.random.rand(101, 500) @@ -307,28 +315,31 @@ def test_bad_triage(): @parametrize_with_checks( [ FilterEstimator(info, l_freq=1, h_freq=10), - # PSDEstimator(), - # Scaler(info=info), - # TemporalFilter(), - # UnsupervisedSpatialFilter(PCA()), - # Vectorizer(), + PSDEstimator(), + Scaler(scalings="mean"), + # Not easy to test Scaler(info) b/c number of channels must match + TemporalFilter(), + UnsupervisedSpatialFilter(PCA()), + Vectorizer(), ] ) def test_sklearn_compliance(estimator, check): """Test LinearModel compliance with sklearn.""" - ignores = ( - # TODO: Should probably add these - "check_transformer_general", - "check_estimators_overwrite_params", - "check_fit_idempotent", - "check_fit1d", - "check_fit2d_predict1d", - "check_n_features_in", # maybe we should add this someday? - "check_estimator_sparse_data", # we densify - "check_estimators_overwrite_params", # self.model changes! 
- "check_parameters_default_constructible", - "check_methods_sample_order_invariance", - ) + ignores = [] + if estimator.__class__.__name__ == "FilterEstimator": + ignores += [ + "check_estimators_overwrite_params", # we modify self.info + "check_methods_sample_order_invariance", + ] + if estimator.__class__.__name__ == "UnsupervisedSpatialFilter": + ignores += [ + "check_estimators_overwrite_params", # we modify self.estimator + ] + if estimator.__class__.__name__.startswith(("PSD", "Temporal")): + ignores += [ + "check_transformers_unfitted", # allow unfitted transform + "check_methods_sample_order_invariance", + ] if any(ignore in str(check) for ignore in ignores): return check(estimator) diff --git a/mne/decoding/time_frequency.py b/mne/decoding/time_frequency.py index de6ec52155b..5db2580498a 100644 --- a/mne/decoding/time_frequency.py +++ b/mne/decoding/time_frequency.py @@ -3,14 +3,16 @@ # Copyright the MNE-Python contributors. import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.utils.validation import check_is_fitted from ..time_frequency.tfr import _compute_tfr -from ..utils import _check_option, fill_doc, verbose +from ..utils import _check_option, fill_doc +from .transformer import MNETransformerMixin @fill_doc -class TimeFrequency(TransformerMixin, BaseEstimator): +class TimeFrequency(MNETransformerMixin, BaseEstimator): """Time frequency transformer. Time-frequency transform of times series along the last axis. 
@@ -59,7 +61,7 @@ class TimeFrequency(TransformerMixin, BaseEstimator): mne.time_frequency.tfr_multitaper """ - @verbose + @fill_doc # sklearn compliance def __init__( self, freqs, @@ -74,9 +76,6 @@ def __init__( verbose=None, ): """Init TimeFrequency transformer.""" - # Check non-average output - output = _check_option("output", output, ["complex", "power", "phase"]) - self.freqs = freqs self.sfreq = sfreq self.method = method @@ -89,6 +88,16 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + def __sklearn_tags__(self): + """Return sklearn tags.""" + out = super().__sklearn_tags__() + from sklearn.utils import TransformerTags + + if out.transformer_tags is None: + out.transformer_tags = TransformerTags() + out.transformer_tags.preserves_dtype = [] # real->complex + return out + def fit_transform(self, X, y=None): """Time-frequency transform of times series along the last axis. @@ -123,6 +132,10 @@ def fit(self, X, y=None): # noqa: D401 self : object Return self. """ + # Check non-average output + _check_option("output", self.output, ["complex", "power", "phase"]) + self._check_data(X, y=y, fit=True) + self.fitted_ = True return self def transform(self, X): @@ -130,16 +143,18 @@ def transform(self, X): Parameters ---------- - X : array, shape (n_samples, n_channels, n_times) + X : array, shape (n_samples, [n_channels, ]n_times) The training data samples. The channel dimension can be zero- or 1-dimensional. Returns ------- - Xt : array, shape (n_samples, n_channels, n_freqs, n_times) + Xt : array, shape (n_samples, [n_channels, ]n_freqs, n_times) The time-frequency transform of the data, where n_channels can be zero- or 1-dimensional. 
""" + X = self._check_data(X, atleast_3d=False) + check_is_fitted(self, "fitted_") # Ensure 3-dimensional X shape = X.shape[1:-1] if not shape: diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index f34f049fdf2..4d28155ab86 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -4,6 +4,9 @@ import numpy as np from sklearn.base import BaseEstimator, TransformerMixin, check_array +from sklearn.preprocessing import RobustScaler, StandardScaler +from sklearn.utils import check_X_y +from sklearn.utils.validation import check_is_fitted, validate_data from .._fiff.pick import ( _pick_data_channels, @@ -14,7 +17,51 @@ from ..cov import _check_scalings_user from ..filter import filter_data from ..time_frequency import psd_array_multitaper -from ..utils import _check_option, _validate_type, fill_doc, verbose +from ..utils import _check_option, _validate_type, fill_doc + + +class MNETransformerMixin(TransformerMixin): + """TransformerMixin plus some helpers.""" + + def _check_data( + self, + epochs_data, + *, + y=None, + atleast_3d=True, + fit=False, + return_y=False, + multi_output=False, + check_n_features=True, + ): + kwargs = dict(dtype=np.float64, allow_nd=True, order="C", force_writeable=True) + if hasattr(self, "n_features_in_") and check_n_features: + if y is None: + epochs_data = validate_data( + self, + epochs_data, + **kwargs, + reset=fit, + ) + else: + epochs_data, y = validate_data( + self, + epochs_data, + y, + **kwargs, + reset=fit, + ) + elif y is None: + epochs_data = check_array(epochs_data, **kwargs) + else: + epochs_data, y = check_X_y( + X=epochs_data, y=y, multi_output=multi_output, **kwargs + ) + if fit: + self.n_features_in_ = epochs_data.shape[1] + if atleast_3d: + epochs_data = np.atleast_3d(epochs_data) + return (epochs_data, y) if return_y else epochs_data class _ConstantScaler: @@ -54,8 +101,9 @@ def fit_transform(self, X, y=None): def _sklearn_reshape_apply(func, return_result, X, *args, **kwargs): 
"""Reshape epochs and apply function.""" - if not isinstance(X, np.ndarray): - raise ValueError(f"data should be an np.ndarray, got {type(X)}.") + _validate_type(X, np.ndarray, "X") + if X.size == 0: + return X.copy() if return_result else None orig_shape = X.shape X = np.reshape(X.transpose(0, 2, 1), (-1, orig_shape[1])) X = func(X, *args, **kwargs) @@ -66,7 +114,7 @@ def _sklearn_reshape_apply(func, return_result, X, *args, **kwargs): @fill_doc -class Scaler(TransformerMixin, BaseEstimator): +class Scaler(MNETransformerMixin, BaseEstimator): """Standardize channel data. This class scales data for each channel. It differs from scikit-learn @@ -108,31 +156,6 @@ def __init__(self, info=None, scalings=None, with_mean=True, with_std=True): self.with_std = with_std self.scalings = scalings - if not (scalings is None or isinstance(scalings, dict | str)): - raise ValueError( - f"scalings type should be dict, str, or None, got {type(scalings)}" - ) - if isinstance(scalings, str): - _check_option("scalings", scalings, ["mean", "median"]) - if scalings is None or isinstance(scalings, dict): - if info is None: - raise ValueError( - f'Need to specify "info" if scalings is {type(scalings)}' - ) - self._scaler = _ConstantScaler(info, scalings, self.with_std) - elif scalings == "mean": - from sklearn.preprocessing import StandardScaler - - self._scaler = StandardScaler( - with_mean=self.with_mean, with_std=self.with_std - ) - else: # scalings == 'median': - from sklearn.preprocessing import RobustScaler - - self._scaler = RobustScaler( - with_centering=self.with_mean, with_scaling=self.with_std - ) - def fit(self, epochs_data, y=None): """Standardize data across channels. @@ -148,11 +171,30 @@ def fit(self, epochs_data, y=None): self : instance of Scaler The modified instance. 
""" - _validate_type(epochs_data, np.ndarray, "epochs_data") - if epochs_data.ndim == 2: - epochs_data = epochs_data[..., np.newaxis] + epochs_data = self._check_data(epochs_data, y=y, fit=True, multi_output=True) assert epochs_data.ndim == 3, epochs_data.shape - _sklearn_reshape_apply(self._scaler.fit, False, epochs_data, y=y) + + _validate_type(self.scalings, (dict, str, type(None)), "scalings") + if isinstance(self.scalings, str): + _check_option( + "scalings", self.scalings, ["mean", "median"], extra="when str" + ) + if self.scalings is None or isinstance(self.scalings, dict): + if self.info is None: + raise ValueError( + f'Need to specify "info" if scalings is {type(self.scalings)}' + ) + self.scaler_ = _ConstantScaler(self.info, self.scalings, self.with_std) + elif self.scalings == "mean": + self.scaler_ = StandardScaler( + with_mean=self.with_mean, with_std=self.with_std + ) + else: # scalings == 'median': + self.scaler_ = RobustScaler( + with_centering=self.with_mean, with_scaling=self.with_std + ) + + _sklearn_reshape_apply(self.scaler_.fit, False, epochs_data, y=y) return self def transform(self, epochs_data): @@ -173,13 +215,14 @@ def transform(self, epochs_data): This function makes a copy of the data before the operations and the memory usage may be large with big data. """ - _validate_type(epochs_data, np.ndarray, "epochs_data") + check_is_fitted(self, "scaler_") + epochs_data = self._check_data(epochs_data, atleast_3d=False) if epochs_data.ndim == 2: # can happen with SlidingEstimator if self.info is not None: assert len(self.info["ch_names"]) == epochs_data.shape[1] epochs_data = epochs_data[..., np.newaxis] assert epochs_data.ndim == 3, epochs_data.shape - return _sklearn_reshape_apply(self._scaler.transform, True, epochs_data) + return _sklearn_reshape_apply(self.scaler_.transform, True, epochs_data) def fit_transform(self, epochs_data, y=None): """Fit to data, then transform it. 
@@ -225,19 +268,20 @@ def inverse_transform(self, epochs_data): This function makes a copy of the data before the operations and the memory usage may be large with big data. """ + epochs_data = self._check_data(epochs_data, atleast_3d=False) squeeze = False # Can happen with CSP if epochs_data.ndim == 2: squeeze = True epochs_data = epochs_data[..., np.newaxis] assert epochs_data.ndim == 3, epochs_data.shape - out = _sklearn_reshape_apply(self._scaler.inverse_transform, True, epochs_data) + out = _sklearn_reshape_apply(self.scaler_.inverse_transform, True, epochs_data) if squeeze: out = out[..., 0] return out -class Vectorizer(TransformerMixin, BaseEstimator): +class Vectorizer(MNETransformerMixin, BaseEstimator): """Transform n-dimensional array into 2D array of n_samples by n_features. This class reshapes an n-dimensional array into an n_samples * n_features @@ -274,7 +318,7 @@ def fit(self, X, y=None): self : instance of Vectorizer Return the modified instance. """ - X = np.asarray(X) + X = self._check_data(X, y=y, atleast_3d=False, fit=True, check_n_features=False) self.features_shape_ = X.shape[1:] return self @@ -294,7 +338,7 @@ def transform(self, X): X : array, shape (n_samples, n_features) The transformed data. """ - X = np.asarray(X) + X = self._check_data(X, atleast_3d=False) if X.shape[1:] != self.features_shape_: raise ValueError("Shape of X used in fit and transform must be same") return X.reshape(len(X), -1) @@ -333,7 +377,7 @@ def inverse_transform(self, X): The data transformed into shape as used in fit. The first dimension is of length n_samples. 
""" - X = np.asarray(X) + X = self._check_data(X, atleast_3d=False, check_n_features=False) if X.ndim not in (2, 3): raise ValueError( f"X should be of 2 or 3 dimensions but has shape {X.shape}" @@ -342,7 +386,7 @@ def inverse_transform(self, X): @fill_doc -class PSDEstimator(TransformerMixin, BaseEstimator): +class PSDEstimator(MNETransformerMixin, BaseEstimator): """Compute power spectral density (PSD) using a multi-taper method. Parameters @@ -374,7 +418,7 @@ class PSDEstimator(TransformerMixin, BaseEstimator): mne.Evoked.compute_psd """ - @verbose + @fill_doc # NB intentionally don't use verbose here so it doesn't modify the val def __init__( self, sfreq=2 * np.pi, @@ -398,7 +442,7 @@ def __init__( self.normalization = normalization self.verbose = verbose - def fit(self, epochs_data, y): + def fit(self, epochs_data, y=None): """Compute power spectral density (PSD) using a multi-taper method. Parameters @@ -413,14 +457,11 @@ def fit(self, epochs_data, y): self : instance of PSDEstimator The modified instance. """ - if not isinstance(epochs_data, np.ndarray): - raise ValueError( - f"epochs_data should be of type ndarray (got {type(epochs_data)})." - ) - + self._check_data(epochs_data, y=y, fit=True) + self.fitted_ = True # sklearn compliance return self - def transform(self, epochs_data): + def transform(self, epochs_data, y=None): """Compute power spectral density (PSD) using a multi-taper method. Parameters @@ -433,10 +474,7 @@ def transform(self, epochs_data): psd : array, shape (n_signals, n_freqs) or (n_freqs,) The computed PSD. """ - if not isinstance(epochs_data, np.ndarray): - raise ValueError( - f"epochs_data should be of type ndarray (got {type(epochs_data)})." 
- ) + epochs_data = self._check_data(epochs_data) psd, _ = psd_array_multitaper( epochs_data, sfreq=self.sfreq, @@ -452,7 +490,7 @@ def transform(self, epochs_data): @fill_doc -class FilterEstimator(TransformerMixin, BaseEstimator): +class FilterEstimator(MNETransformerMixin, BaseEstimator): """Estimator to filter RtEpochs. Applies a zero-phase low-pass, high-pass, band-pass, or band-stop @@ -530,13 +568,6 @@ def __init__( self.fir_design = fir_design self.verbose = verbose - def _check_data(self, epochs_data): - epochs_data = check_array( - epochs_data, dtype=np.float64, ensure_2d=False, order="C" - ) - epochs_data = np.atleast_3d(epochs_data) - return epochs_data - def fit(self, epochs_data, y): """Filter data. @@ -553,7 +584,7 @@ def fit(self, epochs_data, y): The modified instance. """ self.picks_ = _picks_to_idx(self.info, self.picks) - self._check_data(epochs_data) + self._check_data(epochs_data, y=y, fit=True) if self.l_freq == 0: self.l_freq = None @@ -607,7 +638,7 @@ def transform(self, epochs_data): ) -class UnsupervisedSpatialFilter(TransformerMixin, BaseEstimator): +class UnsupervisedSpatialFilter(MNETransformerMixin, BaseEstimator): """Use unsupervised spatial filtering across time and samples. Parameters @@ -620,19 +651,6 @@ class UnsupervisedSpatialFilter(TransformerMixin, BaseEstimator): """ def __init__(self, estimator, average=False): - # XXX: Use _check_estimator #3381 - for attr in ("fit", "transform", "fit_transform"): - if not hasattr(estimator, attr): - raise ValueError( - "estimator must be a scikit-learn " - f"transformer, missing {attr} method" - ) - - if not isinstance(average, bool): - raise ValueError( - f"average parameter must be of bool type, got {type(bool)} instead" - ) - self.estimator = estimator self.average = average @@ -651,13 +669,28 @@ def fit(self, X, y=None): self : instance of UnsupervisedSpatialFilter Return the modified instance. 
""" + # sklearn.utils.estimator_checks.check_estimator(self.estimator) is probably + # too strict for us, given that we don't fully adhere yet, so just check attrs + for attr in ("fit", "transform", "fit_transform"): + if not hasattr(self.estimator, attr): + raise ValueError( + "estimator must be a scikit-learn " + f"transformer, missing {attr} method" + ) + _validate_type(self.average, bool, "average") + X = self._check_data(X, y=y, fit=True) if self.average: X = np.mean(X, axis=0).T else: n_epochs, n_channels, n_times = X.shape # trial as time samples X = np.transpose(X, (1, 0, 2)).reshape((n_channels, n_epochs * n_times)).T + + # Really shouldn't modify self.estimator, but it's a bad backward compat problem + # to change it (will break people's pipelines) so leave it for now and just + # wrap to self.estimator_ as well self.estimator.fit(X) + self.estimator_ = self.estimator return self def fit_transform(self, X, y=None): @@ -690,6 +723,8 @@ def transform(self, X): X : array, shape (n_epochs, n_channels, n_times) The transformed data. """ + check_is_fitted(self.estimator_) + X = self._check_data(X) return self._apply_method(X, "transform") def inverse_transform(self, X): @@ -725,7 +760,7 @@ def _apply_method(self, X, method): X = np.transpose(X, [1, 0, 2]) X = np.reshape(X, [n_channels, n_epochs * n_times]).T # apply method - method = getattr(self.estimator, method) + method = getattr(self.estimator_, method) X = method(X) # put it back to n_epochs, n_dimensions X = np.reshape(X.T, [-1, n_epochs, n_times]).transpose([1, 0, 2]) @@ -733,7 +768,7 @@ def _apply_method(self, X, method): @fill_doc -class TemporalFilter(TransformerMixin, BaseEstimator): +class TemporalFilter(MNETransformerMixin, BaseEstimator): """Estimator to filter data array along the last dimension. 
Applies a zero-phase low-pass, high-pass, band-pass, or band-stop @@ -816,7 +851,7 @@ class TemporalFilter(TransformerMixin, BaseEstimator): mne.filter.filter_data """ - @verbose + @fill_doc # sklearn compliance def __init__( self, l_freq=None, @@ -846,17 +881,12 @@ def __init__( self.fir_design = fir_design self.verbose = verbose - if not isinstance(self.n_jobs, int) and self.n_jobs == "cuda": - raise ValueError( - f'n_jobs must be int or "cuda", got {type(self.n_jobs)} instead.' - ) - def fit(self, X, y=None): """Do nothing (for scikit-learn compatibility purposes). Parameters ---------- - X : array, shape (n_epochs, n_channels, n_times) or or shape (n_channels, n_times) + X : array, shape ([n_epochs, ]n_channels, n_times) The data to be filtered over the last dimension. The channels dimension can be zero when passing a 2D array. y : None @@ -866,7 +896,9 @@ def fit(self, X, y=None): ------- self : instance of TemporalFilter The modified instance. - """ # noqa: E501 + """ + self.fitted_ = True # sklearn compliance + self._check_data(X, y=y, atleast_3d=False, fit=True) return self def transform(self, X): @@ -874,7 +906,7 @@ def transform(self, X): Parameters ---------- - X : array, shape (n_epochs, n_channels, n_times) or shape (n_channels, n_times) + X : array, shape ([n_epochs, ]n_channels, n_times) The data to be filtered over the last dimension. The channels dimension can be zero when passing a 2D array. @@ -883,6 +915,7 @@ def transform(self, X): X : array The data after filtering. """ # noqa: E501 + X = self._check_data(X, atleast_3d=False) X = np.atleast_2d(X) if X.ndim > 3: diff --git a/mne/time_frequency/multitaper.py b/mne/time_frequency/multitaper.py index 98705e838c2..f57ddd31110 100644 --- a/mne/time_frequency/multitaper.py +++ b/mne/time_frequency/multitaper.py @@ -63,7 +63,12 @@ def dpss_windows(N, half_nbw, Kmax, *, sym=True, norm=None, low_bias=True): ---------- .. 
footbibliography:: """ - dpss, eigvals = sp_dpss(N, half_nbw, Kmax, sym=sym, norm=norm, return_ratios=True) + if N <= 1: # workaround for https://github.com/scipy/scipy/pull/22344 + dpss, eigvals = np.ones((1, 1)), np.ones(1) + else: + dpss, eigvals = sp_dpss( + N, half_nbw, Kmax, sym=sym, norm=norm, return_ratios=True + ) if low_bias: idx = eigvals > 0.9 if not idx.any(): diff --git a/mne/time_frequency/tfr.py b/mne/time_frequency/tfr.py index 71dabce6d31..470dbe3ccb7 100644 --- a/mne/time_frequency/tfr.py +++ b/mne/time_frequency/tfr.py @@ -564,7 +564,8 @@ def _compute_tfr( if len(Ws[0][0]) > epoch_data.shape[2]: raise ValueError( "At least one of the wavelets is longer than the " - "signal. Use a longer signal or shorter wavelets." + f"signal ({len(Ws[0][0])} > {epoch_data.shape[2]} samples). " + "Use a longer signal or shorter wavelets." ) # Initialize output diff --git a/mne/utils/numerics.py b/mne/utils/numerics.py index eed23998774..893b0e486c7 100644 --- a/mne/utils/numerics.py +++ b/mne/utils/numerics.py @@ -35,7 +35,7 @@ check_random_state, ) from .docs import fill_doc -from .misc import _empty_hash +from .misc import _empty_hash, _pl def split_list(v, n, idx=False): @@ -479,7 +479,8 @@ def _time_mask( extra = "" if include_tmax else "when include_tmax=False " raise ValueError( f"No samples remain when using tmin={orig_tmin} and tmax={orig_tmax} " - f"{extra}(original time bounds are [{times[0]}, {times[-1]}])" + f"{extra}(original time bounds are [{times[0]}, {times[-1]}] containing " + f"{len(times)} sample{_pl(times)})" ) return mask From 88f7b6368162ee93ead483af0e9f7945424f0116 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Fri, 17 Jan 2025 15:02:33 -0500 Subject: [PATCH 4/9] FIX: More [circle full] --- mne/decoding/tests/test_base.py | 7 ++++++- mne/decoding/tests/test_time_frequency.py | 2 ++ mne/decoding/tests/test_transformer.py | 4 ++++ mne/decoding/transformer.py | 2 +- tools/vulture_allowlist.py | 2 ++ 5 files changed, 15 insertions(+), 2 
deletions(-) diff --git a/mne/decoding/tests/test_base.py b/mne/decoding/tests/test_base.py index 02ca392e590..504e309d53c 100644 --- a/mne/decoding/tests/test_base.py +++ b/mne/decoding/tests/test_base.py @@ -275,7 +275,12 @@ def test_get_coef_multiclass(n_features, n_targets): """Test get_coef on multiclass problems.""" # Check patterns with more than 1 regressor X, Y, A = _make_data(n_samples=30000, n_features=n_features, n_targets=n_targets) - lm = LinearModel(LinearRegression()).fit(X, Y) + lm = LinearModel(LinearRegression()) + assert not hasattr(lm, "model_") + lm.fit(X, Y) + # TODO: modifying non-underscored `model` is a sklearn no-no, maybe should be a + # metaestimator? + assert lm.model is lm.model_ assert_array_equal(lm.filters_.shape, lm.patterns_.shape) if n_targets == 1: want_shape = (n_features,) diff --git a/mne/decoding/tests/test_time_frequency.py b/mne/decoding/tests/test_time_frequency.py index 29ba5e7597a..638cebda21e 100644 --- a/mne/decoding/tests/test_time_frequency.py +++ b/mne/decoding/tests/test_time_frequency.py @@ -35,7 +35,9 @@ def test_timefrequency_basic(): clone(tf) # Fit + assert not hasattr(tf, "fitted_") tf.fit(X, None) + assert tf.fitted_ # Transform tf = TimeFrequency(freqs, sfreq=100) diff --git a/mne/decoding/tests/test_transformer.py b/mne/decoding/tests/test_transformer.py index 49beffef788..bcb2bef58e9 100644 --- a/mne/decoding/tests/test_transformer.py +++ b/mne/decoding/tests/test_transformer.py @@ -186,7 +186,9 @@ def test_psdestimator(): epochs_data = epochs.get_data(copy=False) psd = PSDEstimator(2 * np.pi, 0, np.inf) y = epochs.events[:, -1] + assert not hasattr(psd, "fitted_") X = psd.fit_transform(epochs_data, y) + assert psd.fitted_ assert X.shape[0] == epochs_data.shape[0] assert_array_equal(psd.fit(epochs_data, y).transform(epochs_data), X) @@ -308,7 +310,9 @@ def test_bad_triage(): filt = TemporalFilter(l_freq=8, h_freq=60, sfreq=160.0) # Used to fail with "ValueError: Effective band-stop frequency (135.0) is 
# too high (maximum based on Nyquist is 80.0)" + assert not hasattr(filt, "fitted_") filt.fit_transform(np.zeros((1, 1, 481))) + assert filt.fitted_ @pytest.mark.filterwarnings("ignore:.*filter_length.*") diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index 4d28155ab86..fe8956c1dbe 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -461,7 +461,7 @@ def fit(self, epochs_data, y=None): self.fitted_ = True # sklearn compliance return self - def transform(self, epochs_data, y=None): + def transform(self, epochs_data): """Compute power spectral density (PSD) using a multi-taper method. Parameters diff --git a/tools/vulture_allowlist.py b/tools/vulture_allowlist.py index 24bcd9af64a..9d0e215ee80 100644 --- a/tools/vulture_allowlist.py +++ b/tools/vulture_allowlist.py @@ -41,6 +41,8 @@ # Decoding _._more_tags +_.multi_class +_.preserves_dtype deep # Backward compat or rarely used From 1918de32d84a81b75b14098992271a366959ba62 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Fri, 17 Jan 2025 16:32:30 -0500 Subject: [PATCH 5/9] FIX: Argh --- examples/decoding/linear_model_patterns.py | 2 +- mne/decoding/tests/test_transformer.py | 18 ++++++++++++++---- mne/decoding/transformer.py | 7 +++++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/examples/decoding/linear_model_patterns.py b/examples/decoding/linear_model_patterns.py index c1390cbb0d3..7373c0a18b3 100644 --- a/examples/decoding/linear_model_patterns.py +++ b/examples/decoding/linear_model_patterns.py @@ -79,7 +79,7 @@ # Extract and plot spatial filters and spatial patterns for name, coef in (("patterns", model.patterns_), ("filters", model.filters_)): - # We fitted the linear model onto Z-scored data. To make the filters + # We fit the linear model on Z-scored data. 
To make the filters # interpretable, we must reverse this normalization step coef = scaler.inverse_transform([coef])[0] diff --git a/mne/decoding/tests/test_transformer.py b/mne/decoding/tests/test_transformer.py index bcb2bef58e9..a0e2a39c2e0 100644 --- a/mne/decoding/tests/test_transformer.py +++ b/mne/decoding/tests/test_transformer.py @@ -17,11 +17,14 @@ from sklearn.decomposition import PCA from sklearn.kernel_ridge import KernelRidge +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler from sklearn.utils.estimator_checks import parametrize_with_checks -from mne import Epochs, create_info, io, pick_types, read_events +from mne import Epochs, EpochsArray, create_info, io, pick_types, read_events from mne.decoding import ( FilterEstimator, + LinearModel, PSDEstimator, Scaler, TemporalFilter, @@ -218,9 +221,16 @@ def test_vectorizer(): assert_equal(vect.fit_transform(data[1:]).shape, (149, 108)) # check if raised errors are working correctly - vect.fit(np.random.rand(105, 12, 3)) - pytest.raises(ValueError, vect.transform, np.random.rand(105, 12, 3, 1)) - pytest.raises(ValueError, vect.inverse_transform, np.random.rand(102, 12, 12)) + X = np.random.default_rng(0).standard_normal((105, 12, 3)) + y = np.arange(X.shape[0]) % 2 + pytest.raises(ValueError, vect.transform, X[..., np.newaxis]) + pytest.raises(ValueError, vect.inverse_transform, X[:, :-1]) + + # And that pipelines work properly + X_arr = EpochsArray(X, create_info(12, 1000.0, "eeg")) + vect.fit(X_arr) + clf = make_pipeline(Vectorizer(), StandardScaler(), LinearModel()) + clf.fit(X_arr, y) def test_unsupervised_spatial_filter(): diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index fe8956c1dbe..56d36415d2d 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -15,6 +15,7 @@ pick_info, ) from ..cov import _check_scalings_user +from ..epochs import BaseEpochs from ..filter import filter_data from ..time_frequency import 
psd_array_multitaper from ..utils import _check_option, _validate_type, fill_doc @@ -34,6 +35,12 @@ def _check_data( multi_output=False, check_n_features=True, ): + # Sklearn calls asarray under the hood which works, but elsewhere they check for + # __len__ then look at the size of obj[0]... which is an epoch of shape (1, ...) + # rather than what they expect (shape (...)). So we explicitly get the NumPy + # array to make everyone happy. + if isinstance(epochs_data, BaseEpochs): + epochs_data = epochs_data.get_data(copy=False) kwargs = dict(dtype=np.float64, allow_nd=True, order="C", force_writeable=True) if hasattr(self, "n_features_in_") and check_n_features: if y is None: From 6b20743f1abbbb7db3b0648ac42d324a9c1fd497 Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 21 Jan 2025 13:37:50 -0500 Subject: [PATCH 6/9] FIX: More --- doc/changes/devel/13065.bugfix.rst | 8 +++++- mne/decoding/ems.py | 2 +- mne/decoding/search_light.py | 34 ++++++++++++++++++++------ mne/decoding/tests/test_transformer.py | 4 --- mne/decoding/time_frequency.py | 1 - mne/decoding/transformer.py | 20 +++------------ mne/time_frequency/multitaper.py | 4 ++- 7 files changed, 41 insertions(+), 32 deletions(-) diff --git a/doc/changes/devel/13065.bugfix.rst b/doc/changes/devel/13065.bugfix.rst index e054bcd7e9c..bbaa07ae127 100644 --- a/doc/changes/devel/13065.bugfix.rst +++ b/doc/changes/devel/13065.bugfix.rst @@ -1 +1,7 @@ -Improved sklearn class compatibility and compliance, which resulted in some parameters of classes like :class:`mne.decoding.FilterEstimator` having an underscore appended to their name (e.g., ``picks`` passed to the initializer is set as ``est.picks_`` during the ``fit`` phase so the original can be preserved in ``est.picks``) by `Eric Larson`_. 
+Improved sklearn class compatibility and compliance, which resulted in some parameters of classes having an underscore appended to their name during ``fit``, such as: + +- :class:`mne.decoding.FilterEstimator` parameter ``picks`` passed to the initializer is set as ``est.picks_`` +- :class:`mne.decoding.UnsupervisedSpatialFilter` parameter ``estimator`` passed to the initializer is set as ``est.estimator_`` + +Unused ``verbose`` class parameters (that had no effect) were removed from :class:`~mne.decoding.PSDEstimator`, :class:`~mne.decoding.TemporalFilter`, and :class:`~mne.decoding.FilterEstimator` as well. +Changes by `Eric Larson`_. diff --git a/mne/decoding/ems.py b/mne/decoding/ems.py index 4bc64499281..5c7557798ef 100644 --- a/mne/decoding/ems.py +++ b/mne/decoding/ems.py @@ -105,7 +105,7 @@ def transform(self, X): @verbose def compute_ems( - epochs, conditions=None, picks=None, n_jobs=None, cv=None, verbose=None + epochs, conditions=None, picks=None, n_jobs=None, cv=None, *, verbose=None ): """Compute event-matched spatial filter on epochs. diff --git a/mne/decoding/search_light.py b/mne/decoding/search_light.py index 8bf51a64574..8bd96781185 100644 --- a/mne/decoding/search_light.py +++ b/mne/decoding/search_light.py @@ -11,7 +11,13 @@ from sklearn.utils.validation import check_is_fitted from ..parallel import parallel_func -from ..utils import ProgressBar, _parse_verbose, array_split_idx, fill_doc +from ..utils import ( + ProgressBar, + _parse_verbose, + _verbose_safe_false, + array_split_idx, + fill_doc, +) from .base import _check_estimator from .transformer import MNETransformerMixin @@ -39,7 +45,6 @@ class SlidingEstimator(MetaEstimatorMixin, MNETransformerMixin, BaseEstimator): List of fitted scikit-learn estimators (one per task). 
""" - @fill_doc # sklearn compliance def __init__( self, base_estimator, @@ -105,7 +110,10 @@ def fit(self, X, y, **fit_params): _check_estimator(self.base_estimator) X, _ = self._check_Xy(X, y, fit=True) parallel, p_func, n_jobs = parallel_func( - _sl_fit, self.n_jobs, max_jobs=X.shape[-1], verbose=False + _sl_fit, + self.n_jobs, + max_jobs=X.shape[-1], + verbose=_verbose_safe_false(), ) self.estimators_ = list() self.fit_params_ = fit_params @@ -163,7 +171,10 @@ def _transform(self, X, method): # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. parallel, p_func, n_jobs = parallel_func( - _sl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False + _sl_transform, + self.n_jobs, + max_jobs=X.shape[-1], + verbose=_verbose_safe_false(), ) X_splits = np.array_split(X, n_jobs, axis=-1) @@ -321,7 +332,10 @@ def score(self, X, y): # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. parallel, p_func, n_jobs = parallel_func( - _sl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False + _sl_score, + self.n_jobs, + max_jobs=X.shape[-1], + verbose=_verbose_safe_false(), ) X_splits = np.array_split(X, n_jobs, axis=-1) est_splits = np.array_split(self.estimators_, n_jobs) @@ -493,7 +507,10 @@ def _transform(self, X, method): method = _check_method(self.base_estimator, method) parallel, p_func, n_jobs = parallel_func( - _gl_transform, self.n_jobs, max_jobs=X.shape[-1], verbose=False + _gl_transform, + self.n_jobs, + max_jobs=X.shape[-1], + verbose=_verbose_safe_false(), ) context = _create_progressbar_context(self, X, "Transforming") @@ -618,7 +635,10 @@ def score(self, X, y): # For predictions/transforms the parallelization is across the data and # not across the estimators to avoid memory load. 
parallel, p_func, n_jobs = parallel_func( - _gl_score, self.n_jobs, max_jobs=X.shape[-1], verbose=False + _gl_score, + self.n_jobs, + max_jobs=X.shape[-1], + verbose=_verbose_safe_false(), ) scoring = check_scoring(self.base_estimator, self.scoring) y = _fix_auc(scoring, y) diff --git a/mne/decoding/tests/test_transformer.py b/mne/decoding/tests/test_transformer.py index a0e2a39c2e0..a8afe209d96 100644 --- a/mne/decoding/tests/test_transformer.py +++ b/mne/decoding/tests/test_transformer.py @@ -345,10 +345,6 @@ def test_sklearn_compliance(estimator, check): "check_estimators_overwrite_params", # we modify self.info "check_methods_sample_order_invariance", ] - if estimator.__class__.__name__ == "UnsupervisedSpatialFilter": - ignores += [ - "check_estimators_overwrite_params", # we modify self.estimator - ] if estimator.__class__.__name__.startswith(("PSD", "Temporal")): ignores += [ "check_transformers_unfitted", # allow unfitted transform diff --git a/mne/decoding/time_frequency.py b/mne/decoding/time_frequency.py index 5db2580498a..29232aaeb9f 100644 --- a/mne/decoding/time_frequency.py +++ b/mne/decoding/time_frequency.py @@ -61,7 +61,6 @@ class TimeFrequency(MNETransformerMixin, BaseEstimator): mne.time_frequency.tfr_multitaper """ - @fill_doc # sklearn compliance def __init__( self, freqs, diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index 56d36415d2d..7d989150776 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -3,7 +3,7 @@ # Copyright the MNE-Python contributors. 
import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin, check_array +from sklearn.base import BaseEstimator, TransformerMixin, check_array, clone from sklearn.preprocessing import RobustScaler, StandardScaler from sklearn.utils import check_X_y from sklearn.utils.validation import check_is_fitted, validate_data @@ -415,7 +415,6 @@ class PSDEstimator(MNETransformerMixin, BaseEstimator): n_jobs : int Number of parallel jobs to use (only used if adaptive=True). %(normalization)s - %(verbose)s See Also -------- @@ -425,7 +424,6 @@ class PSDEstimator(MNETransformerMixin, BaseEstimator): mne.Evoked.compute_psd """ - @fill_doc # NB intentionally don't use verbose here so it doesn't modify the val def __init__( self, sfreq=2 * np.pi, @@ -436,8 +434,6 @@ def __init__( low_bias=True, n_jobs=None, normalization="length", - *, - verbose=None, ): self.sfreq = sfreq self.fmin = fmin @@ -447,7 +443,6 @@ def __init__( self.low_bias = low_bias self.n_jobs = n_jobs self.normalization = normalization - self.verbose = verbose def fit(self, epochs_data, y=None): """Compute power spectral density (PSD) using a multi-taper method. @@ -533,7 +528,6 @@ class FilterEstimator(MNETransformerMixin, BaseEstimator): See mne.filter.construct_iir_filter for details. If iir_params is None and method="iir", 4th order Butterworth will be used. %(fir_design)s - %(verbose)s See Also -------- @@ -559,8 +553,6 @@ def __init__( method="fir", iir_params=None, fir_design="firwin", - *, - verbose=None, ): self.info = info self.l_freq = l_freq @@ -573,7 +565,6 @@ def __init__( self.method = method self.iir_params = iir_params self.fir_design = fir_design - self.verbose = verbose def fit(self, epochs_data, y): """Filter data. 
@@ -696,8 +687,8 @@ def fit(self, X, y=None): # Really shouldn't modify self.estimator, but it's a bad backward compat problem # to change it (will break people's pipelines) so leave it for now and just # wrap to self.estimator_ as well - self.estimator.fit(X) - self.estimator_ = self.estimator + self.estimator_ = clone(self.estimator) + self.estimator_.fit(X) return self def fit_transform(self, X, y=None): @@ -849,7 +840,6 @@ class TemporalFilter(MNETransformerMixin, BaseEstimator): attenuation using fewer samples than "firwin2". .. versionadded:: 0.15 - %(verbose)s See Also -------- @@ -858,7 +848,6 @@ class TemporalFilter(MNETransformerMixin, BaseEstimator): mne.filter.filter_data """ - @fill_doc # sklearn compliance def __init__( self, l_freq=None, @@ -872,8 +861,6 @@ def __init__( iir_params=None, fir_window="hamming", fir_design="firwin", - *, - verbose=None, ): self.l_freq = l_freq self.h_freq = h_freq @@ -886,7 +873,6 @@ def __init__( self.iir_params = iir_params self.fir_window = fir_window self.fir_design = fir_design - self.verbose = verbose def fit(self, X, y=None): """Do nothing (for scikit-learn compatibility purposes). diff --git a/mne/time_frequency/multitaper.py b/mne/time_frequency/multitaper.py index f57ddd31110..1c1a3baf238 100644 --- a/mne/time_frequency/multitaper.py +++ b/mne/time_frequency/multitaper.py @@ -63,7 +63,9 @@ def dpss_windows(N, half_nbw, Kmax, *, sym=True, norm=None, low_bias=True): ---------- .. 
footbibliography:: """ - if N <= 1: # workaround for https://github.com/scipy/scipy/pull/22344 + # TODO VERSION can be removed when SciPy 1.16 is min, + # workaround for https://github.com/scipy/scipy/pull/22344 + if N <= 1: dpss, eigvals = np.ones((1, 1)), np.ones(1) else: dpss, eigvals = sp_dpss( From ca9c2077b0cef60d423ad16db20b99d05a72430e Mon Sep 17 00:00:00 2001 From: Eric Larson Date: Tue, 21 Jan 2025 14:45:12 -0500 Subject: [PATCH 7/9] FIX: Checks --- mne/decoding/tests/test_csp.py | 2 +- tools/github_actions_env_vars.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mne/decoding/tests/test_csp.py b/mne/decoding/tests/test_csp.py index d6374304d28..e754b6952f9 100644 --- a/mne/decoding/tests/test_csp.py +++ b/mne/decoding/tests/test_csp.py @@ -351,7 +351,7 @@ def test_regularized_csp(ch_type, rank, reg): # test init exception pytest.raises(ValueError, csp.fit, epochs_data, np.zeros_like(epochs.events)) pytest.raises(ValueError, csp.fit, "foo", y) - pytest.raises(ValueError, csp.transform, epochs) + pytest.raises(ValueError, csp.transform, "foo") csp.n_components = n_components sources = csp.transform(epochs_data) diff --git a/tools/github_actions_env_vars.sh b/tools/github_actions_env_vars.sh index 8accf72a11a..9f424ae5f48 100755 --- a/tools/github_actions_env_vars.sh +++ b/tools/github_actions_env_vars.sh @@ -28,7 +28,7 @@ else # conda-like echo "MNE_LOGGING_LEVEL=warning" | tee -a $GITHUB_ENV echo "MNE_QT_BACKEND=PySide6" | tee -a $GITHUB_ENV # TODO: Also need "|unreliable on GitHub Actions conda" on macOS, but omit for now to make sure the failure actually shows up - echo "MNE_TEST_ALLOW_SKIP=.*(Requires (spm|brainstorm) dataset|CUDA not|PySide6 causes segfaults).*" | tee -a $GITHUB_ENV + echo "MNE_TEST_ALLOW_SKIP=.*(Requires (spm|brainstorm) dataset|CUDA not|PySide6 causes segfaults|Accelerate|Flakey verbose behavior).*" | tee -a $GITHUB_ENV fi fi set +x From e73983d041a49e72277abf66b6036b3091bdf6c2 Mon Sep 17 00:00:00 2001 
From: Eric Larson Date: Tue, 21 Jan 2025 16:14:12 -0500 Subject: [PATCH 8/9] FIX: Val [ci skip] --- mne/decoding/tests/test_ssd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mne/decoding/tests/test_ssd.py b/mne/decoding/tests/test_ssd.py index d1ac793e65b..b6cdfc472c3 100644 --- a/mne/decoding/tests/test_ssd.py +++ b/mne/decoding/tests/test_ssd.py @@ -486,6 +486,7 @@ def test_non_full_rank_data(): ssd.fit(X) +@pytest.mark.filterwarnings("ignore:.*invalid value encountered in divide.*") @pytest.mark.filterwarnings("ignore:.*is longer than.*") @parametrize_with_checks( [ From c7b95df32c25cc4b75d6c9df1cbf57f61154f904 Mon Sep 17 00:00:00 2001 From: Daniel McCloy Date: Wed, 22 Jan 2025 11:22:13 -0600 Subject: [PATCH 9/9] Update mne/decoding/transformer.py [ci skip] --- mne/decoding/transformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mne/decoding/transformer.py b/mne/decoding/transformer.py index 7d989150776..6d0c83f42ab 100644 --- a/mne/decoding/transformer.py +++ b/mne/decoding/transformer.py @@ -684,9 +684,6 @@ def fit(self, X, y=None): # trial as time samples X = np.transpose(X, (1, 0, 2)).reshape((n_channels, n_epochs * n_times)).T - # Really shouldn't modify self.estimator, but it's a bad backward compat problem - # to change it (will break people's pipelines) so leave it for now and just - # wrap to self.estimator_ as well self.estimator_ = clone(self.estimator) self.estimator_.fit(X) return self