diff --git a/aeon/transformations/collection/dictionary_based/_sfa.py b/aeon/transformations/collection/dictionary_based/_sfa.py
index a6ffb8b0d7..543052a201 100644
--- a/aeon/transformations/collection/dictionary_based/_sfa.py
+++ b/aeon/transformations/collection/dictionary_based/_sfa.py
@@ -20,7 +20,6 @@
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 
 from aeon.transformations.collection import BaseCollectionTransformer
-from aeon.utils.validation.panel import check_X
 
 # The binning methods to use: equi-depth, equi-width, information gain or kmeans
 binning_methods = {
@@ -90,6 +89,11 @@ class SFA(BaseCollectionTransformer):
     Attributes
     ----------
     words: []
+        words is a list of arrays of integers, one for each case. Each array
+        has length ``series_length - window_size + 1``. Each integer is a bit
+        representation of a word. For example, if ``word_length=6`` and
+        ``alphabet_size=4``, the integer 3235 is the bit string 11 00 10 10 00 11,
+        representing the word daccad.
     breakpoints: = []
     num_insts = 0
     num_atts = 0
@@ -237,8 +241,6 @@ def _fit(self, X, y=None):
                     "Typed Dictionaries can only handle 15 levels "
                     "(this is way to many anyway)."
                 )
-
-        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
         X = X.squeeze(1)
 
         if self.levels > 1:
diff --git a/aeon/transformations/collection/dictionary_based/_sfa_fast.py b/aeon/transformations/collection/dictionary_based/_sfa_fast.py
index 95fcad6330..c9ca494c7e 100644
--- a/aeon/transformations/collection/dictionary_based/_sfa_fast.py
+++ b/aeon/transformations/collection/dictionary_based/_sfa_fast.py
@@ -29,7 +29,6 @@
 from sklearn.utils import check_random_state
 
 from aeon.transformations.collection import BaseCollectionTransformer
-from aeon.utils.validation.panel import check_X
 
 # The binning methods to use: equi-depth, equi-width, information gain or kmeans
 binning_methods = {
@@ -78,55 +77,45 @@ class SFAFast(BaseCollectionTransformer):
         If True, the Fourier coefficient selection is done via the largest
         variance. If False, the first Fourier coefficients are selected.
         Only applicable if labels are given.
-    dilation : int, default = 0
-        When set to dilation > 1, adds dilation to the sliding window operation.
-
-    save_words: boolean, default = False
-        whether to save the words generated for each series (default False)
-
-    bigrams: boolean, default = False
-        whether to create bigrams of SFA words.
-
-    feature_selection: {"chi2", "chi2_top_k", "none", "random"}, default: none
-        Sets the feature selections strategy to be used. Large amounts of memory
-        may be needed depending on the setting of bigrams (true is more) or
-        alpha (larger is more).
-        'chi2' reduces the number of words, keeping those above the 'p_threshold'.
-        'chi2_top_k' reduces the number of words to at most 'max_feature_count',
-        dropping values based on p-value.
-        'random' reduces the number to at most 'max_feature_count',
-        by randomly selecting features.
-        'none' does not apply any feature selection and yields large bag of words,
-
-    p_threshold: int, default=0.05 (disabled by default)
-        If feature_selection=chi2 is chosen, feature selection is applied based on
-        the chi-squared test. This is the p-value threshold to use for chi-squared
-        test on bag-of-words (lower means more strict). 1 indicates that the test
-        should not be performed.
-
-    max_feature_count: int, default=256
-        If feature_selection=random is chosen, this parameter defines the number of
-        randomly chosen unique words used.
-
-    skip_grams: boolean, default = False
-        whether to create skip-grams of SFA words
-
-    remove_repeat_words: boolean, default = False
-        whether to use numerosity reduction (default False)
-
-    return_sparse: boolean, default=True
-        if set to true, a scipy sparse matrix will be returned as BOP model.
-        If set to false a dense array will be returned as BOP model. Sparse
-        arrays are much more compact.
-
-    n_jobs: int, optional, default = 1
-        The number of jobs to run in parallel for both `transform`.
-        ``-1`` means using all processors.
-
-    return_pandas_data_series: boolean, default = False
-        set to true to return Pandas Series as a result of transform.
-        setting to true reduces speed significantly but is required for
-        automatic test.
+    dilation : int, default = 0
+        When set to dilation > 1, adds dilation to the sliding window operation.
+    save_words : boolean, default = False
+        Whether to save the words generated for each series.
+    bigrams : boolean, default = False
+        Whether to create bigrams of SFA words.
+    feature_selection : {"chi2", "chi2_top_k", "none", "random"}, default: none
+        Sets the feature selection strategy to be used. Large amounts of memory
+        may be needed depending on the setting of bigrams (true is more) or
+        alpha (larger is more).
+        'chi2' reduces the number of words, keeping those above the 'p_threshold'.
+        'chi2_top_k' reduces the number of words to at most 'max_feature_count',
+        dropping values based on p-value.
+        'random' reduces the number to at most 'max_feature_count',
+        by randomly selecting features.
+        'none' does not apply any feature selection and yields a large bag of words.
+    p_threshold : float, default=0.05 (disabled by default)
+        If feature_selection=chi2 is chosen, feature selection is applied based on
+        the chi-squared test. This is the p-value threshold to use for the
+        chi-squared test on bag-of-words (lower means more strict). 1 indicates
+        that the test should not be performed.
+    max_feature_count : int, default=256
+        If feature_selection=random is chosen, this parameter defines the number of
+        randomly chosen unique words used.
+    skip_grams : boolean, default = False
+        Whether to create skip-grams of SFA words.
+    remove_repeat_words : boolean, default = False
+        Whether to use numerosity reduction.
+    return_sparse : boolean, default=True
+        If set to true, a scipy sparse matrix will be returned as BOP model.
+        If set to false, a dense array will be returned as BOP model. Sparse
+        arrays are much more compact.
+    n_jobs : int, default = 1
+        The number of jobs to run in parallel for `transform`.
+        ``-1`` means using all processors.
+    return_pandas_data_series : boolean, default = False
+        Set to true to return Pandas Series as a result of transform.
+        Setting to true reduces speed significantly but is required for
+        automatic tests.
 
     Attributes
     ----------
@@ -258,8 +247,6 @@ def _fit_transform(self, X, y=None):
         self.support = np.arange(self.word_length_actual)
         self.letter_bits = np.uint32(math.ceil(math.log2(self.alphabet_size)))
         # self.word_bits = self.word_length_actual * self.letter_bits
-
-        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
         X = X.squeeze(1)
 
         if self.dilation >= 1 or self.first_difference:
@@ -303,7 +290,7 @@ def _fit(self, X, y=None):
 
         Parameters
         ----------
-        X : pandas DataFrame or 3d numpy array, input time series.
+        X : 3d numpy array, input time series.
         y : array_like, target values (optional, ignored).
 
         Returns
@@ -319,14 +306,12 @@ def _transform(self, X, y=None):
 
         Parameters
         ----------
-        X : pandas DataFrame or 3d numpy array, input time series.
-        y : array_like, target values (optional, ignored).
+        X : 3d numpy array, input time series.
 
         Returns
         -------
         List of dictionaries containing SFA words
         """
-        X = check_X(X, enforce_univariate=True, coerce_to_numpy=True)
         X = X.squeeze(1)
 
         if self.dilation >= 1 or self.first_difference:
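
Note (not part of the patch): the new ``words`` docstring added to ``_sfa.py`` above describes how each SFA word is packed into a single integer, two bits per letter when ``alphabet_size=4``. The snippet below is a minimal illustrative sketch of that packing; ``decode_word`` is a hypothetical helper written for this note, not an aeon function, and it assumes letters map to "abcd" in breakpoint order.

import math

def decode_word(word, word_length=6, alphabet_size=4, alphabet="abcd"):
    # Bits per letter: ceil(log2(alphabet_size)), i.e. 2 bits for a 4-letter alphabet.
    letter_bits = int(math.ceil(math.log2(alphabet_size)))
    mask = (1 << letter_bits) - 1
    # Read letters from the most significant group of bits to the least significant.
    letters = []
    for i in reversed(range(word_length)):
        letters.append(alphabet[(word >> (i * letter_bits)) & mask])
    return "".join(letters)

print(decode_word(3235))  # 3235 == 0b 11 00 10 10 00 11 -> "daccad"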
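Also not part of the patch: with ``check_X`` removed, the updated docstrings say ``SFAFast`` takes a 3d numpy array directly. A rough usage sketch under that assumption follows; the import path matches this diff's module layout and the parameter values are purely illustrative.

import numpy as np

from aeon.transformations.collection.dictionary_based import SFAFast

# Univariate collection of shape (n_cases, n_channels=1, series_length).
X = np.random.default_rng(0).normal(size=(10, 1, 50))
y = np.array([0, 1] * 5)  # labels enable chi2 feature selection

sfa = SFAFast(
    word_length=6,
    alphabet_size=4,
    window_size=10,
    feature_selection="chi2",
    return_sparse=True,
)
bag = sfa.fit_transform(X, y)  # sparse bag-of-words matrix, one row per case
print(bag.shape)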