predefined.py: move Simple,CN2,IREP,Ripper to separate module

azrdev · Aug 8, 2019 · 2df453a · 2df453a
1 parent 2717572
commit 2df453a
Show file tree

Hide file tree

Showing 9 changed files with 150 additions and 136 deletions.
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ under supervision of Johannes Fürnkranz.
 For current test suite results, check
 [Continuous Integration](https://travis-ci.com/azrdev/sklearn-seco).
 
-To run a comparison of `sklearn_seco.concrete.RipperEstimator` with
+To run a comparison of `sklearn_seco.RipperEstimator` with
 [weka.JRip](http://weka.sourceforge.net/doc.stable/weka/classifiers/rules/JRip.html),
 [weka.J48](http://weka.sourceforge.net/doc.stable/weka/classifiers/trees/J48.html), and
 [sklearn.dtree](https://scikit-learn.org/stable/modules/tree.html)
@@ -69,7 +69,7 @@ dependency sets "numba" and "tests".
   [the Orange implementation](https://orange3.readthedocs.io/projects/orange-visual-programming/widgets/model/cn2ruleinduction.html),
   but should be complete
 
-- Ripper misses the original class binarization strategy and the global post-optimization,
+- Ripper lacks the original class binarization strategy and the global post-optimization,
   therefore results are not identical to JRip (the only other freely available implementation).
 
 - various TODOs throughout the code mark missing details

diff --git a/evaluation.py b/evaluation.py
@@ -23,7 +23,7 @@
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import Bunch
 
-from sklearn_seco.concrete import RipperEstimator
+from sklearn_seco import RipperEstimator
 
 CACHE_DIR = 'openml_cache/'
 RESULT_DIR = 'evaluation/'

diff --git a/seco_line_profiling.py b/seco_line_profiling.py
@@ -2,9 +2,9 @@
 
 from line_profiler import LineProfiler
 
-from sklearn_seco.tests import conftest
+from sklearn_seco import SimpleSeCoEstimator
 from sklearn_seco.common import match_rule, RuleContext
-from sklearn_seco.concrete import SimpleSeCoEstimator
+from sklearn_seco.tests import conftest
 
 
 def tcn2(dataset):

diff --git a/seco_runtime_scaling.py b/seco_runtime_scaling.py
@@ -12,7 +12,7 @@
 
 import numpy as np
 
-import sklearn_seco.concrete
+import sklearn_seco
 
 
 def time_seco(estimator: str, dataset_args: str) -> Optional[Sequence[float]]:
@@ -107,7 +107,7 @@ def log(message):
 if __name__ == "__main__":
     categorical = 'c' in sys.argv[1:]
 
-    estimator = sklearn_seco.concrete.RipperEstimator.__name__
+    estimator = sklearn_seco.RipperEstimator.__name__
     log("start timing of %s" % estimator)
     print("n_samples, n_features, timings...")
     all_timings = []

diff --git a/sklearn_seco/__init__.py b/sklearn_seco/__init__.py
@@ -22,4 +22,10 @@
 - classification only, no regression
 """
 
-__all__ = ['abstract', 'common', 'concrete', 'extra', 'tests', 'util']
+from sklearn_seco.predefined import \
+    SimpleSeCoEstimator, CN2Estimator, IrepEstimator, RipperEstimator
+
+__all__ = [
+    'abstract', 'common', 'concrete', 'extra', 'tests', 'util',
+    'SimpleSeCoEstimator', 'CN2Estimator', 'IrepEstimator', 'RipperEstimator',
+]
diff --git a/sklearn_seco/concrete.py b/sklearn_seco/concrete.py
@@ -1,11 +1,11 @@
 """
 Implementation of SeCo / Covering algorithm:
-Usual building blocks & known instantiations of the abstract base algorithm.
+Usual building blocks of SeCo algorithms, implemented as Mixins.
 
-Implemented as Mixins. For __init__ parameters, they use cooperative
-multi-inheritance, each class has to declare **kwargs and forward anything it
-doesn't consume using `super().__init__(**kwargs)`. Users of mixin-composed
-classes will have to use keyword- instead of positional arguments.
+For __init__ parameters, they use cooperative multi-inheritance: Each class has
+to declare **kwargs and forward anything it doesn't consume using
+`super().__init__(**kwargs)`. Creators of mixin-composed classes will have to
+use keyword- instead of positional arguments.
 """
 
 import functools
@@ -17,10 +17,9 @@
 import numpy as np
 from scipy.special import xlogy
 
-from sklearn_seco.abstract import \
-    Theory, SeCoEstimator
+from sklearn_seco.abstract import Theory
 from sklearn_seco.common import \
-    Rule, RuleQueue, AugmentedRule, T, TGT, SeCoAlgorithmConfiguration, \
+    Rule, RuleQueue, AugmentedRule, T, TGT, \
     AbstractSecoImplementation, RuleContext, TheoryContext
 from sklearn_seco.ripper_mdl import \
     data_description_length, relative_description_length
@@ -68,9 +67,6 @@ def grow_prune_split(y,
     return grow, prune
 
 
-# Implementation facets
-
-
 class BeamSearch(AbstractSecoImplementation):
     """Mixin implementing a beam search of width `n`.
 
@@ -663,118 +659,5 @@ def _num_unique_values(self, feature: int):
         return len(unique_values)
 
 
-# Example Algorithm configurations
-
-
-class SimpleSeCoEstimator(SeCoEstimator):
-    class algorithm_config(SeCoAlgorithmConfiguration):
-        RuleContextClass = TopDownSearchContext
-
-        class Implementation(BeamSearch,
-                             TopDownSearchImplementation,
-                             PurityHeuristic,
-                             NoNegativesStop,
-                             SkipPostPruning,
-                             CoverageRuleStop,
-                             SkipPostProcess):
-            pass
-
-
-class CN2Estimator(SeCoEstimator):
-    """CN2 as refined by (Clark and Boswell 1991)."""
-
-    class algorithm_config(SeCoAlgorithmConfiguration):
-        RuleContextClass = TopDownSearchContext
-
-        class Implementation(BeamSearch,
-                             TopDownSearchImplementation,
-                             LaplaceHeuristic,
-                             SignificanceStoppingCriterion,
-                             SkipPostPruning,
-                             PositiveThresholdRuleStop,
-                             SkipPostProcess):
-            positive_coverage_stop_threshold = 1  # → PositiveThresholdRuleStop
-            beam_width = 3  # → BeamSearch
-
-
-class RipperEstimator(SeCoEstimator):
-    """Ripper as defined by (Cohen 1995).
-
-    NOTE: The global post-optimization phase is currently not implemented
-        (that would be the `post_process` method).
-    """
-    class algorithm_config(SeCoAlgorithmConfiguration):
-        RuleClass = ConditionTracingAugmentedRule
-        TheoryContextClass = RipperMdlRuleStopTheoryContext
-
-        class Implementation(BeamSearch,
-                             TopDownSearchImplementation,
-                             InformationGainHeuristic,
-                             RipperMdlRuleStopImplementation,
-                             RipperPostPruning,
-                             SkipPostProcess
-                             ):
-            @classmethod
-            @delayed_inner_stop
-            def inner_stopping_criterion(cls, rule: AugmentedRule,
-                                         context: RuleContext) -> bool:
-                """Laplace-based criterion. Field `accuRate` in JRip.java."""
-                p, n = rule.pn(context)
-                accuracy_rate = (p + 1) / (p + n + 1)
-                return accuracy_rate >= 1
-
-        class RuleContextClass(TopDownSearchContext,
-                               GrowPruneSplitRuleContext):
-            def pruning_heuristic(self, rule: AugmentedRule,
-                                  context: RuleContext
-                                  ) -> float:
-                """Laplace heuristic, as defined by (Clark and Boswell 1991).
-
-                JRip documentation states:
-                "The pruning metric is (p-n)/(p+n) -- but it's actually
-                2p/(p+n) -1, so in this implementation we simply use p/(p+n)
-                (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5)."
-                """
-                p, n = rule.pn(context)
-                return (p + 1) / (p + n + 2)  # laplace
-
-
-class IrepEstimator(SeCoEstimator):
-    """IREP as defined by (Cohen 1995), originally by (Fürnkranz, Widmer 1994).
-    """
-
-    class algorithm_config(SeCoAlgorithmConfiguration):
-        RuleClass = ConditionTracingAugmentedRule
-
-        class Implementation(BeamSearch,
-                             TopDownSearchImplementation,
-                             InformationGainHeuristic,
-                             NoNegativesStop,
-                             RipperPostPruning,
-                             CoverageRuleStop,
-                             SkipPostProcess):
-
-            @classmethod
-            def rule_stopping_criterion(cls, theory: Theory,
-                                        rule: AugmentedRule,
-                                        context: RuleContext) -> bool:
-                assert isinstance(context, GrowPruneSplitRuleContext)
-                context.growing = False
-                return super().rule_stopping_criterion(theory, rule, context)
-
-        class RuleContextClass(TopDownSearchContext,
-                               GrowPruneSplitRuleContext):
-            def pruning_heuristic(self, rule: AugmentedRule,
-                                  context: GrowPruneSplitRuleContext) -> float:
-                """:return: (#true positives + #true negatives) / #examples"""
-                context.growing = False
-                p, n = rule.pn(context)
-                P, N = context.PN(rule.head)
-                if P + N == 0:
-                    return 0
-                tn = N - n
-                return (p + tn) / (P + N)
-
-
 # TODO: sklearn.get/set_param setting *Implementation fields?
 # TODO: allow defining heuristics/metrics (and stop criteria?) as functions and pulling them in as growing_/pruning_heuristic etc without defining an extra class
diff --git a/sklearn_seco/predefined.py b/sklearn_seco/predefined.py
@@ -0,0 +1,125 @@
+"""
+Implementation of SeCo / Covering algorithm:
+Known instantiations / example configurations of the abstract base algorithm.
+"""
+
+from sklearn_seco.abstract import SeCoEstimator
+from sklearn_seco.common import \
+    SeCoAlgorithmConfiguration, AugmentedRule, RuleContext, Theory
+from sklearn_seco.concrete import TopDownSearchContext, BeamSearch, \
+    TopDownSearchImplementation, PurityHeuristic, NoNegativesStop, \
+    SkipPostPruning, CoverageRuleStop, SkipPostProcess, LaplaceHeuristic, \
+    SignificanceStoppingCriterion, PositiveThresholdRuleStop, \
+    ConditionTracingAugmentedRule, RipperMdlRuleStopTheoryContext, \
+    InformationGainHeuristic, RipperMdlRuleStopImplementation, \
+    RipperPostPruning, delayed_inner_stop, GrowPruneSplitRuleContext
+
+
+class SimpleSeCoEstimator(SeCoEstimator):
+    class algorithm_config(SeCoAlgorithmConfiguration):
+        RuleContextClass = TopDownSearchContext
+
+        class Implementation(BeamSearch,
+                             TopDownSearchImplementation,
+                             PurityHeuristic,
+                             NoNegativesStop,
+                             SkipPostPruning,
+                             CoverageRuleStop,
+                             SkipPostProcess):
+            pass
+
+
+class CN2Estimator(SeCoEstimator):
+    """CN2 as refined by (Clark and Boswell 1991)."""
+
+    class algorithm_config(SeCoAlgorithmConfiguration):
+        RuleContextClass = TopDownSearchContext
+
+        class Implementation(BeamSearch,
+                             TopDownSearchImplementation,
+                             LaplaceHeuristic,
+                             SignificanceStoppingCriterion,
+                             SkipPostPruning,
+                             PositiveThresholdRuleStop,
+                             SkipPostProcess):
+            positive_coverage_stop_threshold = 1  # → PositiveThresholdRuleStop
+            beam_width = 3  # → BeamSearch
+
+
+class RipperEstimator(SeCoEstimator):
+    """Ripper as defined by (Cohen 1995).
+
+    NOTE: The global post-optimization phase is currently not implemented
+        (that would be the `post_process` method).
+    """
+    class algorithm_config(SeCoAlgorithmConfiguration):
+        RuleClass = ConditionTracingAugmentedRule
+        TheoryContextClass = RipperMdlRuleStopTheoryContext
+
+        class Implementation(BeamSearch,
+                             TopDownSearchImplementation,
+                             InformationGainHeuristic,
+                             RipperMdlRuleStopImplementation,
+                             RipperPostPruning,
+                             SkipPostProcess
+                             ):
+            @classmethod
+            @delayed_inner_stop
+            def inner_stopping_criterion(cls, rule: AugmentedRule,
+                                         context: RuleContext) -> bool:
+                """Laplace-based criterion. Field `accuRate` in JRip.java."""
+                p, n = rule.pn(context)
+                accuracy_rate = (p + 1) / (p + n + 1)
+                return accuracy_rate >= 1
+
+        class RuleContextClass(TopDownSearchContext,
+                               GrowPruneSplitRuleContext):
+            def pruning_heuristic(self, rule: AugmentedRule,
+                                  context: RuleContext
+                                  ) -> float:
+                """Laplace heuristic, as defined by (Clark and Boswell 1991).
+
+                JRip documentation states:
+                "The pruning metric is (p-n)/(p+n) -- but it's actually
+                2p/(p+n) -1, so in this implementation we simply use p/(p+n)
+                (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5)."
+                """
+                p, n = rule.pn(context)
+                return (p + 1) / (p + n + 2)  # laplace
+
+
+class IrepEstimator(SeCoEstimator):
+    """IREP as defined by (Cohen 1995), originally by (Fürnkranz, Widmer 1994).
+    """
+
+    class algorithm_config(SeCoAlgorithmConfiguration):
+        RuleClass = ConditionTracingAugmentedRule
+
+        class Implementation(BeamSearch,
+                             TopDownSearchImplementation,
+                             InformationGainHeuristic,
+                             NoNegativesStop,
+                             RipperPostPruning,
+                             CoverageRuleStop,
+                             SkipPostProcess):
+
+            @classmethod
+            def rule_stopping_criterion(cls, theory: Theory,
+                                        rule: AugmentedRule,
+                                        context: RuleContext) -> bool:
+                assert isinstance(context, GrowPruneSplitRuleContext)
+                context.growing = False
+                return super().rule_stopping_criterion(theory, rule, context)
+
+        class RuleContextClass(TopDownSearchContext,
+                               GrowPruneSplitRuleContext):
+            def pruning_heuristic(self, rule: AugmentedRule,
+                                  context: GrowPruneSplitRuleContext) -> float:
+                """:return: (#true positives + #true negatives) / #examples"""
+                context.growing = False
+                p, n = rule.pn(context)
+                P, N = context.PN(rule.head)
+                if P + N == 0:
+                    return 0
+                tn = N - n
+                return (p + tn) / (P + N)
diff --git a/sklearn_seco/tests/conftest.py b/sklearn_seco/tests/conftest.py
@@ -8,10 +8,10 @@
 from sklearn.utils import check_random_state
 from sklearn.utils.testing import set_random_state
 
+from sklearn_seco import \
+    SimpleSeCoEstimator, CN2Estimator, RipperEstimator, IrepEstimator
 from sklearn_seco.abstract import SeCoEstimator
 from sklearn_seco.common import Theory
-from sklearn_seco.concrete import \
-    SimpleSeCoEstimator, CN2Estimator, IrepEstimator, RipperEstimator
 
 from .datasets import Dataset, \
     binary_slight_overlap, xor_2d, checkerboard_2d, \

diff --git a/sklearn_seco/tests/test_concrete.py b/sklearn_seco/tests/test_concrete.py
@@ -12,8 +12,8 @@
 
 from sklearn_seco.abstract import _BaseSeCoEstimator
 from sklearn_seco.common import Rule
-from sklearn_seco.concrete import grow_prune_split, SimpleSeCoEstimator, \
-    TopDownSearchImplementation
+from sklearn_seco.concrete import grow_prune_split, TopDownSearchImplementation
+from sklearn_seco.predefined import SimpleSeCoEstimator
 from sklearn_seco.util import TargetTransformingMetaEstimator, \
     BySizeLabelEncoder
 from .conftest import count_conditions