Skip to content

Commit

Permalink
predefined.py: move Simple,CN2,IREP,Ripper to separate module
Browse files Browse the repository at this point in the history
  • Loading branch information
azrdev committed Aug 8, 2019
1 parent 2717572 commit 2df453a
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 136 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ under supervision of Johannes Fürnkranz.
For current test suite results, check
[Continuous Integration](https://travis-ci.com/azrdev/sklearn-seco).

To run a comparison of `sklearn_seco.concrete.RipperEstimator` with
To run a comparison of `sklearn_seco.RipperEstimator` with
[weka.JRip](http://weka.sourceforge.net/doc.stable/weka/classifiers/rules/JRip.html),
[weka.J48](http://weka.sourceforge.net/doc.stable/weka/classifiers/trees/J48.html), and
[sklearn.dtree](https://scikit-learn.org/stable/modules/tree.html)
Expand Down Expand Up @@ -69,7 +69,7 @@ dependency sets "numba" and "tests".
[the Orange implementation](https://orange3.readthedocs.io/projects/orange-visual-programming/widgets/model/cn2ruleinduction.html),
but should be complete
- Ripper misses the original class binarization strategy and the global post-optimization,
- Ripper lacks the original class binarization strategy and the global post-optimization;
therefore, results are not identical to JRip (the only other freely available implementation).
- various TODOs throughout the code mark missing details
Expand Down
2 changes: 1 addition & 1 deletion evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import Bunch

from sklearn_seco.concrete import RipperEstimator
from sklearn_seco import RipperEstimator

CACHE_DIR = 'openml_cache/'
RESULT_DIR = 'evaluation/'
Expand Down
4 changes: 2 additions & 2 deletions seco_line_profiling.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from line_profiler import LineProfiler

from sklearn_seco.tests import conftest
from sklearn_seco import SimpleSeCoEstimator
from sklearn_seco.common import match_rule, RuleContext
from sklearn_seco.concrete import SimpleSeCoEstimator
from sklearn_seco.tests import conftest


def tcn2(dataset):
Expand Down
4 changes: 2 additions & 2 deletions seco_runtime_scaling.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import numpy as np

import sklearn_seco.concrete
import sklearn_seco


def time_seco(estimator: str, dataset_args: str) -> Optional[Sequence[float]]:
Expand Down Expand Up @@ -107,7 +107,7 @@ def log(message):
if __name__ == "__main__":
categorical = 'c' in sys.argv[1:]

estimator = sklearn_seco.concrete.RipperEstimator.__name__
estimator = sklearn_seco.RipperEstimator.__name__
log("start timing of %s" % estimator)
print("n_samples, n_features, timings...")
all_timings = []
Expand Down
8 changes: 7 additions & 1 deletion sklearn_seco/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,10 @@
- classification only, no regression
"""

__all__ = ['abstract', 'common', 'concrete', 'extra', 'tests', 'util']
from sklearn_seco.predefined import \
SimpleSeCoEstimator, CN2Estimator, IrepEstimator, RipperEstimator

# Public names of the package: module names first, then the predefined
# estimators that are imported above from `sklearn_seco.predefined` so that
# they can be imported directly from `sklearn_seco`.
# NOTE(review): the `predefined` module itself is not listed here — confirm
# whether that omission is intentional.
__all__ = [
    'abstract', 'common', 'concrete', 'extra', 'tests', 'util',
    'SimpleSeCoEstimator', 'CN2Estimator', 'IrepEstimator', 'RipperEstimator',
]
131 changes: 7 additions & 124 deletions sklearn_seco/concrete.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""
Implementation of SeCo / Covering algorithm:
Usual building blocks & known instantiations of the abstract base algorithm.
Usual building blocks of SeCo algorithms, implemented as Mixins.
Implemented as Mixins. For __init__ parameters, they use cooperative
multi-inheritance, each class has to declare **kwargs and forward anything it
doesn't consume using `super().__init__(**kwargs)`. Users of mixin-composed
classes will have to use keyword- instead of positional arguments.
For __init__ parameters, they use cooperative multi-inheritance: Each class has
to declare **kwargs and forward anything it doesn't consume using
`super().__init__(**kwargs)`. Creators of mixin-composed classes will have to
use keyword- instead of positional arguments.
"""

import functools
Expand All @@ -17,10 +17,9 @@
import numpy as np
from scipy.special import xlogy

from sklearn_seco.abstract import \
Theory, SeCoEstimator
from sklearn_seco.abstract import Theory
from sklearn_seco.common import \
Rule, RuleQueue, AugmentedRule, T, TGT, SeCoAlgorithmConfiguration, \
Rule, RuleQueue, AugmentedRule, T, TGT, \
AbstractSecoImplementation, RuleContext, TheoryContext
from sklearn_seco.ripper_mdl import \
data_description_length, relative_description_length
Expand Down Expand Up @@ -68,9 +67,6 @@ def grow_prune_split(y,
return grow, prune


# Implementation facets


class BeamSearch(AbstractSecoImplementation):
"""Mixin implementing a beam search of width `n`.
Expand Down Expand Up @@ -663,118 +659,5 @@ def _num_unique_values(self, feature: int):
return len(unique_values)


# Example Algorithm configurations


class SimpleSeCoEstimator(SeCoEstimator):
    """Example SeCo estimator assembled purely from predefined mixins.

    Nothing is overridden here: the behaviour is determined entirely by the
    classes referenced in `algorithm_config`.
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        # Rule context type used by this configuration.
        RuleContextClass = TopDownSearchContext

        # The order of the bases defines the MRO of the cooperating mixins,
        # so it must not be rearranged casually.
        class Implementation(BeamSearch,
                             TopDownSearchImplementation,
                             PurityHeuristic,
                             NoNegativesStop,
                             SkipPostPruning,
                             CoverageRuleStop,
                             SkipPostProcess):
            pass


class CN2Estimator(SeCoEstimator):
    """CN2 as refined by (Clark and Boswell 1991)."""

    class algorithm_config(SeCoAlgorithmConfiguration):
        # Rule context type used by this configuration.
        RuleContextClass = TopDownSearchContext

        # The order of the bases defines the MRO of the cooperating mixins,
        # so it must not be rearranged casually.
        class Implementation(BeamSearch,
                             TopDownSearchImplementation,
                             LaplaceHeuristic,
                             SignificanceStoppingCriterion,
                             SkipPostPruning,
                             PositiveThresholdRuleStop,
                             SkipPostProcess):
            # Class-level settings; the arrow names the mixin each one is
            # meant for.
            positive_coverage_stop_threshold = 1  # → PositiveThresholdRuleStop
            beam_width = 3  # → BeamSearch


class RipperEstimator(SeCoEstimator):
    """Ripper as defined by (Cohen 1995).

    NOTE: The global post-optimization phase is currently not implemented
    (that would be the `post_process` method).
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        RuleClass = ConditionTracingAugmentedRule
        TheoryContextClass = RipperMdlRuleStopTheoryContext

        # Base order defines the MRO of the cooperating mixins; keep as is.
        class Implementation(
                BeamSearch,
                TopDownSearchImplementation,
                InformationGainHeuristic,
                RipperMdlRuleStopImplementation,
                RipperPostPruning,
                SkipPostProcess):

            @classmethod
            @delayed_inner_stop
            def inner_stopping_criterion(
                    cls, rule: AugmentedRule, context: RuleContext) -> bool:
                """Laplace-based criterion. Field `accuRate` in JRip.java.

                :return: True iff (p + 1) / (p + n + 1) reaches 1, where
                  (p, n) is the result of `rule.pn(context)`; for
                  non-negative counts this means n == 0.
                """
                positives, negatives = rule.pn(context)
                return (positives + 1) / (positives + negatives + 1) >= 1

        class RuleContextClass(TopDownSearchContext,
                               GrowPruneSplitRuleContext):

            def pruning_heuristic(self,
                                  rule: AugmentedRule,
                                  context: RuleContext) -> float:
                """Laplace heuristic, as defined by (Clark and Boswell 1991).

                JRip documentation states:
                "The pruning metric is (p-n)/(p+n) -- but it's actually
                2p/(p+n) -1, so in this implementation we simply use p/(p+n)
                (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5)."

                :return: (p + 1) / (p + n + 2) with (p, n) taken from
                  `rule.pn(context)`.
                """
                positives, negatives = rule.pn(context)
                laplace = (positives + 1) / (positives + negatives + 2)
                return laplace


class IrepEstimator(SeCoEstimator):
    """IREP as defined by (Cohen 1995), originally by (Fürnkranz, Widmer 1994).
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        RuleClass = ConditionTracingAugmentedRule

        # Base order defines the MRO of the cooperating mixins; keep as is.
        class Implementation(
                BeamSearch,
                TopDownSearchImplementation,
                InformationGainHeuristic,
                NoNegativesStop,
                RipperPostPruning,
                CoverageRuleStop,
                SkipPostProcess):

            @classmethod
            def rule_stopping_criterion(
                    cls,
                    theory: Theory,
                    rule: AugmentedRule,
                    context: RuleContext) -> bool:
                """Delegate to the inherited criterion with `growing` off.

                Clears `context.growing` before handing over to the next
                `rule_stopping_criterion` in the MRO.
                """
                assert isinstance(context, GrowPruneSplitRuleContext)
                # presumably makes the inherited check count on the pruning
                # part of the grow/prune split — TODO confirm
                context.growing = False
                decision = super().rule_stopping_criterion(
                    theory, rule, context)
                return decision

        class RuleContextClass(TopDownSearchContext,
                               GrowPruneSplitRuleContext):

            def pruning_heuristic(
                    self,
                    rule: AugmentedRule,
                    context: GrowPruneSplitRuleContext) -> float:
                """:return: (#true positives + #true negatives) / #examples,
                  or 0 if `context.PN(rule.head)` reports no examples.
                """
                # Must be cleared before the counts are queried below.
                context.growing = False
                covered_pos, covered_neg = rule.pn(context)
                total_pos, total_neg = context.PN(rule.head)
                n_examples = total_pos + total_neg
                if n_examples == 0:
                    return 0
                true_negatives = total_neg - covered_neg
                return (covered_pos + true_negatives) / n_examples


# TODO: sklearn.get/set_param setting *Implementation fields?
# TODO: allow defining heuristics/metrics (and stop criteria?) as functions and pulling them in as growing_/pruning_heuristic etc without defining an extra class
125 changes: 125 additions & 0 deletions sklearn_seco/predefined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Implementation of SeCo / Covering algorithm:
Known instantiations / example configurations of the abstract base algorithm.
"""

from sklearn_seco.abstract import SeCoEstimator
from sklearn_seco.common import \
SeCoAlgorithmConfiguration, AugmentedRule, RuleContext, Theory
from sklearn_seco.concrete import TopDownSearchContext, BeamSearch, \
TopDownSearchImplementation, PurityHeuristic, NoNegativesStop, \
SkipPostPruning, CoverageRuleStop, SkipPostProcess, LaplaceHeuristic, \
SignificanceStoppingCriterion, PositiveThresholdRuleStop, \
ConditionTracingAugmentedRule, RipperMdlRuleStopTheoryContext, \
InformationGainHeuristic, RipperMdlRuleStopImplementation, \
RipperPostPruning, delayed_inner_stop, GrowPruneSplitRuleContext


class SimpleSeCoEstimator(SeCoEstimator):
    """Example SeCo estimator assembled purely from predefined mixins.

    Nothing is overridden here: the behaviour is determined entirely by the
    classes referenced in `algorithm_config`.
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        # Rule context type used by this configuration.
        RuleContextClass = TopDownSearchContext

        # The order of the bases defines the MRO of the cooperating mixins,
        # so it must not be rearranged casually.
        class Implementation(BeamSearch,
                             TopDownSearchImplementation,
                             PurityHeuristic,
                             NoNegativesStop,
                             SkipPostPruning,
                             CoverageRuleStop,
                             SkipPostProcess):
            pass


class CN2Estimator(SeCoEstimator):
    """CN2 as refined by (Clark and Boswell 1991)."""

    class algorithm_config(SeCoAlgorithmConfiguration):
        # Rule context type used by this configuration.
        RuleContextClass = TopDownSearchContext

        # The order of the bases defines the MRO of the cooperating mixins,
        # so it must not be rearranged casually.
        class Implementation(BeamSearch,
                             TopDownSearchImplementation,
                             LaplaceHeuristic,
                             SignificanceStoppingCriterion,
                             SkipPostPruning,
                             PositiveThresholdRuleStop,
                             SkipPostProcess):
            # Class-level settings; the arrow names the mixin each one is
            # meant for.
            positive_coverage_stop_threshold = 1  # → PositiveThresholdRuleStop
            beam_width = 3  # → BeamSearch


class RipperEstimator(SeCoEstimator):
    """Ripper as defined by (Cohen 1995).

    NOTE: The global post-optimization phase is currently not implemented
    (that would be the `post_process` method).
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        RuleClass = ConditionTracingAugmentedRule
        TheoryContextClass = RipperMdlRuleStopTheoryContext

        # Base order defines the MRO of the cooperating mixins; keep as is.
        class Implementation(
                BeamSearch,
                TopDownSearchImplementation,
                InformationGainHeuristic,
                RipperMdlRuleStopImplementation,
                RipperPostPruning,
                SkipPostProcess):

            @classmethod
            @delayed_inner_stop
            def inner_stopping_criterion(
                    cls, rule: AugmentedRule, context: RuleContext) -> bool:
                """Laplace-based criterion. Field `accuRate` in JRip.java.

                :return: True iff (p + 1) / (p + n + 1) reaches 1, where
                  (p, n) is the result of `rule.pn(context)`; for
                  non-negative counts this means n == 0.
                """
                positives, negatives = rule.pn(context)
                return (positives + 1) / (positives + negatives + 1) >= 1

        class RuleContextClass(TopDownSearchContext,
                               GrowPruneSplitRuleContext):

            def pruning_heuristic(self,
                                  rule: AugmentedRule,
                                  context: RuleContext) -> float:
                """Laplace heuristic, as defined by (Clark and Boswell 1991).

                JRip documentation states:
                "The pruning metric is (p-n)/(p+n) -- but it's actually
                2p/(p+n) -1, so in this implementation we simply use p/(p+n)
                (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5)."

                :return: (p + 1) / (p + n + 2) with (p, n) taken from
                  `rule.pn(context)`.
                """
                positives, negatives = rule.pn(context)
                laplace = (positives + 1) / (positives + negatives + 2)
                return laplace


class IrepEstimator(SeCoEstimator):
    """IREP as defined by (Cohen 1995), originally by (Fürnkranz, Widmer 1994).
    """

    class algorithm_config(SeCoAlgorithmConfiguration):
        RuleClass = ConditionTracingAugmentedRule

        # Base order defines the MRO of the cooperating mixins; keep as is.
        class Implementation(
                BeamSearch,
                TopDownSearchImplementation,
                InformationGainHeuristic,
                NoNegativesStop,
                RipperPostPruning,
                CoverageRuleStop,
                SkipPostProcess):

            @classmethod
            def rule_stopping_criterion(
                    cls,
                    theory: Theory,
                    rule: AugmentedRule,
                    context: RuleContext) -> bool:
                """Delegate to the inherited criterion with `growing` off.

                Clears `context.growing` before handing over to the next
                `rule_stopping_criterion` in the MRO.
                """
                assert isinstance(context, GrowPruneSplitRuleContext)
                # presumably makes the inherited check count on the pruning
                # part of the grow/prune split — TODO confirm
                context.growing = False
                decision = super().rule_stopping_criterion(
                    theory, rule, context)
                return decision

        class RuleContextClass(TopDownSearchContext,
                               GrowPruneSplitRuleContext):

            def pruning_heuristic(
                    self,
                    rule: AugmentedRule,
                    context: GrowPruneSplitRuleContext) -> float:
                """:return: (#true positives + #true negatives) / #examples,
                  or 0 if `context.PN(rule.head)` reports no examples.
                """
                # Must be cleared before the counts are queried below.
                context.growing = False
                covered_pos, covered_neg = rule.pn(context)
                total_pos, total_neg = context.PN(rule.head)
                n_examples = total_pos + total_neg
                if n_examples == 0:
                    return 0
                true_negatives = total_neg - covered_neg
                return (covered_pos + true_negatives) / n_examples
4 changes: 2 additions & 2 deletions sklearn_seco/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from sklearn.utils import check_random_state
from sklearn.utils.testing import set_random_state

from sklearn_seco import \
SimpleSeCoEstimator, CN2Estimator, RipperEstimator, IrepEstimator
from sklearn_seco.abstract import SeCoEstimator
from sklearn_seco.common import Theory
from sklearn_seco.concrete import \
SimpleSeCoEstimator, CN2Estimator, IrepEstimator, RipperEstimator

from .datasets import Dataset, \
binary_slight_overlap, xor_2d, checkerboard_2d, \
Expand Down
4 changes: 2 additions & 2 deletions sklearn_seco/tests/test_concrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

from sklearn_seco.abstract import _BaseSeCoEstimator
from sklearn_seco.common import Rule
from sklearn_seco.concrete import grow_prune_split, SimpleSeCoEstimator, \
TopDownSearchImplementation
from sklearn_seco.concrete import grow_prune_split, TopDownSearchImplementation
# Use the package-qualified path: a bare `predefined` would only resolve if
# the `sklearn_seco/` directory itself happened to be on `sys.path`.
from sklearn_seco.predefined import SimpleSeCoEstimator
from sklearn_seco.util import TargetTransformingMetaEstimator, \
BySizeLabelEncoder
from .conftest import count_conditions
Expand Down

0 comments on commit 2df453a

Please sign in to comment.