Merge branch 'main' of https://github.com/sapientml/core into main
AkiraUra committed Aug 23, 2023
2 parents a0f4b3a + eb111d1 commit 51263fb
Showing 14 changed files with 240 additions and 3,720 deletions.
3,637 changes: 0 additions & 3,637 deletions poetry.lock

This file was deleted.

17 changes: 11 additions & 6 deletions pyproject.toml
@@ -12,8 +12,8 @@ license = "Apache-2.0"
[tool.poetry.dependencies]
python = ">=3.10,<3.13"
sapientml = "*"
sapientml-loaddata = "^0.2.5"
sapientml-preprocess = "^0.2.2"
sapientml-loaddata = "^0.3.3"
sapientml-preprocess = "^0.3.1"
scikit-learn = "1.1.3"
scipy = "^1.11.1"
jinja2 = "^3.1.2"
@@ -44,8 +44,11 @@ pre-commit = "^3.3.3"
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.plugins."pipeline_generator"]
sapientml_core = "sapientml_core:SapientMLGenerator"
[tool.poetry.plugins."sapientml.pipeline_generator"]
sapientml = "sapientml_core:SapientMLGenerator"

[tool.poetry.plugins."sapientml.config"]
sapientml = "sapientml_core:SapientMLConfig"

[tool.pysen]
version = "0.10"
@@ -63,5 +66,7 @@ py_version = "py310"

[tool.pysen.lint.source]
includes = ["sapientml_core/", "tests/"]
[pytest]
addopts = "-p no:cacheprovider -s"

[tool.pytest.ini_options]
addopts = "-s -x --cov=sapientml_core"
testpaths = ["tests"]
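
Note: the plugin groups are now namespaced under "sapientml." — the pipeline generator moves from "pipeline_generator" to "sapientml.pipeline_generator", and a new "sapientml.config" group registers SapientMLConfig. A minimal sketch of how a host application might discover these entry points, mirroring the importlib.metadata lookup that generator.py uses below; the host-side wiring itself is an assumption, only the group and entry-point names come from this diff:

from importlib.metadata import entry_points

# Load the pipeline generator registered by sapientml_core under the new group name.
generator_cls = entry_points(group="sapientml.pipeline_generator")["sapientml"].load()

# Load the companion configuration class registered under "sapientml.config".
config_cls = entry_points(group="sapientml.config")["sapientml"].load()

print(generator_cls.__name__, config_cls.__name__)  # SapientMLGenerator SapientMLConfig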
3 changes: 2 additions & 1 deletion sapientml_core/__init__.py
@@ -13,5 +13,6 @@
# limitations under the License.

from .generator import SapientMLGenerator
from .params import SapientMLConfig

__all__ = ["SapientMLGenerator"]
__all__ = ["SapientMLGenerator", "SapientMLConfig"]
@@ -93,7 +93,7 @@ def _get_labels_from_skeleton_predictor(self):
)
)

n_models = self.task.n_models
n_models = self.config.n_models
if n_models < 1:
raise ValueError("Please set 'n_models' to a number greater than or equal to 1.")
model_labels = dict(list(model_labels.items())[0:n_models])
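
Note: the candidate-model limit is now read from the shared configuration (self.config.n_models) instead of the task object. A minimal sketch of the same guard as a standalone helper, assuming SapientMLConfig exposes an n_models field (the hunk above only shows it being read):

from sapientml_core import SapientMLConfig


def select_top_models(model_labels: dict, config: SapientMLConfig) -> dict:
    """Keep only the first config.n_models predicted model labels."""
    if config.n_models < 1:
        raise ValueError("Please set 'n_models' to a number greater than or equal to 1.")
    return dict(list(model_labels.items())[: config.n_models])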
4 changes: 2 additions & 2 deletions sapientml_core/explain/code_miner.py
@@ -714,7 +714,7 @@ def execute_notebook(
if (timeout > 0) and (time.time() - start_time) > timeout:
ep_thread.trigger_interrupt_kernel()
is_interrupted = True
if cancel and cancel.isTriggered:
if cancel and cancel.is_triggered:
ep_thread.trigger_interrupt_kernel()
is_interrupted = True
time.sleep(1)
@@ -729,7 +729,7 @@ def save_all(self, execution=False, timeout: int = 0, cancel: Optional[Cancellat
path = os.path.join(self.output_path, rec["filename"] + ".ipynb")
self.__save__(path, rec["jupyter"])
self.logger.info(f"saved:{path}")
if execution and (cancel is None or cancel.isTriggered is False):
if execution and (cancel is None or cancel.is_triggered is False):
try:
self.logger.info("Running the explained notebook...")
with open(path, "r", encoding="utf-8") as f:
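
Note: the cancellation checks in code_miner.py now use the snake_case is_triggered attribute of the cancellation token instead of isTriggered. A minimal sketch of the polling pattern shown above, with the token treated as any object exposing an is_triggered attribute; the helper itself is illustrative and not part of the codebase:

import time


def wait_for_completion(is_done, cancel=None, timeout: int = 0) -> bool:
    """Poll until is_done() returns True, a timeout expires, or the token is triggered."""
    start_time = time.time()
    while not is_done():
        if timeout > 0 and (time.time() - start_time) > timeout:
            return False  # timed out
        if cancel is not None and cancel.is_triggered:
            return False  # cancelled by the caller
        time.sleep(1)
    return True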
166 changes: 113 additions & 53 deletions sapientml_core/generator.py
@@ -14,43 +14,45 @@

import ast
import copy
import glob
import json
import re
from importlib.metadata import entry_points
from pathlib import Path
from typing import Optional, Tuple
from shutil import copyfile
from typing import Tuple, Union

import pandas as pd
from sapientml.executor import PipelineExecutor
from sapientml.generator import CodeBlockGenerator, PipelineGenerator
from sapientml.params import CancellationToken, Code, Config, Dataset, PipelineResult, RunningResult, Task
from sapientml.result import SapientMLGeneratorResult
from sapientml.macros import metric_lower_is_better
from sapientml.params import Code, Dataset, PipelineResult, RunningResult, Task
from sapientml.util.json_util import JSONEncoder
from sapientml.util.logging import setup_logger

from . import ps_macros
from .adaptation.generation.template_based_adaptation import Adaptation
from .explain.main import process as explain
from .params import Pipeline, summarize_dataset
from .params import Pipeline, SapientMLConfig, summarize_dataset
from .seeding.predictor import predict

model_dir_path_default = Path(__file__).parent / "models"

logger = setup_logger()


def _is_strnum_column(c):
c2 = c.loc[c.notnull()]
c2 = pd.to_numeric(c2, errors="coerce")
ratio = c2.notnull().sum() / c2.shape[0]
return ratio > 0.9


class SapientMLGenerator(PipelineGenerator, CodeBlockGenerator):
def __init__(self, config: Config):
CodeBlockGenerator.__init__(self, config)
eps = entry_points(group="code_block_generator")
self.loaddata = eps["loaddata"].load()(config)
self.preprocess = eps["preprocess"].load()(config)
def __init__(self, **kwargs):
self.config = SapientMLConfig(**kwargs)
self.config.postinit()
eps = entry_points(group="sapientml.code_block_generator")
self.loaddata = eps["loaddata"].load()(**kwargs)
self.preprocess = eps["preprocess"].load()(**kwargs)

def generate_pipeline(self, dataset: Dataset, task: Task) -> list[Code]:
def generate_pipeline(self, dataset: Dataset, task: Task):
self.dataset = dataset
self.task = task

logger.info("Generating pipelines...")
dataset, loaddata_block = self.loaddata.generate_code(dataset, task)
dataset, preprocess_block = self.preprocess.generate_code(dataset, task)
code_block = loaddata_block + preprocess_block
@@ -63,10 +65,27 @@ def generate_pipeline(self, dataset: Dataset, task: Task) -> list[Code]:
pipeline.train = code_block.train + pipeline.train
pipeline.predict = code_block.predict + pipeline.predict
result_pipelines.append(pipeline)
return result_pipelines

logger.info("Executing generated pipelines...")
executor = PipelineExecutor()
self.execution_results = executor.execute(
result_pipelines,
self.config.initial_timeout,
Path(dataset.output_dir),
self.config.cancel,
)

logger.info("Evaluating execution results of generated pipelines...")
lower_is_better = self.task.adaptation_metric in metric_lower_is_better
self.evaluate(self.execution_results, lower_is_better)
logger.info("Done.")

return (self._best_pipeline, self._best_pipeline_score), self._candidate_scripts

def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, list[Pipeline]]:
df = dataset.training_dataframe
# Generate the meta-features
logger.info("Generating meta features ...")
dataset_summary = summarize_dataset(df, task) # type: ignore
if dataset_summary.has_inf_value_targets:
raise ValueError("Stopped generation because target columns have infinity value.")
@@ -97,11 +116,11 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, list[Pip

return dataset, pipelines

def evaluate(self, pipeline_results: list[tuple[Code, RunningResult]], lower_is_better: bool = False) -> None:
def evaluate(self, execution_results: list[tuple[Code, RunningResult]], lower_is_better: bool = False) -> None:
self._best_pipeline = None
self._best_pipeline_score = PipelineResult(score=None, metric=None, best_params=None)
candidate_scripts = []
for pipeline, result in pipeline_results:
for pipeline, result in execution_results:
if result.returncode == 0:
pipeline_score = self._parse_pipeline_output(result.output)
else:
@@ -140,10 +159,6 @@ def evaluate(self, pipeline_results: list[tuple[Code, RunningResult]], lower_is_
self._best_pipeline = best_pipeline
self._best_pipeline_score = best_pipeline_tuple[1]


def get_result(self):
return (self._best_pipeline, self._best_pipeline_score), self._candidate_scripts

@staticmethod
def _parse_pipeline_output(output: str):
score = None
@@ -162,40 +177,85 @@ def _parse_pipeline_output(output: str):
pass
return PipelineResult(score=score, metric=metric, best_params=best_params)

def save(
self,
result: SapientMLGeneratorResult,
output_dir_path: str,
project_name: str = "",
cancel: Optional[CancellationToken] = None,
):
if self._best_pipeline is None:
return

skeleton = self._best_pipeline.labels

def save(self, output_dir: Union[Path, str]):
def add_prefix(filename, prefix):
if not prefix:
return filename
return f"{prefix}_{filename}"

path = Path(output_dir)
path.mkdir(parents=True, exist_ok=True)

_output_dir = Path(self.dataset.output_dir)

candidate_scripts = self._candidate_scripts
if candidate_scripts:
if self._best_pipeline:
script_body = self._best_pipeline.test.replace(_output_dir.as_posix(), ".")
with open(path / add_prefix("final_script.py", self.config.project_name), "w", encoding="utf-8") as f:
f.write(script_body)

script_body = self._best_pipeline.train.replace(_output_dir.as_posix(), ".")
with open(path / add_prefix("final_train.py", self.config.project_name), "w", encoding="utf-8") as f:
f.write(script_body)

script_body = self._best_pipeline.predict.replace(_output_dir.as_posix(), ".")
with open(path / add_prefix("final_predict.py", self.config.project_name), "w", encoding="utf-8") as f:
f.write(script_body)

with open(
path / (add_prefix("final_script", self.config.project_name) + ".out.json"),
"w",
encoding="utf-8",
) as f:
json.dump(self._best_pipeline_score.__dict__, f, cls=JSONEncoder, indent=4)
else:
logger.warning("All candidate scripts failed. Final script is not saved.")
raise RuntimeError("All candidate scripts failed. Final script is not saved.")

# copy libs
lib_path = path / "lib"
lib_path.mkdir(exist_ok=True)

eps = entry_points(group="sapientml.export_modules")
for ep in eps:
for file in glob.glob(f"{ep.load().__path__[0]}/*.py"):
copyfile(file, lib_path / Path(file).name)

for index, (script, detail) in enumerate(candidate_scripts, start=1):
# script.dataset.training_data_path is '{user specified dir}/{name}.csv' or '{tmpdir}/training.pkl'
# If latter one, we have to modify the {tmpdir} to output_dir.
script_body = script.validation.replace(_output_dir.as_posix(), ".")

with open(path / f"{index}_script.py", "w", encoding="utf-8") as f:
f.write(script_body)

skeleton = self._best_pipeline.labels

debug_info = {}
for i, candidate in enumerate(self._candidate_scripts):
for i, candidate in enumerate(candidate_scripts, start=1):
info = {"content": candidate[0].dict(), "run_info": candidate[1].__dict__}
debug_info[i] = info

explain(
visualization=True,
eda=True,
dataframe=result.training_data,
script_path=(Path(output_dir_path) / add_prefix("final_script.py", project_name)).absolute().as_posix(),
target_columns=result.target_columns,
problem_type=result.task_type,
ignore_columns=result.ignore_columns,
skeleton=skeleton,
explanation=self._best_pipeline.pipeline_json,
run_info=debug_info,
internal_execution=True,
timeout=result.timeout_for_test,
cancel=cancel,
)
if self.config.debug:
with open(path / add_prefix("run_info.json", self.config.project_name), "w", encoding="utf-8") as f:
json.dump(debug_info, f, cls=JSONEncoder, indent=4)

if self.config.add_explanation:
explain(
visualization=True,
eda=True,
dataframe=self.dataset.training_dataframe,
script_path=(Path(output_dir) / add_prefix("final_script.py", self.config.project_name))
.absolute()
.as_posix(),
target_columns=self.task.target_columns,
problem_type=self.task.task_type,
ignore_columns=self.task.ignore_columns,
skeleton=skeleton,
explanation=self._best_pipeline.pipeline_json,
run_info=debug_info,
internal_execution=True,
timeout=self.config.timeout_for_test,
cancel=self.config.cancel,
)
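
Note: with this refactoring, generate_pipeline() also executes and evaluates the candidate pipelines, and save() writes the final scripts, the copied lib modules, and the optional run_info.json and explanation outputs. A minimal end-to-end sketch of the new calling sequence; the constructor is shown without keyword arguments because the concrete SapientMLConfig fields are not part of this diff (any extra kwargs are forwarded to SapientMLConfig and to the loaddata/preprocess plugins):

from pathlib import Path

from sapientml.params import Dataset, Task
from sapientml_core import SapientMLGenerator


def run_sapientml(dataset: Dataset, task: Task, output_dir: str = "./outputs"):
    """Generate, execute, evaluate, and save pipelines with the refactored API (sketch)."""
    generator = SapientMLGenerator()  # kwargs would feed SapientMLConfig and the plugins
    (best_pipeline, best_score), candidates = generator.generate_pipeline(dataset, task)
    generator.save(Path(output_dir))  # final_script.py, final_train.py, final_predict.py, lib/, ...
    return best_pipeline, best_score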
(Diffs for the remaining changed files are not shown here.)
