# `pipeline.yaml` is the main configuration file for an MLflow Pipeline.
# Required pipeline parameters should be defined in this file with either concrete values or
# variables such as {{ INGEST_DATA_LOCATION }}.
#
# Variables must be dereferenced in a profile YAML file, located under `profiles/`.
# See `profiles/local.yaml` for example usage. One may switch among profiles quickly by
# providing a profile name such as `local` in the Pipeline object constructor:
# `p = Pipeline(profile="local")`
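#
# As a rough sketch, a profile file assigns concrete values to the variables referenced in this
# file (variable names below are the ones used in this template; values are illustrative; see
# `profiles/local.yaml` for the authoritative example):
#   # profiles/local.yaml
#   INGEST_DATA_LOCATION: "./data/sample.parquet"
#   INGEST_DATA_FORMAT: "parquet"
# The pipeline can then be run end-to-end or one step at a time, assuming the `mlflow.pipelines`
# Python API (check the MLflow Pipelines documentation for the exact surface):
#   from mlflow.pipelines import Pipeline
#   p = Pipeline(profile="local")
#   p.run()              # run all pipeline steps
#   p.run(step="train")  # or run a single step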
#
# NOTE: All "FIXME::REQUIRED" fields in pipeline.yaml and profiles/*.yaml must be set correctly
# to adapt this template to a specific regression problem. To find all required fields,
# under the root directory of this pipeline, type on a unix-like command line:
# $> grep "# FIXME::REQUIRED:" pipeline.yaml profiles/*.yaml
#
# NOTE: YAML does not support tabs for indentation. Please use spaces and ensure that all YAML
# files are properly formatted.
template: "regression/v1"
# Specifies the dataset to use for model development
data:
  # Dataset locations on the local filesystem are supported, as well as HTTP(S) URLs and
  # any other remote locations resolvable by MLflow, such as those listed in
  # https://mlflow.org/docs/latest/tracking.html#artifact-stores
  location: {{INGEST_DATA_LOCATION}}
  # Beyond `parquet` datasets, the `spark_sql` and `delta` formats are also natively supported for
  # use with Spark
  format: {{INGEST_DATA_FORMAT|default('parquet')}}
  # Datasets with other formats, including `csv`, can be used by implementing and
  # specifying a `custom_loader_method`
  custom_loader_method: steps.ingest.load_file_as_dataframe
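  # As a rough sketch, a custom loader is a Python function defined in `steps/ingest.py` that
  # takes the dataset location and format and returns a pandas DataFrame (the exact signature
  # is assumed here; check the `steps/ingest.py` stub shipped with this template):
  #   def load_file_as_dataframe(file_path, file_format):
  #       import pandas as pd
  #       if file_format == "csv":
  #           return pd.read_csv(file_path)
  #       raise NotImplementedError(f"Unsupported format: {file_format}")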
  # If the `spark_sql` `format` is specified, a `sql` command can be supplied for Spark to read
  # sub-columns from the table.
  # If the table location is path-like:
  # sql: SELECT col1, col2 FROM delta.`{{INGEST_DATA_LOCATION}}`
  # If the table location is table-like:
  # sql: SELECT col1, col2 FROM {{INGEST_DATA_LOCATION}}
  # If the `delta` `format` is specified, you can also configure the Delta table `version` to read
  # or the `timestamp` at which to read data
  # version: 2
  # timestamp: 2022-06-01T00:00:00.000Z
# FIXME::OPTIONAL: Specify the dataset to use for batch scoring. All params serve the same function
# as in `data`
# data_scoring:
#   location: {{INGEST_SCORING_DATA_LOCATION}}
#   format: {{INGEST_SCORING_DATA_FORMAT|default('parquet')}}
#   custom_loader_method: steps.ingest.load_file_as_dataframe
#   sql: SELECT * FROM delta.`{{INGEST_SCORING_DATA_LOCATION}}`
#
# FIXME::REQUIRED: Specifies the target column name for model training and evaluation.
#
target_col: ""
steps:
  split:
    #
    # FIXME::OPTIONAL: Adjust the train/validation/test split ratios below.
    #
    split_ratios: [0.75, 0.125, 0.125]
    #
    # FIXME::OPTIONAL: Specifies the method to use to "post-process" the split datasets. Note that
    # arbitrary transformations should go into the transform step.
    post_split_filter_method: steps.split.create_dataset_filter
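    # As a rough sketch, a post-split filter is a Python function defined in `steps/split.py`
    # that takes the training DataFrame and returns a boolean Series marking the rows to keep
    # (the exact signature is assumed here; check the `steps/split.py` stub):
    #   def create_dataset_filter(dataset):
    #       return dataset.notna().all(axis=1)  # e.g. keep only rows without missing values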
  transform:
    #
    # FIXME::OPTIONAL: Specifies the method that defines an sklearn-compatible transformer, which
    # applies input feature transformation during model training and inference.
    transformer_method: steps.transform.transformer_fn
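    # A transformer method returns an *unfitted* object exposing sklearn's fit/transform
    # interface. A minimal sketch (`StandardScaler` is just an illustrative choice):
    #   def transformer_fn():
    #       from sklearn.preprocessing import StandardScaler
    #       return StandardScaler()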
  train:
    using: estimator_spec
    # Specifies the method that defines the estimator type and parameters to use for model training
    estimator_method: steps.train.estimator_fn
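    # An estimator method returns an *unfitted* object exposing sklearn's fit/predict
    # interface. A minimal sketch (`LinearRegression` is just an illustrative choice):
    #   def estimator_fn():
    #       from sklearn.linear_model import LinearRegression
    #       return LinearRegression()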
  evaluate:
    #
    # FIXME::OPTIONAL: Sets performance thresholds that a trained model must meet in order to be
    # eligible for registration to the MLflow Model Registry.
    #
    # validation_criteria:
    #   - metric: root_mean_squared_error
    #     threshold: 10
  register:
    #
    # FIXME::REQUIRED: Specifies the name of the Registered Model to use when registering a trained
    # model to the MLflow Model Registry.
    #
    model_name: ""
    # Indicates whether a model that fails to meet performance thresholds should still
    # be registered to the MLflow Model Registry
    allow_non_validated_model: false
  # FIXME::OPTIONAL: Configure the predict step for batch scoring. See README.md for full
  # configuration reference.
  # predict:
  #   output_format: {{SCORED_OUTPUT_DATA_FORMAT|default('parquet')}}
  #   output_location: {{SCORED_OUTPUT_DATA_LOCATION}}
  #   model_uri: "models/model.pkl"
  #   result_type: "double"
  #   save_mode: "default"
metrics:
  #
  # FIXME::REQUIRED: Sets the primary metric to use to evaluate model performance. This primary
  # metric is used to select the best performing models in the MLflow UI as well as in the
  # train and evaluate steps.
  # Built-in metrics are: example_count, mean_absolute_error, mean_squared_error,
  # root_mean_squared_error, sum_on_label, mean_on_label, r2_score, max_error,
  # mean_absolute_percentage_error
  primary: ""
  #
  # FIXME::OPTIONAL: Defines custom performance metrics to compute during model development.
  #
  # custom:
  #   - name: ""
  #     function: get_custom_metrics
  #     greater_is_better: false
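  # As a rough sketch, a custom metric function takes the evaluation DataFrame and the built-in
  # metrics and returns a dict of metric values (the exact signature and the `prediction`/`target`
  # column names are assumed here; check this template's README):
  #   def get_custom_metrics(eval_df, builtin_metrics):
  #       abs_error = (eval_df["prediction"] - eval_df["target"]).abs()
  #       return {"mean_abs_error_manual": abs_error.mean()}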