Skip to content

Commit

Permalink
Great expectation export improvements (#496)
Browse files Browse the repository at this point in the history
* feat: export GX add expectation_suite_name param

* feat: add engine, suite_name and sql_server_type management in GE export

* format

* doc: update changelog and readme

* tests: add cli tests
  • Loading branch information
pierre-monnet authored Nov 14, 2024
1 parent 0925f76 commit bcb7d53
Show file tree
Hide file tree
Showing 8 changed files with 690 additions and 84 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- Support for model import from parquet file metadata.
- Great Expectations export: add optional args (#496)
  - `suite_name`: the name of the expectation suite to export
  - `engine`: the engine used to run checks
  - `sql_server_type`: the type of SQL Server to use when `engine` is `sql`
- Changelog support for `Info` and `Terms` blocks.

### Changed
Expand Down
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -819,9 +819,10 @@ Available export options:
| `dcs` | Export to Data Contract Specification in YAML format ||
| Missing something? | Please create an issue on GitHub | TBD |


#### Great Expectations

The export function transforms a specified data contract into a comprehensive Great Expectations JSON suite.
The `export` function transforms a specified data contract into a comprehensive Great Expectations JSON suite.
If the contract includes multiple models, you need to specify the name of the model you wish to export.

```shell
Expand All @@ -831,7 +832,22 @@ datacontract export datacontract.yaml --format great-expectations --model order
The export creates a list of expectations by utilizing:

- The data from the Model definition with a fixed mapping
- The expectations provided in the quality field for each model (find here the expectations gallery https://greatexpectations.io/expectations/)
- The expectations provided in the quality field for each model (see the [Great Expectations Gallery](https://greatexpectations.io/expectations/) for the available expectations)

##### Additional Arguments

To further customize the export, the following optional arguments are available:

- **`suite_name`**: The name of the expectation suite. This suite groups all generated expectations and provides a convenient identifier within Great Expectations. If not provided, a default suite name will be generated based on the model name(s).

- **`engine`**: Specifies the engine used to run Great Expectations checks. Accepted values are:
  - `pandas` — Use this when working with in-memory data frames through the Pandas library.
  - `spark` — Use this for working with Spark dataframes.
  - `sql` — Use this for working with SQL databases.

- **`sql_server_type`**: Specifies the type of SQL server to connect with when `engine` is set to `sql`.

Providing `sql_server_type` ensures that the appropriate SQL dialect and connection settings are applied during the expectation validation.

#### RDF

Expand Down
95 changes: 72 additions & 23 deletions datacontract/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,13 @@
from datacontract.catalog.catalog import create_data_contract_html, create_index_html
from datacontract.data_contract import DataContract, ExportFormat
from datacontract.imports.importer import ImportFormat
from datacontract.init.download_datacontract_file import FileExistsException, download_datacontract_file
from datacontract.integration.datamesh_manager import publish_data_contract_to_datamesh_manager
from datacontract.init.download_datacontract_file import (
FileExistsException,
download_datacontract_file,
)
from datacontract.integration.datamesh_manager import (
publish_data_contract_to_datamesh_manager,
)

DEFAULT_DATA_CONTRACT_SCHEMA_URL = "https://datacontract.com/datacontract.schema.json"

Expand Down Expand Up @@ -45,7 +50,11 @@ def version_callback(value: bool):
def common(
ctx: typer.Context,
version: bool = typer.Option(
None, "--version", help="Prints the current version.", callback=version_callback, is_eager=True
None,
"--version",
help="Prints the current version.",
callback=version_callback,
is_eager=True,
),
):
"""
Expand All @@ -61,7 +70,8 @@ def common(
@app.command()
def init(
location: Annotated[
str, typer.Argument(help="The location (url or path) of the data contract yaml to create.")
str,
typer.Argument(help="The location (url or path) of the data contract yaml to create."),
] = "datacontract.yaml",
template: Annotated[
str, typer.Option(help="URL of a template or data contract")
Expand All @@ -83,10 +93,12 @@ def init(
@app.command()
def lint(
location: Annotated[
str, typer.Argument(help="The location (url or path) of the data contract yaml.")
str,
typer.Argument(help="The location (url or path) of the data contract yaml."),
] = "datacontract.yaml",
schema: Annotated[
str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema")
str,
typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
):
"""
Expand All @@ -99,10 +111,12 @@ def lint(
@app.command()
def test(
location: Annotated[
str, typer.Argument(help="The location (url or path) of the data contract yaml.")
str,
typer.Argument(help="The location (url or path) of the data contract yaml."),
] = "datacontract.yaml",
schema: Annotated[
str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema")
str,
typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
server: Annotated[
str,
Expand All @@ -114,7 +128,8 @@ def test(
),
] = "all",
examples: Annotated[
bool, typer.Option(help="Run the schema and quality tests on the example data within the data contract.")
bool,
typer.Option(help="Run the schema and quality tests on the example data within the data contract."),
] = None,
publish: Annotated[str, typer.Option(help="The url to publish the results after the test")] = None,
publish_to_opentelemetry: Annotated[
Expand Down Expand Up @@ -165,7 +180,10 @@ def export(
# TODO: this should be a subcommand
rdf_base: Annotated[
Optional[str],
typer.Option(help="[rdf] The base URI used to generate the RDF graph.", rich_help_panel="RDF Options"),
typer.Option(
help="[rdf] The base URI used to generate the RDF graph.",
rich_help_panel="RDF Options",
),
] = None,
# TODO: this should be a subcommand
sql_server_type: Annotated[
Expand All @@ -176,11 +194,18 @@ def export(
),
] = "auto",
location: Annotated[
str, typer.Argument(help="The location (url or path) of the data contract yaml.")
str,
typer.Argument(help="The location (url or path) of the data contract yaml."),
] = "datacontract.yaml",
schema: Annotated[
str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema")
str,
typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
# TODO: this should be a subcommand
engine: Annotated[
Optional[str],
typer.Option(help="[engine] The engine used for great expection run."),
] = None,
):
"""
Convert data contract to a specific format. console.prints to stdout.
Expand All @@ -192,6 +217,7 @@ def export(
server=server,
rdf_base=rdf_base,
sql_server_type=sql_server_type,
engine=engine,
)
# Don't interpret console markup in output.
if output is None:
Expand All @@ -206,7 +232,8 @@ def export(
def import_(
format: Annotated[ImportFormat, typer.Option(help="The format of the source file.")],
source: Annotated[
Optional[str], typer.Option(help="The path to the file or Glue Database that should be imported.")
Optional[str],
typer.Option(help="The path to the file or Glue Database that should be imported."),
] = None,
glue_table: Annotated[
Optional[List[str]],
Expand Down Expand Up @@ -270,10 +297,12 @@ def import_(
@app.command(name="publish")
def publish(
location: Annotated[
str, typer.Argument(help="The location (url or path) of the data contract yaml.")
str,
typer.Argument(help="The location (url or path) of the data contract yaml."),
] = "datacontract.yaml",
schema: Annotated[
str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema")
str,
typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
):
"""
Expand All @@ -289,11 +318,13 @@ def publish(
@app.command(name="catalog")
def catalog(
files: Annotated[
Optional[str], typer.Option(help="Glob pattern for the data contract files to include in the catalog.")
Optional[str],
typer.Option(help="Glob pattern for the data contract files to include in the catalog."),
] = "*.yaml",
output: Annotated[Optional[str], typer.Option(help="Output directory for the catalog html files.")] = "catalog/",
schema: Annotated[
str, typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema")
str,
typer.Option(help="The location (url or path) of the Data Contract Specification JSON Schema"),
] = DEFAULT_DATA_CONTRACT_SCHEMA_URL,
):
"""
Expand All @@ -315,8 +346,14 @@ def catalog(

@app.command()
def breaking(
location_old: Annotated[str, typer.Argument(help="The location (url or path) of the old data contract yaml.")],
location_new: Annotated[str, typer.Argument(help="The location (url or path) of the new data contract yaml.")],
location_old: Annotated[
str,
typer.Argument(help="The location (url or path) of the old data contract yaml."),
],
location_new: Annotated[
str,
typer.Argument(help="The location (url or path) of the new data contract yaml."),
],
):
"""
Identifies breaking changes between data contracts. Prints to stdout.
Expand All @@ -335,8 +372,14 @@ def breaking(

@app.command()
def changelog(
location_old: Annotated[str, typer.Argument(help="The location (url or path) of the old data contract yaml.")],
location_new: Annotated[str, typer.Argument(help="The location (url or path) of the new data contract yaml.")],
location_old: Annotated[
str,
typer.Argument(help="The location (url or path) of the old data contract yaml."),
],
location_new: Annotated[
str,
typer.Argument(help="The location (url or path) of the new data contract yaml."),
],
):
"""
Generate a changelog between data contracts. Prints to stdout.
Expand All @@ -352,8 +395,14 @@ def changelog(

@app.command()
def diff(
location_old: Annotated[str, typer.Argument(help="The location (url or path) of the old data contract yaml.")],
location_new: Annotated[str, typer.Argument(help="The location (url or path) of the new data contract yaml.")],
location_old: Annotated[
str,
typer.Argument(help="The location (url or path) of the old data contract yaml."),
],
location_new: Annotated[
str,
typer.Argument(help="The location (url or path) of the new data contract yaml."),
],
):
"""
PLACEHOLDER. Currently works as 'changelog' does.
Expand Down
2 changes: 1 addition & 1 deletion datacontract/export/exporter_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def load_module_class(module_path, class_name):
exporter_factory.register_lazy_exporter(
name=ExportFormat.great_expectations,
module_path="datacontract.export.great_expectations_converter",
class_name="GreateExpectationsExporter",
class_name="GreatExpectationsExporter",
)

exporter_factory.register_lazy_exporter(
Expand Down
Loading

0 comments on commit bcb7d53

Please sign in to comment.