From 17fcc233afbdafa9dcf005902b465d2138b0b979 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 6 Dec 2022 16:50:10 -0500 Subject: [PATCH] tweak + fix tests --- .../src/datahub/ingestion/source/ge_data_profiler.py | 11 +++++++---- .../tests/integration/trino/trino_mces_golden.json | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 4e9c46d211a5ae..7e9dab140465eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -574,19 +574,22 @@ def generate_dataset_profile( # noqa: C901 (complexity) self._get_dataset_column_median(column_profile, column) self._get_dataset_column_stdev(column_profile, column) - self._get_dataset_column_quantiles(column_profile, column) - self._get_dataset_column_histogram(column_profile, column) - if cardinality in [ Cardinality.ONE, Cardinality.TWO, Cardinality.VERY_FEW, - Cardinality.FEW, ]: self._get_dataset_column_distinct_value_frequencies( column_profile, column, ) + if cardinality in { + Cardinality.FEW, + Cardinality.MANY, + Cardinality.VERY_MANY, + }: + self._get_dataset_column_quantiles(column_profile, column) + self._get_dataset_column_histogram(column_profile, column) elif type_ == ProfilerDataType.STRING: if cardinality in [ diff --git a/metadata-ingestion/tests/integration/trino/trino_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_mces_golden.json index c4adf3d8265a71..b0892c16a27f8e 100644 --- a/metadata-ingestion/tests/integration/trino/trino_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_mces_golden.json @@ -656,7 +656,7 @@ "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"3\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"1.0\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"1\"}, {\"quantile\": \"0.25\", \"value\": \"1\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"3\"}, {\"quantile\": \"0.95\", \"value\": \"3\"}], \"histogram\": {\"boundaries\": [\"1.0\", \"2.0\", \"3.0\"], \"heights\": [0.0, 0.3333333333333333, 0.6666666666666666, 0.0]}, \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}]}", + "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 3, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"3\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"1.0\", \"sampleValues\": [\"1\", \"2\", \"3\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 1\", \"Book 2\", \"Book 3\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 3, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"ABC\", \"PQR\", \"XYZ\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"tags\", \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"genre_ids\", \"uniqueCount\": 0, \"nullCount\": 3, \"nullProportion\": 1, \"sampleValues\": []}]}", "contentType": "application/json" }, "systemMetadata": { @@ -670,7 +670,7 @@ "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"1\"}, {\"quantile\": \"0.25\", \"value\": \"1\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"2\"}, {\"quantile\": \"0.95\", \"value\": \"2\"}], \"histogram\": {\"boundaries\": [\"1.0\", \"1.5\", \"2.0\"], \"heights\": [0.0, 0.5, 0.5, 0.0]}, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"1\"}, {\"quantile\": \"0.25\", \"value\": \"1\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"2\"}, {\"quantile\": \"0.95\", \"value\": \"2\"}], \"histogram\": {\"boundaries\": [\"1.0\", \"1.5\", \"2.0\"], \"heights\": [0.0, 0.5, 0.5, 0.0]}, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"distinctValueFrequencies\": [{\"value\": \"2021-09-27\", \"frequency\": 2}], \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", + "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"book_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 0.5, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"distinctValueFrequencies\": [{\"value\": \"2021-09-27\", \"frequency\": 2}], \"sampleValues\": [\"2021-09-27\", \"2021-09-27\"]}, {\"fieldPath\": \"return_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 1, \"nullProportion\": 0.5, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", "contentType": "application/json" }, "systemMetadata": { @@ -684,7 +684,7 @@ "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"1\"}, {\"quantile\": \"0.25\", \"value\": \"1\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"2\"}, {\"quantile\": \"0.95\", \"value\": \"2\"}], \"histogram\": {\"boundaries\": [\"1.0\", \"1.5\", \"2.0\"], \"heights\": [0.0, 0.5, 0.5, 0.0]}, \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}", + "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 2, \"columnCount\": 2, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1\", \"max\": \"2\", \"mean\": \"1.5\", \"median\": \"2\", \"stdev\": \"0.7071067811865476\", \"sampleValues\": [\"1\", \"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 2, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Member 1\", \"Member 2\"]}]}", "contentType": "application/json" }, "systemMetadata": { @@ -698,7 +698,7 @@ "changeType": "UPSERT", "aspectName": "datasetProfile", "aspect": { - "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2\", \"max\": \"2\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"0.0\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"2\"}, {\"quantile\": \"0.25\", \"value\": \"2\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"2\"}, {\"quantile\": \"0.95\", \"value\": \"2\"}], \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2\", \"max\": \"2\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"0.0\", \"quantiles\": [{\"quantile\": \"0.05\", \"value\": \"2\"}, {\"quantile\": \"0.25\", \"value\": \"2\"}, {\"quantile\": \"0.5\", \"value\": \"2\"}, {\"quantile\": \"0.75\", \"value\": \"2\"}, {\"quantile\": \"0.95\", \"value\": \"2\"}], \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", + "value": "{\"timestampMillis\": 1632398400000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 1, \"columnCount\": 6, \"fieldProfiles\": [{\"fieldPath\": \"id\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2\", \"max\": \"2\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"0.0\", \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"name\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"Book 2\"]}, {\"fieldPath\": \"author\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"PQR\"]}, {\"fieldPath\": \"publisher\", \"uniqueCount\": 0, \"nullCount\": 1, \"nullProportion\": 1, \"sampleValues\": []}, {\"fieldPath\": \"member_id\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2\", \"max\": \"2\", \"mean\": \"2.0\", \"median\": \"2\", \"stdev\": \"0.0\", \"sampleValues\": [\"2\"]}, {\"fieldPath\": \"issue_date\", \"uniqueCount\": 1, \"uniqueProportion\": 1, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"2021-09-27\", \"max\": \"2021-09-27\", \"sampleValues\": [\"2021-09-27\"]}]}", "contentType": "application/json" }, "systemMetadata": {