Skip to content

Commit

Permalink
feat(ingest): update profiling to fetch configurable number of sample values (#6859)
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurinehate authored Dec 27, 2022
1 parent 10ea10c commit 69a2347
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,12 @@ def _get_dataset_column_sample_values(
self.dataset.set_config_value("interactive_evaluation", True)

res = self.dataset.expect_column_values_to_be_in_set(
column, [], result_format="SUMMARY"
column,
[],
result_format={
"result_format": "SUMMARY",
"partial_unexpected_count": self.config.field_sample_values_limit,
},
).result

column_profile.sampleValues = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ class GEProfilingConfig(ConfigModel):
default=True,
description="Whether to profile for the sample values for all columns.",
)
field_sample_values_limit: int = Field(
default=20,
description="Upper limit for number of sample values to collect for all columns.",
)

_allow_deny_patterns: AllowDenyPattern = pydantic.PrivateAttr(
default=AllowDenyPattern.allow_all(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@
"changeType": "UPSERT",
"aspectName": "datasetProfile",
"aspect": {
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 112, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 0.08928571428571429, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"10001\", \"max\": \"10010\", \"mean\": \"10005.3125\", \"median\": \"10005.0\", \"stdev\": \"2.834889609688869\", \"distinctValueFrequencies\": [{\"value\": \"10001\", \"frequency\": 17}, {\"value\": \"10002\", \"frequency\": 6}, {\"value\": \"10003\", \"frequency\": 7}, {\"value\": \"10004\", \"frequency\": 16}, {\"value\": \"10005\", \"frequency\": 13}, {\"value\": \"10006\", \"frequency\": 12}, {\"value\": \"10007\", \"frequency\": 14}, {\"value\": \"10008\", \"frequency\": 3}, {\"value\": \"10009\", \"frequency\": 18}, {\"value\": \"10010\", \"frequency\": 6}], \"sampleValues\": [\"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10002\", \"10002\", \"10002\"]}, {\"fieldPath\": \"salary\", \"uniqueCount\": 111, \"uniqueProportion\": 0.9910714285714286, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"40000\", \"max\": \"94692\", \"mean\": \"68303.11607142857\", \"median\": \"69544.0\", \"stdev\": \"15505.291475014095\", \"sampleValues\": [\"60117\", \"62102\", \"66074\", \"66596\", \"66961\", \"71046\", \"74333\", \"75286\", \"75994\", \"76884\", \"80013\", \"81025\", \"81097\", \"84917\", \"85112\", \"85097\", \"88958\", \"65909\", \"65909\", \"67534\"]}, {\"fieldPath\": \"from_date\", \"uniqueCount\": 106, \"uniqueProportion\": 0.9464285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"2002-06-22\", \"sampleValues\": [\"1986-06-26\", \"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", 
\"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\"]}, {\"fieldPath\": \"to_date\", \"uniqueCount\": 99, \"uniqueProportion\": 0.8839285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1986-02-18\", \"max\": \"9999-01-01\", \"sampleValues\": [\"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"9999-01-01\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\"]}]}",
"value": "{\"timestampMillis\": 1586847600000, \"partitionSpec\": {\"type\": \"FULL_TABLE\", \"partition\": \"FULL_TABLE_SNAPSHOT\"}, \"rowCount\": 112, \"columnCount\": 4, \"fieldProfiles\": [{\"fieldPath\": \"emp_no\", \"uniqueCount\": 10, \"uniqueProportion\": 0.08928571428571429, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"10001\", \"max\": \"10010\", \"mean\": \"10005.3125\", \"median\": \"10005.0\", \"stdev\": \"2.834889609688869\", \"distinctValueFrequencies\": [{\"value\": \"10001\", \"frequency\": 17}, {\"value\": \"10002\", \"frequency\": 6}, {\"value\": \"10003\", \"frequency\": 7}, {\"value\": \"10004\", \"frequency\": 16}, {\"value\": \"10005\", \"frequency\": 13}, {\"value\": \"10006\", \"frequency\": 12}, {\"value\": \"10007\", \"frequency\": 14}, {\"value\": \"10008\", \"frequency\": 3}, {\"value\": \"10009\", \"frequency\": 18}, {\"value\": \"10010\", \"frequency\": 6}], \"sampleValues\": [\"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10001\", \"10002\", \"10002\", \"10002\", \"10002\", \"10002\", \"10002\", \"10003\", \"10003\", \"10003\", \"10003\", \"10003\", \"10003\", \"10003\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10004\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10005\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10006\", \"10007\", \"10007\", \"10007\", \"10007\"]}, {\"fieldPath\": \"salary\", \"uniqueCount\": 111, \"uniqueProportion\": 0.9910714285714286, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"40000\", \"max\": \"94692\", \"mean\": \"68303.11607142857\", \"median\": \"69544.0\", 
\"stdev\": \"15505.291475014095\", \"sampleValues\": [\"60117\", \"62102\", \"66074\", \"66596\", \"66961\", \"71046\", \"74333\", \"75286\", \"75994\", \"76884\", \"80013\", \"81025\", \"81097\", \"84917\", \"85112\", \"85097\", \"88958\", \"65909\", \"65909\", \"67534\", \"69366\", \"71963\", \"72527\", \"40006\", \"43616\", \"43466\", \"43636\", \"43478\", \"43699\", \"43311\", \"40054\", \"42283\", \"42542\", \"46065\", \"48271\", \"50594\", \"52119\", \"54693\", \"58326\", \"60770\", \"62566\", \"64340\", \"67096\", \"69722\", \"70698\", \"74057\", \"78228\", \"82621\", \"83735\", \"85572\", \"85076\", \"86050\", \"88448\", \"88063\", \"89724\", \"90392\", \"90531\", \"91453\", \"94692\", \"40000\", \"42085\", \"42629\", \"45844\", \"47518\", \"47917\", \"52255\", \"53747\", \"56032\", \"58299\", \"60098\", \"59755\", \"56724\", \"60740\", \"62745\", \"63475\"]}, {\"fieldPath\": \"from_date\", \"uniqueCount\": 106, \"uniqueProportion\": 0.9464285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1985-02-18\", \"max\": \"2002-06-22\", \"sampleValues\": [\"1986-06-26\", \"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\", \"2000-08-02\", \"2001-08-02\", \"1995-12-03\", \"1996-12-02\", \"1997-12-02\", \"1998-12-02\", \"1999-12-02\", \"2000-12-01\", \"2001-12-01\", \"1986-12-01\", \"1987-12-01\", \"1988-11-30\", \"1989-11-30\", \"1990-11-30\", \"1991-11-30\", \"1992-11-29\", \"1993-11-29\", \"1994-11-29\", \"1995-11-29\", \"1996-11-28\", \"1997-11-28\", \"1998-11-28\", \"1999-11-28\", \"2000-11-27\", \"2001-11-27\", \"1989-09-12\", \"1990-09-12\", \"1991-09-12\", \"1992-09-11\", \"1993-09-11\", \"1994-09-11\", \"1995-09-11\", \"1996-09-10\", \"1997-09-10\", \"1998-09-10\", \"1999-09-10\", 
\"2000-09-09\", \"2001-09-09\", \"1990-08-05\", \"1991-08-05\", \"1992-08-04\", \"1993-08-04\", \"1994-08-04\", \"1995-08-04\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\", \"2000-08-02\", \"2001-08-02\", \"1989-02-10\", \"1990-02-10\", \"1991-02-10\", \"1992-02-10\"]}, {\"fieldPath\": \"to_date\", \"uniqueCount\": 99, \"uniqueProportion\": 0.8839285714285714, \"nullCount\": 0, \"nullProportion\": 0.0, \"min\": \"1986-02-18\", \"max\": \"9999-01-01\", \"sampleValues\": [\"1987-06-26\", \"1988-06-25\", \"1989-06-25\", \"1990-06-25\", \"1991-06-25\", \"1992-06-24\", \"1993-06-24\", \"1994-06-24\", \"1995-06-24\", \"1996-06-23\", \"1997-06-23\", \"1998-06-23\", \"1999-06-23\", \"2000-06-22\", \"2001-06-22\", \"2002-06-22\", \"9999-01-01\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\", \"2000-08-02\", \"2001-08-02\", \"9999-01-01\", \"1996-12-02\", \"1997-12-02\", \"1998-12-02\", \"1999-12-02\", \"2000-12-01\", \"2001-12-01\", \"9999-01-01\", \"1987-12-01\", \"1988-11-30\", \"1989-11-30\", \"1990-11-30\", \"1991-11-30\", \"1992-11-29\", \"1993-11-29\", \"1994-11-29\", \"1995-11-29\", \"1996-11-28\", \"1997-11-28\", \"1998-11-28\", \"1999-11-28\", \"2000-11-27\", \"2001-11-27\", \"9999-01-01\", \"1990-09-12\", \"1991-09-12\", \"1992-09-11\", \"1993-09-11\", \"1994-09-11\", \"1995-09-11\", \"1996-09-10\", \"1997-09-10\", \"1998-09-10\", \"1999-09-10\", \"2000-09-09\", \"2001-09-09\", \"9999-01-01\", \"1991-08-05\", \"1992-08-04\", \"1993-08-04\", \"1994-08-04\", \"1995-08-04\", \"1996-08-03\", \"1997-08-03\", \"1998-08-03\", \"1999-08-03\", \"2000-08-02\", \"2001-08-02\", \"9999-01-01\", \"1990-02-10\", \"1991-02-10\", \"1992-02-10\", \"1993-02-09\"]}]}",
"contentType": "application/json"
},
"systemMetadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ source:
include_field_distinct_value_frequencies: true
include_field_histogram: true
include_field_sample_values: true
field_sample_values_limit: 75
domain:
"urn:li:domain:sales":
allow:
Expand Down

0 comments on commit 69a2347

Please sign in to comment.