Skip to content

Commit

Permalink
issue-297/Adding programmatic_descriptions to table search export (#198)
Browse files Browse the repository at this point in the history
* Adding programmatic_descriptions to table search export

* fixing tests from merge

* Rebasing from upstream master

* fixing merge

* optimizing the neo4j query

* adding programmatic_descriptions to elasticsearch_constants.py
  • Loading branch information
samshuster authored Jun 3, 2020
1 parent 2ac583c commit 8f18faf
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 12 deletions.
18 changes: 12 additions & 6 deletions databuilder/extractor/neo4j_search_data_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,23 @@ class Neo4jSearchDataExtractor(Extractor):
{publish_tag_filter}
OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
WITH db, cluster, schema, schema_description, table, table_description,
COLLECT(prog_descs.description) as programmatic_descriptions
OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions,
COLLECT(DISTINCT tags.key) as tags
OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
badges
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,
COLLECT(DISTINCT badges.key) as badges
OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
total_usage,
WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges,
SUM(read.read_count) AS total_usage,
COUNT(DISTINCT user.email) as unique_usage
OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
programmatic_descriptions,
COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
Expand All @@ -47,7 +52,8 @@ class Neo4jSearchDataExtractor(Extractor):
total_usage,
unique_usage,
tags,
badges
badges,
programmatic_descriptions
ORDER BY table.name;
"""
)
Expand Down
2 changes: 2 additions & 0 deletions databuilder/models/table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def __init__(self,
badges=None, # type: Optional[List[str]]
display_name=None, # type: Optional[str]
schema_description=None, # type: Optional[str]
programmatic_descriptions=[], # type: List[str]
):
# type: (...) -> None
self.database = database
Expand All @@ -42,3 +43,4 @@ def __init__(self,
self.tags = tags
self.badges = badges
self.schema_description = schema_description
self.programmatic_descriptions = programmatic_descriptions
4 changes: 4 additions & 0 deletions databuilder/publisher/elasticsearch_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@
},
"unique_usage": {
"type": "long"
},
"programmatic_descriptions": {
"type": "text",
"analyzer": "simple"
}
}
}
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/extractor/test_neo4j_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,8 @@ def test_extraction_with_model_class(self):
unique_usage=5,
tags=['hive'],
badges=['badge1'],
schema_description='schema_description')
schema_description='schema_description',
programmatic_descriptions=['TEST'])

extractor.results = [result_dict]
result_obj = extractor.extract()
Expand Down
17 changes: 12 additions & 5 deletions tests/unit/loader/test_file_system_elasticsearch_json_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ def test_loading_with_different_object(self):
column_descriptions=['test_comment1', 'test_comment2'],
total_usage=10,
unique_usage=5,
tags=['test_tag1', 'test_tag2'])
tags=['test_tag1', 'test_tag2'],
programmatic_descriptions=['test'])

with self.assertRaises(Exception) as context:
loader.load(data) # type: ignore
Expand Down Expand Up @@ -102,7 +103,8 @@ def test_loading_with_single_object(self):
unique_usage=5,
tags=['test_tag1', 'test_tag2'],
badges=['badge1'],
schema_description='schema description')
schema_description='schema description',
programmatic_descriptions=['test'])
loader.load(data)
loader.close()

Expand All @@ -112,7 +114,9 @@ def test_loading_with_single_object(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}')
'"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", '
'"programmatic_descriptions": ["test"], '
'"badges": ["badge1"]}')
]

self._check_results_helper(expected=expected)
Expand Down Expand Up @@ -140,7 +144,8 @@ def test_loading_with_list_of_objects(self):
unique_usage=5,
tags=['test_tag1', 'test_tag2'],
badges=['badge1'],
schema_description='schema_description')] * 5
schema_description='schema_description',
programmatic_descriptions=['test'])] * 5

for d in data:
loader.load(d)
Expand All @@ -152,7 +157,9 @@ def test_loading_with_list_of_objects(self):
'"column_names": ["test_col1", "test_col2"], "name": "test_table", '
'"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
'"description": "test_description", "unique_usage": 5, "total_usage": 10, '
'"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}')
'"tags": ["test_tag1", "test_tag2"], "schema_description": "schema_description", '
'"programmatic_descriptions":["test"], '
'"badges": ["badge1"]}')
] * 5

self._check_results_helper(expected=expected)
2 changes: 2 additions & 0 deletions tests/unit/models/test_table_elasticsearch_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ def test_to_json(self):
total_usage=100,
unique_usage=10,
tags=['test'],
programmatic_descriptions=['test'],
badges=['badge1'],
schema_description='schema description')

Expand All @@ -39,6 +40,7 @@ def test_to_json(self):
"total_usage": 100,
"unique_usage": 10,
"tags": ["test"],
"programmatic_descriptions": ['test'],
"badges": ["badge1"],
'schema_description': 'schema description'
}
Expand Down

0 comments on commit 8f18faf

Please sign in to comment.