diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py index cd9ecf8f9..d8a323727 100644 --- a/databuilder/extractor/neo4j_search_data_extractor.py +++ b/databuilder/extractor/neo4j_search_data_extractor.py @@ -24,18 +24,23 @@ class Neo4jSearchDataExtractor(Extractor): {publish_tag_filter} OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description) OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description) + OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description) + WITH db, cluster, schema, schema_description, table, table_description, + COLLECT(prog_descs.description) as programmatic_descriptions OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default' - WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, + COLLECT(DISTINCT tags.key) as tags OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge' - WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS - badges + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, + COLLECT(DISTINCT badges.key) as badges OPTIONAL MATCH (table)-[read:READ_BY]->(user:User) - WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS - total_usage, + WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges, + SUM(read.read_count) AS total_usage, COUNT(DISTINCT user.email) as unique_usage OPTIONAL MATCH (table)-[:COLUMN]->(col:Column) OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description) WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage, + programmatic_descriptions, COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp) RETURN db.name as database, cluster.name AS cluster, schema.name AS schema, @@ -47,7 +52,8 @@ class Neo4jSearchDataExtractor(Extractor): total_usage, unique_usage, tags, - badges + badges, + programmatic_descriptions ORDER BY table.name; """ ) diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py index f4db99ef1..6fc199b71 100644 --- a/databuilder/models/table_elasticsearch_document.py +++ b/databuilder/models/table_elasticsearch_document.py @@ -23,6 +23,7 @@ def __init__(self, badges=None, # type: Optional[List[str]] display_name=None, # type: Optional[str] schema_description=None, # type: Optional[str] + programmatic_descriptions=[], # type: List[str] ): # type: (...) -> None self.database = database @@ -42,3 +43,4 @@ def __init__(self, self.tags = tags self.badges = badges self.schema_description = schema_description + self.programmatic_descriptions = programmatic_descriptions diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py index 1e407ef2e..c53a237b6 100644 --- a/databuilder/publisher/elasticsearch_constants.py +++ b/databuilder/publisher/elasticsearch_constants.py @@ -81,6 +81,10 @@ }, "unique_usage": { "type": "long" + }, + "programmatic_descriptions": { + "type": "text", + "analyzer": "simple" } } } diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py index 7e06f686d..2e34c43d4 100644 --- a/tests/unit/extractor/test_neo4j_extractor.py +++ b/tests/unit/extractor/test_neo4j_extractor.py @@ -114,7 +114,8 @@ def test_extraction_with_model_class(self): unique_usage=5, tags=['hive'], badges=['badge1'], - schema_description='schema_description') + schema_description='schema_description', + programmatic_descriptions=['TEST']) extractor.results = [result_dict] result_obj = extractor.extract() diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py index fa6c80bf9..7b3e47a12 100644 --- a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py +++ b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py @@ -72,7 +72,8 @@ def test_loading_with_different_object(self): column_descriptions=['test_comment1', 'test_comment2'], total_usage=10, unique_usage=5, - tags=['test_tag1', 'test_tag2']) + tags=['test_tag1', 'test_tag2'], + programmatic_descriptions=['test']) with self.assertRaises(Exception) as context: loader.load(data) # type: ignore @@ -102,7 +103,8 @@ def test_loading_with_single_object(self): unique_usage=5, tags=['test_tag1', 'test_tag2'], badges=['badge1'], - schema_description='schema description') + schema_description='schema description', + programmatic_descriptions=['test']) loader.load(data) loader.close() @@ -112,7 +114,9 @@ def test_loading_with_single_object(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}') + '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", ' + '"programmatic_descriptions": ["test"], ' + '"badges": ["badge1"]}') ] self._check_results_helper(expected=expected) @@ -140,7 +144,8 @@ def test_loading_with_list_of_objects(self): unique_usage=5, tags=['test_tag1', 'test_tag2'], badges=['badge1'], - schema_description='schema_description')] * 5 + schema_description='schema_description', + programmatic_descriptions=['test'])] * 5 for d in data: loader.load(d) @@ -152,7 +157,9 @@ def test_loading_with_list_of_objects(self): '"column_names": ["test_col1", "test_col2"], "name": "test_table", ' '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", ' '"description": "test_description", "unique_usage": 5, "total_usage": 10, ' - '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}') + '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema_description", ' + '"programmatic_descriptions":["test"], ' + '"badges": ["badge1"]}') ] * 5 self._check_results_helper(expected=expected) diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py index 3291726f8..c51fe9d88 100644 --- a/tests/unit/models/test_table_elasticsearch_document.py +++ b/tests/unit/models/test_table_elasticsearch_document.py @@ -23,6 +23,7 @@ def test_to_json(self): total_usage=100, unique_usage=10, tags=['test'], + programmatic_descriptions=['test'], badges=['badge1'], schema_description='schema description') @@ -39,6 +40,7 @@ def test_to_json(self): "total_usage": 100, "unique_usage": 10, "tags": ["test"], + "programmatic_descriptions": ['test'], "badges": ["badge1"], 'schema_description': 'schema description' }