issue-297/Adding programmatic_descriptions to table search export (#198)

* Adding programmatic_descriptions to table search export
* Fixing tests from merge
* Rebasing from upstream master
* Fixing merge
* Optimizing the neo4j query
* Adding programmatic_descriptions to elasticsearch_constants.py
amundsen-io · Jun 3, 2020 · 8f18faf · 8f18faf
1 parent 2ac583c
commit 8f18faf
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 12 deletions.
diff --git a/databuilder/extractor/neo4j_search_data_extractor.py b/databuilder/extractor/neo4j_search_data_extractor.py
@@ -24,18 +24,23 @@ class Neo4jSearchDataExtractor(Extractor):
         {publish_tag_filter}
         OPTIONAL MATCH (table)-[:DESCRIPTION]->(table_description:Description)
         OPTIONAL MATCH (schema)-[:DESCRIPTION]->(schema_description:Description)
+        OPTIONAL MATCH (table)-[:DESCRIPTION]->(prog_descs:Programmatic_Description)
+        WITH db, cluster, schema, schema_description, table, table_description,
+        COLLECT(prog_descs.description) as programmatic_descriptions
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(tags:Tag) WHERE tags.tag_type='default'
-        WITH db, cluster, schema, schema_description, table, table_description, COLLECT(DISTINCT tags.key) as tags
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions,
+        COLLECT(DISTINCT tags.key) as tags
         OPTIONAL MATCH (table)-[:TAGGED_BY]->(badges:Tag) WHERE badges.tag_type='badge'
-        WITH db, cluster, schema, schema_description, table, table_description, tags, COLLECT(DISTINCT badges.key) AS
-        badges
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags,
+        COLLECT(DISTINCT badges.key) as badges
         OPTIONAL MATCH (table)-[read:READ_BY]->(user:User)
-        WITH db, cluster, schema, schema_description, table, table_description, tags, badges, SUM(read.read_count) AS
-        total_usage,
+        WITH db, cluster, schema, schema_description, table, table_description, programmatic_descriptions, tags, badges,
+        SUM(read.read_count) AS total_usage,
         COUNT(DISTINCT user.email) as unique_usage
         OPTIONAL MATCH (table)-[:COLUMN]->(col:Column)
         OPTIONAL MATCH (col)-[:DESCRIPTION]->(col_description:Description)
         WITH db, cluster, schema, schema_description, table, table_description, tags, badges, total_usage, unique_usage,
+        programmatic_descriptions,
         COLLECT(col.name) AS column_names, COLLECT(col_description.description) AS column_descriptions
         OPTIONAL MATCH (table)-[:LAST_UPDATED_AT]->(time_stamp:Timestamp)
         RETURN db.name as database, cluster.name AS cluster, schema.name AS schema,
@@ -47,7 +52,8 @@ class Neo4jSearchDataExtractor(Extractor):
         total_usage,
         unique_usage,
         tags,
-        badges
+        badges,
+        programmatic_descriptions
         ORDER BY table.name;
         """
     )

diff --git a/databuilder/models/table_elasticsearch_document.py b/databuilder/models/table_elasticsearch_document.py
@@ -23,6 +23,7 @@ def __init__(self,
                  badges=None,  # type: Optional[List[str]]
                  display_name=None,  # type: Optional[str]
                  schema_description=None,  # type: Optional[str]
+                 programmatic_descriptions=[],  # type: List[str]
                  ):
         # type: (...) -> None
         self.database = database
@@ -42,3 +43,4 @@ def __init__(self,
         self.tags = tags
         self.badges = badges
         self.schema_description = schema_description
+        self.programmatic_descriptions = programmatic_descriptions
diff --git a/databuilder/publisher/elasticsearch_constants.py b/databuilder/publisher/elasticsearch_constants.py
@@ -81,6 +81,10 @@
             },
             "unique_usage": {
               "type": "long"
+            },
+            "programmatic_descriptions": {
+              "type": "text",
+              "analyzer": "simple"
             }
           }
         }

diff --git a/tests/unit/extractor/test_neo4j_extractor.py b/tests/unit/extractor/test_neo4j_extractor.py
@@ -114,7 +114,8 @@ def test_extraction_with_model_class(self):
                                unique_usage=5,
                                tags=['hive'],
                                badges=['badge1'],
-                               schema_description='schema_description')
+                               schema_description='schema_description',
+                               programmatic_descriptions=['TEST'])
 
             extractor.results = [result_dict]
             result_obj = extractor.extract()

diff --git a/tests/unit/loader/test_file_system_elasticsearch_json_loader.py b/tests/unit/loader/test_file_system_elasticsearch_json_loader.py
@@ -72,7 +72,8 @@ def test_loading_with_different_object(self):
                     column_descriptions=['test_comment1', 'test_comment2'],
                     total_usage=10,
                     unique_usage=5,
-                    tags=['test_tag1', 'test_tag2'])
+                    tags=['test_tag1', 'test_tag2'],
+                    programmatic_descriptions=['test'])
 
         with self.assertRaises(Exception) as context:
             loader.load(data)  # type: ignore
@@ -102,7 +103,8 @@ def test_loading_with_single_object(self):
                                unique_usage=5,
                                tags=['test_tag1', 'test_tag2'],
                                badges=['badge1'],
-                               schema_description='schema description')
+                               schema_description='schema description',
+                               programmatic_descriptions=['test'])
         loader.load(data)
         loader.close()
 
@@ -112,7 +114,9 @@ def test_loading_with_single_object(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema description"}')
+             '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema description", '
+             '"programmatic_descriptions": ["test"], '
+             '"badges": ["badge1"]}')
         ]
 
         self._check_results_helper(expected=expected)
@@ -140,7 +144,8 @@ def test_loading_with_list_of_objects(self):
                                 unique_usage=5,
                                 tags=['test_tag1', 'test_tag2'],
                                 badges=['badge1'],
-                                schema_description='schema_description')] * 5
+                                schema_description='schema_description',
+                                programmatic_descriptions=['test'])] * 5
 
         for d in data:
             loader.load(d)
@@ -152,7 +157,9 @@ def test_loading_with_list_of_objects(self):
              '"column_names": ["test_col1", "test_col2"], "name": "test_table", '
              '"last_updated_timestamp": 123456789, "display_name": "test_schema.test_table", '
              '"description": "test_description", "unique_usage": 5, "total_usage": 10, '
-             '"tags": ["test_tag1", "test_tag2"], "badges": ["badge1"], "schema_description": "schema_description"}')
+             '"tags": ["test_tag1", "test_tag2"], "schema_description": "schema_description", '
+             '"programmatic_descriptions":["test"], '
+             '"badges": ["badge1"]}')
         ] * 5
 
         self._check_results_helper(expected=expected)
diff --git a/tests/unit/models/test_table_elasticsearch_document.py b/tests/unit/models/test_table_elasticsearch_document.py
@@ -23,6 +23,7 @@ def test_to_json(self):
                                    total_usage=100,
                                    unique_usage=10,
                                    tags=['test'],
+                                   programmatic_descriptions=['test'],
                                    badges=['badge1'],
                                    schema_description='schema description')
 
@@ -39,6 +40,7 @@ def test_to_json(self):
                                   "total_usage": 100,
                                   "unique_usage": 10,
                                   "tags": ["test"],
+                                  "programmatic_descriptions": ['test'],
                                   "badges": ["badge1"],
                                   'schema_description': 'schema description'
                                   }