fix: add csv badges back in Quickstart (#418)

* fix: csv badges back in quickstart
  Signed-off-by: jornh <[email protected]>
* Adding noqa C901 because test breaks if I split code
  Signed-off-by: jornh <[email protected]>
* mypy
  Signed-off-by: jornh <[email protected]>
* cleanup
  Signed-off-by: jornh <[email protected]>
* test CsvTableColumnExtractor
  Signed-off-by: jornh <[email protected]>
* test CsvTableColumnExtractor
  Signed-off-by: jornh <[email protected]>
* Back out table level badges
  Signed-off-by: jornh <[email protected]>
* Update example/sample_data/sample_col.csv
  Signed-off-by: jornh <[email protected]>
* always have caller send a list of badge names
  Signed-off-by: jornh <[email protected]>
* back out unrelated
  Signed-off-by: jornh <[email protected]>
* back out unrelated CASE change in test
  Signed-off-by: jornh <[email protected]>
* __eq__ stricter typing the mypy way
  Signed-off-by: jornh <[email protected]>
amundsen-io · Dec 14, 2020 · c0296b7 · c0296b7
1 parent 453a18b
commit c0296b7
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 44 deletions.
diff --git a/databuilder/extractor/csv_extractor.py b/databuilder/extractor/csv_extractor.py
@@ -191,7 +191,8 @@ def _load_csv(self) -> None:
                 name=column_dict['name'],
                 description=column_dict['description'],
                 col_type=column_dict['col_type'],
-                sort_order=int(column_dict['sort_order'])
+                sort_order=int(column_dict['sort_order']),
+                badges=[column_dict['badges']]
             )
             parsed_columns[id].append(column)
 

diff --git a/databuilder/models/badge.py b/databuilder/models/badge.py
@@ -18,6 +18,12 @@ def __repr__(self) -> str:
         return 'Badge({!r}, {!r})'.format(self.name,
                                           self.category)
 
+    def __eq__(self, other: object) -> bool:
+        """
+        Compare two badges by value.
+
+        :param other: object to compare against
+        :return: True when `other` is a Badge with the same name and category;
+            NotImplemented for any other type so Python can fall back to the
+            reflected comparison
+        """
+        if not isinstance(other, Badge):
+            return NotImplemented
+        return (self.name, self.category) == (other.name, other.category)
+
+    def __hash__(self) -> int:
+        """
+        Hash on the same fields `__eq__` compares.
+
+        A class that defines `__eq__` without `__hash__` gets `__hash__ = None`,
+        which would make Badge unusable in sets and as a dict key. Equal badges
+        hash equal here.
+
+        :return: hash of the (name, category) pair
+        """
+        return hash((self.name, self.category))
+
 
 class BadgeMetadata(GraphSerializable):
     """
@@ -86,7 +92,7 @@ def get_metadata_model_key(self) -> str:
 
     def create_nodes(self) -> List[GraphNode]:
         """
-        Create a list of Neo4j node records
+        Create a list of `GraphNode` records
         :return:
         """
         results = []
@@ -103,7 +109,7 @@ def create_nodes(self) -> List[GraphNode]:
         return results
 
     def create_relation(self) -> List[GraphRelationship]:
-        results = []
+        results: List[GraphRelationship] = []
         for badge in self.badges:
             relation = GraphRelationship(
                 start_label=self.start_label,

diff --git a/databuilder/models/table_metadata.py b/databuilder/models/table_metadata.py
@@ -17,6 +17,16 @@
 DESCRIPTION_NODE_LABEL = DESCRIPTION_NODE_LABEL_VAL
 
 
+def _format_as_list(tags: Union[List, str, None]) -> List:
+    """
+    Normalize tag/badge names given as None, a comma separated string or a list.
+
+    Names are stripped of surrounding whitespace and lower-cased. Entries that
+    are None or blank after stripping are dropped, so an empty CSV cell wrapped
+    in a list (``['']``) yields no names instead of one empty-named entry.
+
+    :param tags: None, a comma separated string, or a list of names
+    :return: list of normalized names; a value of any other type is returned
+        unchanged
+    """
+    if tags is None:
+        return []
+    if isinstance(tags, str):
+        tags = tags.split(',')
+    if isinstance(tags, list):
+        # Strip before filtering so whitespace-only entries are dropped too
+        stripped = (tag.strip() for tag in tags if tag is not None)
+        return [tag.lower() for tag in stripped if tag]
+    return tags
+
+
 class TagMetadata(GraphSerializable):
     TAG_NODE_LABEL = 'Tag'
     TAG_KEY_FORMAT = '{tag}'
@@ -157,24 +167,23 @@ def __init__(self,
                  description: Union[str, None],
                  col_type: str,
                  sort_order: int,
-                 badges: Union[List[str], None] = None
+                 badges: Union[List[str], None] = None,
                  ) -> None:
         """
         TODO: Add stats
         :param name:
         :param description:
         :param col_type:
         :param sort_order:
+        :param badges: Optional. Column level badges
         """
         self.name = name
         self.description = DescriptionMetadata.create_description_metadata(source=None,
                                                                            text=description)
         self.type = col_type
         self.sort_order = sort_order
-        if badges:
-            self.badges = [Badge(badge, 'column') for badge in badges]
-        else:
-            self.badges = []
+        formatted_badges = _format_as_list(badges)
+        self.badges = [Badge(badge, 'column') for badge in formatted_badges]
 
     def __repr__(self) -> str:
         return 'ColumnMetadata({!r}, {!r}, {!r}, {!r}, {!r})'.format(self.name,
@@ -260,7 +269,7 @@ def __init__(self,
         self.is_view = is_view
         self.attrs: Optional[Dict[str, Any]] = None
 
-        self.tags = TableMetadata.format_tags(tags)
+        self.tags = _format_as_list(tags)
 
         if kwargs:
             self.attrs = copy.deepcopy(kwargs)
@@ -324,14 +333,7 @@ def _get_col_description_key(self,
 
     @staticmethod
     def format_tags(tags: Union[List, str, None]) -> List:
-        if tags is None:
-            tags = []
-        if isinstance(tags, str):
-            tags = list(filter(None, tags.split(',')))
-        if isinstance(tags, list):
-            tags = [tag.lower().strip() for tag in tags]
-
-        return tags
+        """
+        Normalize `tags` by delegating to the module level `_format_as_list`.
+
+        :param tags: None, a comma separated string, or a list of tag names
+        :return: the value returned by `_format_as_list(tags)`
+        """
+        return _format_as_list(tags)
 
     def create_next_node(self) -> Union[GraphNode, None]:
         try:
@@ -346,7 +348,7 @@ def _create_next_node(self) -> Iterator[GraphNode]:
             node_key = self._get_table_description_key(self.description)
             yield self.description.get_node(node_key)
 
-        # Create the table tag node
+        # Create the table tag nodes
         if self.tags:
             for tag in self.tags:
                 yield TagMetadata.create_tag_node(tag)
@@ -368,11 +370,11 @@ def _create_next_node(self) -> Iterator[GraphNode]:
                 yield col.description.get_node(node_key)
 
             if col.badges:
-                badge_metadata = BadgeMetadata(start_label=ColumnMetadata.COLUMN_NODE_LABEL,
-                                               start_key=self._get_col_key(col),
-                                               badges=col.badges)
-                badge_nodes = badge_metadata.create_nodes()
-                for node in badge_nodes:
+                col_badge_metadata = BadgeMetadata(
+                    start_label=ColumnMetadata.COLUMN_NODE_LABEL,
+                    start_key=self._get_col_key(col),
+                    badges=col.badges)
+                for node in col_badge_metadata.create_nodes():
                     yield node
 
         # Database, cluster, schema

diff --git a/example/sample_data/sample_col.csv b/example/sample_data/sample_col.csv
@@ -1,12 +1,12 @@
-name,description,col_type,sort_order,database,cluster,schema,table_name
-col1,"col1 description","string",1,hive,gold,test_schema,test_table1
-col2,"col2 description","string",2,hive,gold,test_schema,test_table1
-col3,"col3 description","string",3,hive,gold,test_schema,test_table1
-col4,"col4 description","string",4,hive,gold,test_schema,test_table1
-col5,"col5 description","float",5,hive,gold,test_schema,test_table1
-col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2
-col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2
-col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2
-col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2
-col1,"view col description","int",1,hive,gold,test_schema,test_view1
-col1,"col1 description","int",1,hive,gold,test_schema,test_table3
+name,description,col_type,sort_order,database,cluster,schema,table_name,badges
+col1,"col1 description","string",1,hive,gold,test_schema,test_table1,PK
+col2,"col2 description","string",2,hive,gold,test_schema,test_table1,PII
+col3,"col3 description","string",3,hive,gold,test_schema,test_table1,
+col4,"col4 description","string",4,hive,gold,test_schema,test_table1,
+col5,"col5 description","float",5,hive,gold,test_schema,test_table1,
+col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2,
+col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2,
+col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2,
+col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2,
+col1,"view col description","int",1,hive,gold,test_schema,test_view1,
+col1,"col1 description","int",1,hive,gold,test_schema,test_table3,
diff --git a/example/sample_data/sample_table.csv b/example/sample_data/sample_table.csv
@@ -1,6 +1,6 @@
 database,cluster,schema,name,description,tags,is_view,description_source
-hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false,
-dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false,
+hive,gold,test_schema,test_table1,"1st test table","tag1,tag2",false,
+dynamo,gold,test_schema,test_table2,"2nd test table",recommended,false,
 hive,gold,test_schema,test_view1,"1st test view","tag1",true,
 hive,gold,test_schema,test_table3,"3rd test","needs_documentation",false,
 hive,gold,test_schema,"test's_table4","4th test","needs_documentation",false,
diff --git a/tests/unit/extractor/test_csv_extractor.py b/tests/unit/extractor/test_csv_extractor.py
@@ -6,22 +6,21 @@
 from pyhocon import ConfigFactory
 
 from databuilder import Scoped
-from databuilder.extractor.csv_extractor import CsvExtractor
+from databuilder.extractor.csv_extractor import CsvExtractor, CsvTableColumnExtractor
+from databuilder.models.badge import Badge
 
 
 class TestCsvExtractor(unittest.TestCase):
 
-    def setUp(self) -> None:
+    def test_extraction_with_model_class(self) -> None:
+        """
+        Test Extraction using model class
+        """
         config_dict = {
             'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_table.csv',
             'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata',
         }
         self.conf = ConfigFactory.from_dict(config_dict)
-
-    def test_extraction_with_model_class(self) -> None:
-        """
-        Test Extraction using model class
-        """
         extractor = CsvExtractor()
         extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                               scope=extractor.get_scope()))
@@ -32,3 +31,34 @@ def test_extraction_with_model_class(self) -> None:
         self.assertEqual(result.database, 'hive')
         self.assertEqual(result.cluster, 'gold')
         self.assertEqual(result.schema, 'test_schema')
+        self.assertEqual(result.tags, ['tag1', 'tag2'])
+        self.assertEqual(result.is_view, 'false')
+
+        result2 = extractor.extract()
+        self.assertEqual(result2.name, 'test_table2')
+        self.assertEqual(result2.is_view, 'false')
+
+        result3 = extractor.extract()
+        self.assertEqual(result3.name, 'test_view1')
+        self.assertEqual(result3.is_view, 'true')
+
+    def test_extraction_of_tablecolumn_badges(self) -> None:
+        """
+        Test that column badges read from sample_col.csv end up on the columns
+        of the first table extracted by CsvTableColumnExtractor
+        """
+        table_key = f'extractor.csvtablecolumn.{CsvTableColumnExtractor.TABLE_FILE_LOCATION}'
+        column_key = f'extractor.csvtablecolumn.{CsvTableColumnExtractor.COLUMN_FILE_LOCATION}'
+        self.conf = ConfigFactory.from_dict({
+            table_key: 'example/sample_data/sample_table.csv',
+            column_key: 'example/sample_data/sample_col.csv',
+        })
+
+        extractor = CsvTableColumnExtractor()
+        scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
+                                             scope=extractor.get_scope())
+        extractor.init(scoped_conf)
+
+        table = extractor.extract()
+        self.assertEqual(table.name, 'test_table1')
+        # The CSV holds 'PK' / 'PII'; the expected badge names are lower-cased
+        self.assertEqual(table.columns[0].badges, [Badge('pk', 'column')])
+        self.assertEqual(table.columns[1].badges, [Badge('pii', 'column')])