feat(ingest): Glue jobs #2687

Merged · 60 commits · Jun 22, 2021

Commits:
334a485
Update README
kevinhu Jun 9, 2021
8580c54
Merge branch 'master' of github.com:kevinhu/datahub into glue-etl
kevinhu Jun 9, 2021
499d308
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 10, 2021
520b099
Merge branch 'glue-etl' of github.com:kevinhu/datahub into glue-etl
kevinhu Jun 10, 2021
1941e93
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 10, 2021
5270fac
Read transformation DAGs
kevinhu Jun 11, 2021
67c0807
Extract node sources
kevinhu Jun 11, 2021
3ffdb1a
Init glue MCEs
kevinhu Jun 11, 2021
bdae7c1
Refactor job and flow wus
kevinhu Jun 11, 2021
4acc825
Resolve source and sink datasets
kevinhu Jun 11, 2021
68ed8e5
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 11, 2021
2fd692b
Set URNs correctly
kevinhu Jun 11, 2021
a0bc357
Isort and update snapshot JSONs
kevinhu Jun 11, 2021
e4b4d64
Successful ingestion
kevinhu Jun 11, 2021
7f0eb42
Refactor job listing
kevinhu Jun 11, 2021
fe63ce6
Glue ETL comments
kevinhu Jun 11, 2021
218338e
Clean up s3 naming
kevinhu Jun 11, 2021
5c1c9f2
Add job properties
kevinhu Jun 12, 2021
5e4873a
Fix lint errors
kevinhu Jun 14, 2021
4739367
Temp disable extract_transform in tests
kevinhu Jun 14, 2021
ce58f7b
Fix S3 URN
kevinhu Jun 14, 2021
a37b1db
Stubs for S3
kevinhu Jun 14, 2021
5050b8f
Fix lint errors
kevinhu Jun 14, 2021
ff682a8
Create Glue golden MCE json
kevinhu Jun 15, 2021
5144ffc
Trim Glue golden MCE
kevinhu Jun 15, 2021
ab419a4
Reapply freeze to Glue files
kevinhu Jun 15, 2021
ecd89a6
Fix golden path
kevinhu Jun 15, 2021
06d6c8a
Merge
kevinhu Jun 15, 2021
51ae22a
Fix duplicate MCEs
kevinhu Jun 15, 2021
108a203
Fix outputDatasets
kevinhu Jun 15, 2021
d9d8bc5
Remove S3 URIs
kevinhu Jun 15, 2021
e4a96b4
Expand job names
kevinhu Jun 15, 2021
db9d8ca
Expand job custom props
kevinhu Jun 15, 2021
7e6333b
Update golden
kevinhu Jun 15, 2021
4026917
Remove ownership classes
kevinhu Jun 15, 2021
c436149
Clean up redundant properties
kevinhu Jun 15, 2021
8619918
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 15, 2021
33dd0ea
Fix topological sort
kevinhu Jun 15, 2021
e0eaf0c
Fix S3 browse paths
kevinhu Jun 15, 2021
6f7b74a
Restore feast
kevinhu Jun 15, 2021
c738a74
Smaller stubs
kevinhu Jun 15, 2021
1d89927
Update README
kevinhu Jun 16, 2021
fe5b087
Resolve golden script conflict
kevinhu Jun 17, 2021
948dd24
Regenerate snapshot JSONs
kevinhu Jun 17, 2021
8a681cd
Merge
kevinhu Jun 17, 2021
95efa7e
Rebuild
kevinhu Jun 17, 2021
55e6619
Refactor node processing
kevinhu Jun 17, 2021
51bdb88
Add links to boto docs
kevinhu Jun 17, 2021
59bbf15
Fix sequence type error
kevinhu Jun 18, 2021
2a4501b
Fix Id typo
kevinhu Jun 18, 2021
8feb874
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 18, 2021
d173d6c
Types for process_dataflow_graph
kevinhu Jun 18, 2021
1b44baf
Include extension type in glue imports
kevinhu Jun 18, 2021
d80f529
S3 deduplication logic
kevinhu Jun 19, 2021
db1c638
Fix type annotation
kevinhu Jun 19, 2021
8eb3d50
Add comments for deduplication
kevinhu Jun 19, 2021
5b66074
Fix dataset IDs for Glue
kevinhu Jun 21, 2021
9955e06
Merge branch 'linkedin:master' into glue-etl
kevinhu Jun 21, 2021
6335f95
Update golden files
kevinhu Jun 21, 2021
09b9b60
Merge
kevinhu Jun 22, 2021
10 changes: 7 additions & 3 deletions datahub-web-react/src/utils/sort/topologicalSort.ts
@@ -6,6 +6,7 @@ function topologicalSortHelper(
explored: Set<string>,
result: Array<EntityRelationship>,
urnsArray: Array<string>,
nodes: Array<EntityRelationship>,
) {
if (!node.entity?.urn) {
return;
@@ -16,11 +17,14 @@
.filter((entity) => entity?.entity?.urn && urnsArray.includes(entity?.entity?.urn))
.forEach((n) => {
if (n?.entity?.urn && !explored.has(n?.entity?.urn)) {
topologicalSortHelper(n, explored, result, urnsArray);
topologicalSortHelper(n, explored, result, urnsArray, nodes);
}
});
if (urnsArray.includes(node?.entity?.urn)) {
result.push(node);
const fullyFetchedEntity = nodes.find((n) => n?.entity?.urn === node?.entity?.urn);
Collaborator: What's the rationale for this change?

Contributor Author: The previous implementation was causing a bug where nodes only had URNs and types specified, since they came from upstreamLineage – @gabe-lyons can elaborate!

if (fullyFetchedEntity) {
result.push(fullyFetchedEntity);
}
}
}

@@ -34,7 +38,7 @@ export function topologicalSort(input: Array<EntityRelationship | null>) {
.map((node) => node.entity?.urn) as Array<string>;
nodes.forEach((node) => {
if (node.entity?.urn && !explored.has(node.entity?.urn)) {
topologicalSortHelper(node, explored, result, urnsArray);
topologicalSortHelper(node, explored, result, urnsArray, nodes);
}
});

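The exchange above is easier to see with a minimal sketch of the failure mode (the EntityRelationship shape here is simplified and hypothetical, not the real generated GraphQL type): relationship entries coming from upstreamLineage carry only a urn and a type, so pushing them straight into the sort result drops the fields the UI needs; looking the URN up in the fully fetched nodes array restores the complete entity.

// Simplified, hypothetical shapes for illustration only.
interface Entity {
    urn: string;
    type: string;
    name?: string; // populated only on fully fetched entities
}
interface EntityRelationship {
    entity?: Entity;
}

// A node as it arrives via upstreamLineage: urn and type only.
const partial: EntityRelationship = {
    entity: { urn: 'urn:li:dataset:foo', type: 'DATASET' },
};

// The same entity as it appears in the fully fetched top-level nodes array.
const nodes: Array<EntityRelationship> = [
    { entity: { urn: 'urn:li:dataset:foo', type: 'DATASET', name: 'foo' } },
];

// Before the fix, result.push(partial) would lose `name`.
// After the fix, the fully fetched entity is pushed instead when one exists.
const fullyFetchedEntity = nodes.find((n) => n?.entity?.urn === partial.entity?.urn);
console.log(fullyFetchedEntity?.entity?.name); // => 'foo'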
4 changes: 2 additions & 2 deletions gms/api/src/main/pegasus/com/linkedin/datajob/DataJob.pdl
@@ -7,7 +7,7 @@ import com.linkedin.common.Status
import com.linkedin.common.GlobalTags

/**
* Metadata bout DataJob
* Metadata about DataJob
*/
record DataJob includes DataJobKey, ChangeAuditStamps {
/**
@@ -28,7 +28,7 @@ record DataJob includes DataJobKey, ChangeAuditStamps {
/**
* Input and output datasets of job
*/
inputOutput: optional DataJobInputOutput
inputOutput: optional DataJobInputOutput

/**
* Status information for the chart such as removed or not
(generated snapshot JSON; file name not captured)
@@ -158,7 +158,13 @@
"items" : "ChartDataSourceType"
},
"doc" : "Data sources for the chart",
"optional" : true
"optional" : true,
"Relationship" : {
"/*/string" : {
"entityTypes" : [ "dataset" ],
"name" : "Consumes"
}
}
}, {
"name" : "type",
"type" : {
(generated snapshot JSON; file name not captured)
@@ -378,7 +378,7 @@
"type" : "record",
"name" : "DataJob",
"namespace" : "com.linkedin.datajob",
"doc" : "Metadata bout DataJob",
"doc" : "Metadata about DataJob",
"include" : [ {
"type" : "record",
"name" : "DataJobKey",
@@ -438,9 +438,10 @@
"name" : "AzkabanJobType",
"namespace" : "com.linkedin.datajob.azkaban",
"doc" : "The various types of support azkaban jobs",
"symbols" : [ "COMMAND", "HADOOP_JAVA", "HADOOP_SHELL", "HIVE", "PIG", "SQL" ],
Collaborator: Let's not keep piling onto the AzkabanJobType. cc @jjoyce0510 is the plan still to add a free form string?

Collaborator: Yeah. This can either be a freeform string or another enum with a better name. I don't have a strong preference for adding a freeform string vs a better enum.
"symbols" : [ "COMMAND", "HADOOP_JAVA", "HADOOP_SHELL", "HIVE", "PIG", "SQL", "GLUE" ],
"symbolDocs" : {
"COMMAND" : "The command job type is one of the basic built-in types. It runs multiple UNIX commands using java processbuilder.\nUpon execution, Azkaban spawns off a process to run the command.",
"GLUE" : "Glue type is for running AWS Glue job transforms.",
"HADOOP_JAVA" : "Runs a java program with ability to access Hadoop cluster.\nhttps://azkaban.readthedocs.io/en/latest/jobTypes.html#java-job-type",
"HADOOP_SHELL" : "In large part, this is the same Command type. The difference is its ability to talk to a Hadoop cluster\nsecurely, via Hadoop tokens.",
"HIVE" : "Hive type is for running Hive jobs.",
(generated snapshot JSON; file name not captured)
@@ -158,7 +158,13 @@
"items" : "ChartDataSourceType"
},
"doc" : "Data sources for the chart",
"optional" : true
"optional" : true,
"Relationship" : {
"/*/string" : {
"entityTypes" : [ "dataset" ],
"name" : "Consumes"
}
}
}, {
"name" : "type",
"type" : {
@@ -1187,9 +1193,10 @@
"name" : "AzkabanJobType",
"namespace" : "com.linkedin.datajob.azkaban",
"doc" : "The various types of support azkaban jobs",
"symbols" : [ "COMMAND", "HADOOP_JAVA", "HADOOP_SHELL", "HIVE", "PIG", "SQL" ],
"symbols" : [ "COMMAND", "HADOOP_JAVA", "HADOOP_SHELL", "HIVE", "PIG", "SQL", "GLUE" ],
"symbolDocs" : {
"COMMAND" : "The command job type is one of the basic built-in types. It runs multiple UNIX commands using java processbuilder.\nUpon execution, Azkaban spawns off a process to run the command.",
"GLUE" : "Glue type is for running AWS Glue job transforms.",
"HADOOP_JAVA" : "Runs a java program with ability to access Hadoop cluster.\nhttps://azkaban.readthedocs.io/en/latest/jobTypes.html#java-job-type",
"HADOOP_SHELL" : "In large part, this is the same Command type. The difference is its ability to talk to a Hadoop cluster\nsecurely, via Hadoop tokens.",
"HIVE" : "Hive type is for running Hive jobs.",
4 changes: 2 additions & 2 deletions metadata-ingestion/README.md
@@ -485,11 +485,11 @@ source:
config:
aws_region: # aws_region_name, i.e. "eu-west-1"
env: # environment for the DatasetSnapshot URN, one of "DEV", "EI", "PROD" or "CORP". Defaults to "PROD".

# Filtering patterns for databases and tables to scan
database_pattern: # Optional, to filter databases scanned, same as schema_pattern above.
table_pattern: # Optional, to filter tables scanned, same as table_pattern above.

# Credentials. If not specified here, these are picked up according to boto3 rules.
# (see https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html)
aws_access_key_id: # Optional.
5 changes: 3 additions & 2 deletions metadata-ingestion/examples/recipes/glue_to_datahub.yml
@@ -1,9 +1,10 @@
source:
type: glue
config:
aws_region: "us-east-1"
aws_region: "us-west-2"
extract_transforms: true

sink:
type: "datahub-rest"
config:
server: 'http://localhost:8080'
server: "http://localhost:8080"
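For context, a recipe like the one above would normally be executed with the DataHub CLI after installing the relevant source and sink plugins; a minimal sketch, assuming the plugin extras are published under these names:

# Extras names are assumptions; check the metadata-ingestion README for the exact ones.
pip install 'acryl-datahub[glue,datahub-rest]'
datahub ingest -c metadata-ingestion/examples/recipes/glue_to_datahub.yml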
1 change: 1 addition & 0 deletions metadata-ingestion/scripts/update_golden_files.sh
@@ -13,6 +13,7 @@ cp tmp/test_mysql_ingest0/mysql_mces.json tests/integration/mysql/mysql_mce_gold
cp tmp/test_mssql_ingest0/mssql_mces.json tests/integration/sql_server/mssql_mce_golden.json
cp tmp/test_mongodb_ingest0/mongodb_mces.json tests/integration/mongodb/mongodb_mce_golden.json
cp tmp/test_feast_ingest0/feast_mces.json tests/integration/feast/feast_mce_golden.json
cp tmp/test_glue_ingest0/glue_mce.json tests/unit/glue/glue_mce_golden.json

# Print success message.
set +x