From 69f945cefa1a00ac8a1b13064a3ef85c8a180cf1 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 10 Nov 2022 10:57:46 +0530 Subject: [PATCH 01/53] powerbi package --- .../src/datahub/ingestion/source/{ => powerbi}/powerbi.py | 2 ++ 1 file changed, 2 insertions(+) rename metadata-ingestion/src/datahub/ingestion/source/{ => powerbi}/powerbi.py (99%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py similarity index 99% rename from metadata-ingestion/src/datahub/ingestion/source/powerbi.py rename to metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 993e74a76f9ab..f709d20637e9e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -900,6 +900,8 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: # Scan is complete lets take the result scan_result = get_scan_result(scan_id=scan_id) LOGGER.debug("scan result = {}".format(scan_result)) + import json + print(json.dumps(scan_result, indent=1)) workspace = PowerBiAPI.Workspace( id=scan_result["id"], name=scan_result["name"], From d68230ba911d9dfc2dc636097c380e80bc223f74 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 14 Nov 2022 10:04:51 +0530 Subject: [PATCH 02/53] restructure powerbi --- .../ingestion/source/powerbi/__init__.py | 1 + .../source/powerbi/expression_parser.py | 79 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py new file mode 100644 index 0000000000000..85296f8b7a31e --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py @@ -0,0 +1 @@ +from powerbi import PowerBiDashboardSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py new file mode 100644 index 0000000000000..f5d437f6a23d2 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py @@ -0,0 +1,79 @@ +from abc import ABC, abstractmethod +from typing import Optional, List, Dict + + +class Token(ABC): + @abstractmethod + def parse_raw_token(self) -> str: + pass + + +class BaseToken(Token, ABC): + _raw_token: str + _nested_tokens: Optional[List["BaseToken"]] + + def __init__(self, raw_token: str, nested_tokens: Optional[List["BaseToken"]]): + self._raw_token = raw_token + self._nested_tokens = nested_tokens + self.parse_raw_token(self._raw_token) + + +class LetToken(BaseToken): + def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["Token"]]): + super().__init__(raw_token, nested_raw_tokens) + + def parse_raw_token(self) -> str: + pass + + +class TableFuncToken(BaseToken): + def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): + super().__init__(raw_token, nested_raw_tokens) + + def parse_raw_token(self) -> str: + pass + + +class DataAccessToken(BaseToken): + def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): + super().__init__(raw_token, nested_raw_tokens) + + def parse_raw_token(self) -> str: + pass + + +class 
OracleDataAccessToken(BaseToken): + def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): + super().__init__(raw_token, nested_raw_tokens) + + def parse_raw_token(self) -> str: + pass + + +class Step: + tokens: List[BaseToken] + def __init__(self, tokens: List[BaseToken]): + self.tokens = tokens + + +token_registry: Dict[str, BaseToken] = { + "let": LetToken, + "Table": TableFuncToken, + "PostgreSQL.Database": DataAccessToken, + "DB2.Database": DataAccessToken, + "Sql.Database": DataAccessToken, + "Oracle.Database": OracleDataAccessToken, +} + + +# identifier with space are not supported. +# This is one of the way to create identifier in M https://learn.microsoft.com/en-us/powerquery-m/expression-identifier +def parse_expression(expression: str) -> List[Step]: + strip_expression: str = expression.strip() + raw_token: str = "" + index: int = 0 + for c in strip_expression: + if c == ' ': + continue + + raw_token = raw_token + c From 51d68200c44d9db6d9314f674f93a95ef79832f7 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 16 Nov 2022 13:45:53 +0530 Subject: [PATCH 03/53] lexical rules --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/powerbi/__init__.py | 2 +- .../{expression_parser.py => m_parser.py} | 15 +- .../powerbi/powerbi-lexical-grammar.rule | 551 ++++++++++++++++++ .../integration/powerbi/test_m_parser.py | 5 + 5 files changed, 563 insertions(+), 12 deletions(-) rename metadata-ingestion/src/datahub/ingestion/source/powerbi/{expression_parser.py => m_parser.py} (83%) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule create mode 100644 metadata-ingestion/tests/integration/powerbi/test_m_parser.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index c81ac28577ff3..4e61fdf436cbb 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -325,7 +325,7 @@ def get_long_description(): "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging"}, - "powerbi": microsoft_common, + "powerbi": microsoft_common | {"lark[regex]==1.1.4"}, "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"sqlalchemy-vertica[vertica-python]==0.0.5"}, "unity-catalog": databricks_cli | {"requests"}, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py index 85296f8b7a31e..1068f335e8f8e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/__init__.py @@ -1 +1 @@ -from powerbi import PowerBiDashboardSource +from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py similarity index 83% rename from metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py rename to metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index f5d437f6a23d2..7ebfd9a2e3966 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/expression_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,5 +1,7 @@ from abc import ABC, abstractmethod from typing import Optional, List, Dict +import importlib.resources as pkg_resource +from lark import Lark class Token(ABC): @@ -66,14 
+68,7 @@ def __init__(self, tokens: List[BaseToken]): } -# identifier with space are not supported. -# This is one of the way to create identifier in M https://learn.microsoft.com/en-us/powerquery-m/expression-identifier def parse_expression(expression: str) -> List[Step]: - strip_expression: str = expression.strip() - raw_token: str = "" - index: int = 0 - for c in strip_expression: - if c == ' ': - continue - - raw_token = raw_token + c + grammar: str = pkg_resource.read_text("datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule") + lark_parser = Lark(grammar, start="let_expression", regex=True) + print(lark_parser.parse(expression).pretty()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule new file mode 100644 index 0000000000000..4ca73fb4625c8 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -0,0 +1,551 @@ +lexical_unit: lexical_elements? + +lexical_elements: lexical_element + | lexical_elements? + +lexical_element: whitespace + | token comment + +whitespace: WS + | new_line_character + +new_line_character: CR + | LF + | NEWLINE + +comment: single_line_comment + | delimited_comment + + +single_line_comment: single_line_comment_characters? + +single_line_comment_characters: single_line_comment_character + | single_line_comment_characters? + +single_line_comment_character: CPP_COMMENT + +delimited_comment: C_COMMENT + +asterisks: "*" + | asterisks? + +token: identifier + | keyword + | literal + | operator_or_punctuator + +character_escape_sequence: "#(" escape_sequence_list ")" + +escape_sequence_list: single_escape_sequence + | escape_sequence_list "," single_escape_sequence + +single_escape_sequence: long_unicode_escape_sequence + | short_unicode_escape_sequence + | control_character_escape_sequence + | escape_escape + +long_unicode_escape_sequence: hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit hex_digit + +short_unicode_escape_sequence: hex_digit hex_digit hex_digit hex_digit + +control_character_escape_sequence: control_character + +control_character: CR + | LF + | /\t/ + +escape_escape: "#" + +literal: logical_literal + | number_literal + | text_literal + | null_literal + | verbatim_literal + +logical_literal: "true" + | "false" + +number_literal: decimal_number_literal + | hexadecimal_number_literal + +decimal_digits: decimal_digit + | decimal_digits? + +decimal_digit: DIGIT + +hexadecimal_number_literal: "0x" hex_digits + | "0X" hex_digits + +hex_digits: hex_digit + | hex_digits? + +hex_digit: HEXDIGIT + +decimal_number_literal: decimal_digits "." decimal_digits exponent_part? + | decimal_digits exponent_part? + | decimal_digits exponent_part? + +exponent_part: "e" sign? decimal_digits + | "E" sign? decimal_digits + +sign: ["+"|"-"] + +text_literal: ESCAPED_STRING + +text_literal_characters: text_literal_character + | text_literal_characters? + +text_literal_character: single_text_character + | character_escape_sequence + | double_quote_escape_sequence + +single_text_character: /./ + | /[^#]/ + +double_quote_escape_sequence: "\"\"" + +null_literal: "null" + +verbatim_literal: "#!\"" text_literal_characters? 
"\"" + +identifier: regular_identifier + | quoted_identifier + +regular_identifier: available_identifier + | available_identifier dot_character regular_identifier + +available_identifier: keyword_or_identifier + +keyword_or_identifier: letter_character + | underscore_character + | identifier_start_character identifier_part_characters + +identifier_start_character: letter_character + | underscore_character + +identifier_part_characters: identifier_part_character identifier_part_characters? + +identifier_part_character: letter_character + | decimal_digit_character + | underscore_character + | connecting_character + | combining_character + | formatting_character + +generalized_identifier: generalized_identifier_part + | generalized_identifier WS_INLINE generalized_identifier_part + +generalized_identifier_part: generalized_identifier_segment + | decimal_digit_character generalized_identifier_segment + +generalized_identifier_segment: keyword_or_identifier + | keyword_or_identifier dot_character keyword_or_identifier + +dot_character: "." + +underscore_character: "_" + +letter_character: /[_\-\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}]+/ + +combining_character: /[_\p{Mn}\p{Mc}]+/ + +decimal_digit_character: /[\p{Nd}]+/ + +connecting_character: /[\p{Pc}]+/ + +formatting_character: /[\p{Cf}]+/ + +quoted_identifier: "#" "\"" text_literal_characters? "\"" + +keyword: "and" + | "as" + | "each" + | "else" + | "error" + | "false" + | "if" + | "in" + | "is" + | "let" + | "meta" + | "not" + | "null" + | "or" + | "otherwise" + | "section" + | "shared" + | "then" + | "true" + | "try" + | "type" + | "#binary" + | "#date" + | "#datetime" + | "#datetimezone" + | "#duration" + | "#infinity" + | "#nan" + | "#sections" + | "#shared" + | "#table" + | "#time" + + +operator_or_punctuator: "," + | ";" + | "=" + | "<" + | "<=" + | ">" + | ">=" + | "<>" + | "+" + | "_" + | "*" + | "/" + | "&" + | "(" + | ")" + | "[" + | "]" + | "{" + | "}" + | "@" + | "?" + | "??" + | "=>" + | ".." + | "..." + +document: section_document + | expression_document + +section_document: section + +section: literal_attributes? + | section + | section_name ";" section_members? + +section_name: identifier + +section_members: section_member + | section_members? + +section_member: literal_attributes? + | "shared"? + | section_member_name "=" expression ";" + +section_member_name: identifier + +expression_document: expression + +expression: logical_or_expression + | each_expression + | function_expression + | let_expression + | if_expression + | error_raising_expression + | error_handling_expression + +logical_or_expression: logical_and_expression + | logical_and_expression "or" logical_or_expression + +logical_and_expression: is_expression + | logical_and_expression "and" is_expression + +is_expression: as_expression + | is_expression "is" nullable_primitive_type + +nullable_primitive_type: "nullable"? 
primitive_type + +as_expression: equality_expression + | as_expression "as" nullable_primitive_type + +equality_expression: relational_expression + | relational_expression "=" equality_expression + | relational_expression "<>" equality_expression + +relational_expression: additive_expression + | additive_expression "<" relational_expression + | additive_expression ">" relational_expression + | additive_expression "<=" relational_expression + | additive_expression ">=" relational_expression + +additive_expression: multiplicative_expression + | multiplicative_expression "+" additive_expression + | multiplicative_expression "_" additive_expression + | multiplicative_expression "&" "_" additive_expression + +multiplicative_expression: metadata_expression + | metadata_expression "*" multiplicative_expression + | metadata_expression "/" multiplicative_expression + +metadata_expression: unary_expression + | unary_expression + | "meta" + | unary_expression + +unary_expression: type_expression + | "+" unary_expression + | "_" unary_expression + | "not" unary_expression + +primary_expression: literal_expression + | list_expression + | record_expression + | identifier_expression + | section_access_expression + | parenthesized_expression + | field_access_expression + | item_access_expression + | invoke_expression + | not_implemented_expression + +literal_expression: literal + +identifier_expression: identifier_reference + +identifier_reference: exclusive_identifier_reference + | inclusive_identifier_reference + +exclusive_identifier_reference: identifier + +inclusive_identifier_reference: "@" identifier + +section_access_expression: identifier "!" identifier + +parenthesized_expression: "(" WS_INLINE? expression WS_INLINE? ")" + +not_implemented_expression: "..." + +invoke_expression: primary_expression WS_INLINE? "(" WS_INLINE? argument_list? WS_INLINE? ")" + +argument_list: expression + | expression "," argument_list + | "\"" identifier "\"" + | "\"" identifier "\"" "," argument_list + + +list_expression: "{" item_list? "}" + +item_list: item + | item "," item_list + +item: expression + | expression ".." expression + +record_expression: "[" field_list? "]" + +field_list: field + | field "," field_list + +field: field_name WS_INLINE? "=" WS_INLINE? expression + +field_name: generalized_identifier + | quoted_identifier + +item_access_expression: item_selection + | optional_item_selection + +item_selection: primary_expression "{" item_selector "}" + +optional_item_selection: primary_expression "{" item_selector "}" "?" + +item_selector: expression + +field_access_expression: field_selection + | implicit_target_field_selection + | projection + | implicit_target_projection + +field_selection: primary_expression field_selector + +field_selector: required_field_selector + | optional_field_selector + +required_field_selector: "[" WS_INLINE? field_name WS_INLINE? "]" + +optional_field_selector: "[" WS_INLINE? field_name WS_INLINE? "]" "?" + +implicit_target_field_selection: field_selector + +projection: primary_expression required_projection + | primary_expression optional_projection + +required_projection: "[" WS_INLINE? required_selector_list WS_INLINE? "]" + +optional_projection: "[" WS_INLINE? required_selector_list WS_INLINE? "]" "?" + +required_selector_list: required_field_selector + | required_field_selector "," required_selector_list + +implicit_target_projection: required_projection + | optional_projection + +function_expression: "(" WS_INLINE? parameter_list? WS_INLINE? ")" WS_INLINE return_type? 
"=>" function_body + +function_body: expression + +parameter_list: fixed_parameter_list + | fixed_parameter_list "," optional_parameter_list + | optional_parameter_list + +fixed_parameter_list: parameter + | parameter "," fixed_parameter_list + +parameter: parameter_name parameter_type? + +parameter_name: identifier + +parameter_type: assertion + +return_type: assertion + +assertion: "as" WS_INLINE nullable_primitive_type + +optional_parameter_list: optional_parameter + | optional_parameter "," optional_parameter_list + +optional_parameter: "optional" WS_INLINE parameter + +each_expression: "each" WS_INLINE each_expression_body + +each_expression_body: function_body + +let_expression: "let" NEWLINE WS_INLINE? variable_list WS_INLINE? NEWLINE? in_expression + +in_expression: "in" NEWLINE? WS_INLINE NEWLINE? expression + +variable_list: variable + | variable NEWLINE? WS_INLINE? "," NEWLINE? WS_INLINE? variable_list + +variable: variable_name WS_INLINE? "=" WS_INLINE? expression + +variable_name: identifier + +if_expression: "if" WS_INLINE if_condition WS_INLINE NEWLINE? "then" WS_INLINE true_expression WS_INLINE "else" WS_INLINE false_expression + +if_condition: expression + +true_expression: expression + +false_expression: expression + +type_expression: primary_expression + | "type" WS_INLINE primary_type + +type: parenthesized_expression + | primary_type + +primary_type: primitive_type + | record_type + | list_type + | function_type + | table_type + | nullable_type + +primitive_type: "any" + | "anynonnull" + | "binary" + | "date" + | "datetime" + | "datetimezone" + | "duration" + | "function" + | "list" + | "logical" + | "none" + | "null" + | "number" + | "record" + | "table" + | "text" + | "time" + | "type" + +record_type: "[" WS_INLINE? open_record_marker WS_INLINE? "]" + | "[" WS_INLINE? field_specification_list? WS_INLINE "]" + | "[" WS_INLINE? field_specification_list WS_INLINE "," WS_INLINE? open_record_marker WS_INLINE? "]" + +field_specification_list: field_specification + | field_specification WS_INLINE? "," WS_INLINE? field_specification_list + +field_specification: "optional"? WS_INLINE? field_name WS_INLINE field_type_specification? + +field_type_specification: "=" WS_INLINE field_type + +field_type: type + +open_record_marker: "..." + +list_type: "{" WS_INLINE? item_type WS_INLINE? "}" + +item_type: type + +function_type: "function" WS_INLINE? "(" WS_INLINE? parameter_specification_list? WS_INLINE? ")" WS_INLINE? return_type + +parameter_specification_list: required_parameter_specification_list + | required_parameter_specification_list WS_INLINE? "," WS_INLINE? optional_parameter_specification_list + | optional_parameter_specification_list + +required_parameter_specification_list: required_parameter_specification + | required_parameter_specification WS_INLINE? "," WS_INLINE? required_parameter_specification_list + +required_parameter_specification: parameter_specification + +optional_parameter_specification_list: optional_parameter_specification + | optional_parameter_specification WS_INLINE? "," WS_INLINE? optional_parameter_specification_list + +optional_parameter_specification: "optional" parameter_specification + +parameter_specification: parameter_name WS_INLINE parameter_type + +table_type: "table" WS_INLINE row_type + +row_type: "[" WS_INLINE? field_specification_list? WS_INLINE? 
"]" + +nullable_type: "nullable" WS_INLINE type + +error_raising_expression: "error" WS_INLINE expression "_" + +error_handling_expression: "try" WS_INLINE protected_expression WS_INLINE otherwise_clause? + +protected_expression: expression + +otherwise_clause: "otherwise" WS_INLINE default_expression + +default_expression: expression + +literal_attributes: record_literal + +record_literal: "[" WS_INLINE? literal_field_list? WS_INLINE? "]" + +literal_field_list: literal_field + | literal_field WS_INLINE? "," WS_INLINE? literal_field_list + +literal_field: field_name WS_INLINE? "=" WS_INLINE? any_literal + +list_literal: "{" WS_INLINE? literal_item_list? WS_INLINE? "}" + +literal_item_list: any_literal + | any_literal WS_INLINE? "," WS_INLINE? literal_item_list + +any_literal: record_literal + | list_literal + | logical_literal + | number_literal + | text_literal + | null_literal + + +%import common.WORD +%import common.WS_INLINE +%import common.CPP_COMMENT +%import common.C_COMMENT +%import common.WS +%import common.NEWLINE +%import common.HEXDIGIT +%import common.DIGIT +%import common.LF +%import common.CR +%import common.ESCAPED_STRING \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py new file mode 100644 index 0000000000000..9642d9a849dd9 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -0,0 +1,5 @@ +from datahub.ingestion.source.powerbi import m_parser + +def test_parse_m_query(): + expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table" + m_parser.parse_expression(expression) From 24e0ba9fb142c7284ff700b4285e4903e98b5e46 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 16 Nov 2022 16:07:25 +0530 Subject: [PATCH 04/53] 12 expression test case --- .../ingestion/source/powerbi/m_parser.py | 2 +- .../integration/powerbi/test_m_parser.py | 58 ++++++++++++++++++- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 7ebfd9a2e3966..459201b55d853 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -70,5 +70,5 @@ def __init__(self, tokens: List[BaseToken]): def parse_expression(expression: str) -> List[Step]: grammar: str = pkg_resource.read_text("datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule") - lark_parser = Lark(grammar, start="let_expression", regex=True) + lark_parser = Lark(grammar, start="let_expression", parser="lalr", regex=True) print(lark_parser.parse(expression).pretty()) diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 9642d9a849dd9..d413cd0c3d7e5 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,5 +1,61 @@ from datahub.ingestion.source.powerbi import m_parser -def test_parse_m_query(): + +def 
test_parse_m_query1(): expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table" m_parser.parse_expression(expression) + + +def test_parse_m_query2(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"" + m_parser.parse_expression(expression) + + +# def test_parse_m_query3(): +# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Has PS Software Quota?\", each if [TIER] = \"Expansion (Medium)\" then \"Yes\" else if [TIER] = \"Acquisition\" then \"Yes\" else \"No\")\nin\n #\"Added Conditional Column\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query4(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022\"]),\n #\"Added Custom\" = Table.AddColumn(Source, \"OIP in $(*$350)\", each 
[SALES_INVOICE_AMOUNT] * 350),\n #\"Changed Type\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"OIP in $(*$350)\", type number}})\nin\n #\"Changed Type\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query5(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"CLIENT_ID\", Int64.Type}}),\n #\"Added Conditional Column\" = Table.AddColumn(#\"Changed Type\", \"PS Software (One-Off)\", each if Text.Contains([REVENUE_TYPE], \"Software\") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], \"Tax Seminar\") then [Inv_Amt] else 0),\n #\"Filtered Rows\" = Table.SelectRows(#\"Added Conditional Column\", each true),\n #\"Duplicated Column\" = Table.DuplicateColumn(#\"Filtered Rows\", \"CLIENT_ID\", \"CLIENT_ID - Copy\"),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Duplicated Column\",{{\"CLIENT_ID - Copy\", type text}}),\n #\"Renamed Columns\" = Table.RenameColumns(#\"Changed Type1\",{{\"CLIENT_ID - Copy\", \"CLIENT_ID for Filter\"}})\nin\n #\"Renamed Columns\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query6(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_DATE\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([MONTH_DATE]))\nin\n #\"Added Custom\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query7(): +# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query8(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", 
each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query9(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_WID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"MONTH_DATE\", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & \"-\" &\nText.Range([MONTH_WID], 4,2)\n)),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom\", \"Month\", each Date.Month([MONTH_DATE])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom2\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query10(): +# expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"OPERATIONS_ANALYTICS_WAREHOUSE_PROD\",[Role=\"OPERATIONS_ANALYTICS_MEMBER_AD\"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name=\"OPERATIONS_ANALYTICS\",Kind=\"Database\"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name=\"LZ_MIGRATION_DOWNLOAD\",Kind=\"View\"]}[Data],\n #\"Changed Type\" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{\"MIGRATION_MONTH_ID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Migration Month\", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & \"-\" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"Migration Month\", type date}})\nin\n #\"Changed Type1\"" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query11(): +# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, 
[EnableFolding=true])\nin\n Source" +# m_parser.parse_expression(expression) +# +# +# def test_parse_m_query12(): +# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,'-',''))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = 'Software' and (NOT(PRODUCT in ('ADV', 'Adv') and left(ACCOUNT_ID,2)='10') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 'Manual Adjustment') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN ('Recurring','0') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Services\", each if [CLASS] = \"Services\" then [INVOICE_AMOUNT] else 0),\n #\"Added Custom\" = Table.AddColumn(#\"Added Conditional Column\", \"Advanced New Sites\", each if [PRODUCT] = \"ADV\"\nor [PRODUCT] = \"Adv\"\nthen [NEW_SITE]\nelse 0)\nin\n #\"Added Custom\"" +# m_parser.parse_expression(expression) From c539b086fbc7128ab525632b7915a9693eff4cba Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 17 Nov 2022 15:40:14 +0530 Subject: [PATCH 05/53] 12 M query expression parsed --- .../ingestion/source/powerbi/m_parser.py | 2 +- .../powerbi/powerbi-lexical-grammar.rule | 56 ++++++---- .../integration/powerbi/test_m_parser.py | 104 +++++++++--------- 3 files changed, 89 insertions(+), 73 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 459201b55d853..73b24b176c009 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -70,5 +70,5 @@ def __init__(self, tokens: List[BaseToken]): def parse_expression(expression: str) -> List[Step]: grammar: str = pkg_resource.read_text("datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule") - lark_parser = Lark(grammar, start="let_expression", parser="lalr", regex=True) + lark_parser = Lark(grammar, start="let_expression", regex=True, debug=True) print(lark_parser.parse(expression).pretty()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule index 4ca73fb4625c8..2f84d2cf6365f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -71,7 +71,7 @@ number_literal: decimal_number_literal decimal_digits: decimal_digit | decimal_digits? -decimal_digit: DIGIT +decimal_digit: /\d+/ hexadecimal_number_literal: "0x" hex_digits | "0X" hex_digits @@ -81,7 +81,8 @@ hex_digits: hex_digit hex_digit: HEXDIGIT -decimal_number_literal: decimal_digits "." decimal_digits exponent_part? +decimal_number_literal: decimal_digits + | decimal_digits "." decimal_digits exponent_part? | decimal_digits exponent_part? | decimal_digits exponent_part? 
@@ -100,7 +101,7 @@ text_literal_character: single_text_character | double_quote_escape_sequence single_text_character: /./ - | /[^#]/ + | /[^#]/ double_quote_escape_sequence: "\"\"" @@ -155,7 +156,7 @@ connecting_character: /[\p{Pc}]+/ formatting_character: /[\p{Cf}]+/ -quoted_identifier: "#" "\"" text_literal_characters? "\"" +quoted_identifier: "#" ESCAPED_STRING keyword: "and" | "as" @@ -250,20 +251,25 @@ expression: logical_or_expression logical_or_expression: logical_and_expression | logical_and_expression "or" logical_or_expression -logical_and_expression: is_expression - | logical_and_expression "and" is_expression +logical_and_expression: WS_INLINE? NEWLINE? is_expression WS_INLINE? NEWLINE? + | WS_INLINE? + | NEWLINE? + | logical_and_expression WS_INLINE? "and" WS_INLINE? is_expression is_expression: as_expression + | WS_INLINE? + | NEWLINE? | is_expression "is" nullable_primitive_type nullable_primitive_type: "nullable"? primitive_type -as_expression: equality_expression +as_expression: WS_INLINE? equality_expression | as_expression "as" nullable_primitive_type + | WS_INLINE? multiplicative_expression -equality_expression: relational_expression - | relational_expression "=" equality_expression - | relational_expression "<>" equality_expression +equality_expression: WS_INLINE? relational_expression + | WS_INLINE? relational_expression WS_INLINE? "=" WS_INLINE? equality_expression + | relational_expression WS_INLINE? "<>" WS_INLINE? equality_expression relational_expression: additive_expression | additive_expression "<" relational_expression @@ -273,11 +279,12 @@ relational_expression: additive_expression additive_expression: multiplicative_expression | multiplicative_expression "+" additive_expression - | multiplicative_expression "_" additive_expression - | multiplicative_expression "&" "_" additive_expression + | multiplicative_expression WS_INLINE? "_" WS_INLINE? additive_expression + | multiplicative_expression WS_INLINE? NEWLINE? WS_INLINE? "&" WS_INLINE? NEWLINE? WS_INLINE? additive_expression -multiplicative_expression: metadata_expression - | metadata_expression "*" multiplicative_expression + +multiplicative_expression: WS_INLINE? metadata_expression + | metadata_expression WS_INLINE? "*" WS_INLINE? multiplicative_expression | metadata_expression "/" multiplicative_expression metadata_expression: unary_expression @@ -318,12 +325,19 @@ parenthesized_expression: "(" WS_INLINE? expression WS_INLINE? ")" not_implemented_expression: "..." -invoke_expression: primary_expression WS_INLINE? "(" WS_INLINE? argument_list? WS_INLINE? ")" +invoke_expression: "#"? primary_expression "(" NEWLINE? argument_list? NEWLINE? ")" -argument_list: expression - | expression "," argument_list +argument_list: WS_INLINE? expression + | WS_INLINE? expression WS_INLINE? "," WS_INLINE? argument_list | "\"" identifier "\"" | "\"" identifier "\"" "," argument_list + | WS_INLINE + | WS_INLINE? ESCAPED_STRING + | WS_INLINE? ESCAPED_STRING "," argument_list + | WS_INLINE? record_literal + | WS_INLINE? record_literal "," argument_list + | WS_INLINE? null_literal + | WS_INLINE? null_literal "," argument_list list_expression: "{" item_list? "}" @@ -423,11 +437,12 @@ variable: variable_name WS_INLINE? "=" WS_INLINE? expression variable_name: identifier -if_expression: "if" WS_INLINE if_condition WS_INLINE NEWLINE? "then" WS_INLINE true_expression WS_INLINE "else" WS_INLINE false_expression +if_expression: "if" WS_INLINE if_condition WS_INLINE? NEWLINE? WS_INLINE? "then" WS_INLINE? NEWLINE? 
true_expression WS_INLINE? NEWLINE? "else" WS_INLINE false_expression if_condition: expression -true_expression: expression +true_expression: NEWLINE? WS_INLINE? expression + | multiplicative_expression false_expression: expression @@ -523,7 +538,8 @@ record_literal: "[" WS_INLINE? literal_field_list? WS_INLINE? "]" literal_field_list: literal_field | literal_field WS_INLINE? "," WS_INLINE? literal_field_list -literal_field: field_name WS_INLINE? "=" WS_INLINE? any_literal +literal_field: field_name WS_INLINE? "=" WS_INLINE? any_literal + | field_name WS_INLINE? "=" WS_INLINE? invoke_expression list_literal: "{" WS_INLINE? literal_item_list? WS_INLINE? "}" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index d413cd0c3d7e5..62d6d6d02043f 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -7,55 +7,55 @@ def test_parse_m_query1(): def test_parse_m_query2(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"" - m_parser.parse_expression(expression) - - -# def test_parse_m_query3(): -# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Has PS Software Quota?\", each if [TIER] = \"Expansion (Medium)\" then 
\"Yes\" else if [TIER] = \"Acquisition\" then \"Yes\" else \"No\")\nin\n #\"Added Conditional Column\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query4(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022\"]),\n #\"Added Custom\" = Table.AddColumn(Source, \"OIP in $(*$350)\", each [SALES_INVOICE_AMOUNT] * 350),\n #\"Changed Type\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"OIP in $(*$350)\", type number}})\nin\n #\"Changed Type\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query5(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"CLIENT_ID\", Int64.Type}}),\n #\"Added Conditional Column\" = Table.AddColumn(#\"Changed Type\", \"PS Software (One-Off)\", each if Text.Contains([REVENUE_TYPE], \"Software\") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], \"Tax Seminar\") then [Inv_Amt] else 0),\n #\"Filtered Rows\" = Table.SelectRows(#\"Added Conditional Column\", each true),\n #\"Duplicated Column\" = Table.DuplicateColumn(#\"Filtered Rows\", \"CLIENT_ID\", \"CLIENT_ID - Copy\"),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Duplicated Column\",{{\"CLIENT_ID - Copy\", type text}}),\n #\"Renamed Columns\" = Table.RenameColumns(#\"Changed Type1\",{{\"CLIENT_ID - Copy\", \"CLIENT_ID for Filter\"}})\nin\n #\"Renamed Columns\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query6(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_DATE\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([MONTH_DATE]))\nin\n #\"Added Custom\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query7(): -# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" -# 
m_parser.parse_expression(expression) -# -# -# def test_parse_m_query8(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query9(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_WID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"MONTH_DATE\", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & \"-\" &\nText.Range([MONTH_WID], 4,2)\n)),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom\", \"Month\", each Date.Month([MONTH_DATE])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom2\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query10(): -# expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"OPERATIONS_ANALYTICS_WAREHOUSE_PROD\",[Role=\"OPERATIONS_ANALYTICS_MEMBER_AD\"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name=\"OPERATIONS_ANALYTICS\",Kind=\"Database\"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name=\"LZ_MIGRATION_DOWNLOAD\",Kind=\"View\"]}[Data],\n #\"Changed Type\" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{\"MIGRATION_MONTH_ID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Migration Month\", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & \"-\" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"Migration Month\", type date}})\nin\n #\"Changed Type1\"" -# m_parser.parse_expression(expression) -# -# -# def 
test_parse_m_query11(): -# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" -# m_parser.parse_expression(expression) -# -# -# def test_parse_m_query12(): -# expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,'-',''))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = 'Software' and (NOT(PRODUCT in ('ADV', 'Adv') and left(ACCOUNT_ID,2)='10') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 'Manual Adjustment') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN ('Recurring','0') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Services\", each if [CLASS] = \"Services\" then [INVOICE_AMOUNT] else 0),\n #\"Added Custom\" = Table.AddColumn(#\"Added Conditional Column\", \"Advanced New Sites\", each if [PRODUCT] = \"ADV\"\nor [PRODUCT] = \"Adv\"\nthen [NEW_SITE]\nelse 0)\nin\n #\"Added Custom\"" -# m_parser.parse_expression(expression) + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"ADDed Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query3(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS 
Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Has PS Software Quota?\", each if [TIER] = \"Expansion (Medium)\" then \"Yes\" else if [TIER] = \"Acquisition\" then \"Yes\" else \"No\")\nin\n #\"Added Conditional Column\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query4(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022\"]),\n #\"Added Custom\" = Table.AddColumn(Source, \"OIP in $(*$350)\", each [SALES_INVOICE_AMOUNT] * 350),\n #\"Changed Type\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"OIP in $(*$350)\", type number}})\nin\n #\"Changed Type\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query5(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"CLIENT_ID\", Int64.Type}}),\n #\"Added Conditional Column\" = Table.AddColumn(#\"Changed Type\", \"PS Software (One-Off)\", each if Text.Contains([REVENUE_TYPE], \"Software\") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], \"Tax Seminar\") then [Inv_Amt] else 0),\n #\"Filtered Rows\" = Table.SelectRows(#\"Added Conditional Column\", each true),\n #\"Duplicated Column\" = Table.DuplicateColumn(#\"Filtered Rows\", \"CLIENT_ID\", \"CLIENT_ID - Copy\"),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Duplicated Column\",{{\"CLIENT_ID - Copy\", type text}}),\n #\"Renamed Columns\" = Table.RenameColumns(#\"Changed Type1\",{{\"CLIENT_ID - Copy\", \"CLIENT_ID for Filter\"}})\nin\n #\"Renamed Columns\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query6(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_DATE\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([MONTH_DATE]))\nin\n #\"Added Custom\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query7(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select 
#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" + m_parser.parse_expression(expression) + + +def test_parse_m_query8(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query9(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_WID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"MONTH_DATE\", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & \"-\" &\nText.Range([MONTH_WID], 4,2)\n)),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom\", \"Month\", each Date.Month([MONTH_DATE])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom2\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query10(): + expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"OPERATIONS_ANALYTICS_WAREHOUSE_PROD\",[Role=\"OPERATIONS_ANALYTICS_MEMBER_AD\"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name=\"OPERATIONS_ANALYTICS\",Kind=\"Database\"]}[Data],\n TEST_Schema = 
OPERATIONS_ANALYTICS_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name=\"LZ_MIGRATION_DOWNLOAD\",Kind=\"View\"]}[Data],\n #\"Changed Type\" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{\"MIGRATION_MONTH_ID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Migration Month\", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & \"-\" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"Migration Month\", type date}})\nin\n #\"Changed Type1\"" + m_parser.parse_expression(expression) + + +def test_parse_m_query11(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" + m_parser.parse_expression(expression) + + +def test_parse_m_query12(): + expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,'-',''))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = 'Software' and (NOT(PRODUCT in ('ADV', 'Adv') and left(ACCOUNT_ID,2)='10') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 'Manual Adjustment') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN ('Recurring','0') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Services\", each if [CLASS] = \"Services\" then [INVOICE_AMOUNT] else 0),\n #\"Added Custom\" = Table.AddColumn(#\"Added Conditional Column\", \"Advanced New Sites\", each if [PRODUCT] = \"ADV\"\nor [PRODUCT] = \"Adv\"\nthen [NEW_SITE]\nelse 0)\nin\n #\"Added Custom\"" + m_parser.parse_expression(expression) From 9651e5413af15f0e69a64eee40ea624287b3080d Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 17 Nov 2022 20:03:30 +0530 Subject: [PATCH 06/53] test cases --- .../ingestion/source/powerbi/m_parser.py | 17 ++++-- .../integration/powerbi/test_m_parser.py | 54 ++++++++++++++----- 2 files changed, 55 insertions(+), 16 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 73b24b176c009..e29ffc40167ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,7 +1,10 @@ +import logging from abc import ABC, abstractmethod from typing import Optional, List, Dict import importlib.resources as pkg_resource -from lark import Lark +from lark import Lark, Tree + +logger = logging.getLogger(__name__) class Token(ABC): @@ -68,7 +71,13 @@ def __init__(self, tokens: List[BaseToken]): } -def parse_expression(expression: str) -> List[Step]: +def parse_expression(expression: str) -> Tree: grammar: str = 
pkg_resource.read_text("datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule") - lark_parser = Lark(grammar, start="let_expression", regex=True, debug=True) - print(lark_parser.parse(expression).pretty()) + lark_parser = Lark(grammar, start="let_expression", regex=True) + + parse_tree: Tree = lark_parser.parse(expression) + + logger.debug("Parse Tree") + logger.debug(parse_tree.pretty()) + + return parse_tree diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 62d6d6d02043f..d3ed155ffc816 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,61 +1,91 @@ +from typing import List + from datahub.ingestion.source.powerbi import m_parser +from lark import Visitor, Tree + + +def get_output_dataset(root: Tree): + def get_token_list_for_any(tree: Tree, rules: List[str]): + for rule in rules: + token_list = [x for x in tree.find_data(rule)] + if len(token_list) > 0: + return token_list + + return [] + + for tree in root.find_data("in_expression"): + for child1 in get_token_list_for_any(tree, ["letter_character", "quoted_identifier"]): + return child1.children[0].value + def test_parse_m_query1(): expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "TESTTABLE_Table" def test_parse_m_query2(): expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"ADDed Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Custom2\"" def test_parse_m_query3(): 
expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Has PS Software Quota?\", each if [TIER] = \"Expansion (Medium)\" then \"Yes\" else if [TIER] = \"Acquisition\" then \"Yes\" else \"No\")\nin\n #\"Added Conditional Column\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Conditional Column\"" def test_parse_m_query4(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022\"]),\n #\"Added Custom\" = Table.AddColumn(Source, \"OIP in $(*$350)\", each [SALES_INVOICE_AMOUNT] * 350),\n #\"Changed Type\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"OIP in $(*$350)\", type number}})\nin\n #\"Changed Type\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Changed Type\"" def test_parse_m_query5(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"CLIENT_ID\", Int64.Type}}),\n #\"Added Conditional Column\" = Table.AddColumn(#\"Changed Type\", \"PS Software (One-Off)\", each if Text.Contains([REVENUE_TYPE], \"Software\") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], \"Tax Seminar\") then [Inv_Amt] else 0),\n #\"Filtered Rows\" = Table.SelectRows(#\"Added Conditional Column\", each true),\n #\"Duplicated Column\" = Table.DuplicateColumn(#\"Filtered Rows\", \"CLIENT_ID\", \"CLIENT_ID - Copy\"),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Duplicated Column\",{{\"CLIENT_ID - Copy\", type text}}),\n #\"Renamed Columns\" = Table.RenameColumns(#\"Changed Type1\",{{\"CLIENT_ID - Copy\", \"CLIENT_ID for Filter\"}})\nin\n #\"Renamed Columns\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Renamed Columns\"" def test_parse_m_query6(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), #(lf)LEFT(CAST(MONTH_DATE AS 
DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_DATE\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([MONTH_DATE]))\nin\n #\"Added Custom\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Custom\"" def test_parse_m_query7(): expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "Source" def test_parse_m_query8(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Custom1\"" def test_parse_m_query9(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_WID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"MONTH_DATE\", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & \"-\" &\nText.Range([MONTH_WID], 4,2)\n)),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom\", \"Month\", each 
Date.Month([MONTH_DATE])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom2\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Custom1\"" def test_parse_m_query10(): expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"OPERATIONS_ANALYTICS_WAREHOUSE_PROD\",[Role=\"OPERATIONS_ANALYTICS_MEMBER_AD\"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name=\"OPERATIONS_ANALYTICS\",Kind=\"Database\"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name=\"LZ_MIGRATION_DOWNLOAD\",Kind=\"View\"]}[Data],\n #\"Changed Type\" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{\"MIGRATION_MONTH_ID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Migration Month\", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & \"-\" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"Migration Month\", type date}})\nin\n #\"Changed Type1\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Changed Type1\"" def test_parse_m_query11(): expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "Source" def test_parse_m_query12(): expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,'-',''))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = 'Software' and (NOT(PRODUCT in ('ADV', 'Adv') and left(ACCOUNT_ID,2)='10') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 'Manual Adjustment') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN ('Recurring','0') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Services\", each if [CLASS] = \"Services\" then [INVOICE_AMOUNT] else 0),\n #\"Added Custom\" = Table.AddColumn(#\"Added Conditional Column\", \"Advanced New Sites\", each if [PRODUCT] = \"ADV\"\nor [PRODUCT] = \"Adv\"\nthen [NEW_SITE]\nelse 
0)\nin\n #\"Added Custom\"" - m_parser.parse_expression(expression) + parse_tree: Tree = m_parser.parse_expression(expression) + assert get_output_dataset(parse_tree) == "\"Added Custom\"" From 6f4d0cc3365bc94da2555d2018f5099a6f8ef12b Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 23 Nov 2022 21:11:08 +0530 Subject: [PATCH 07/53] WIP --- .../ingestion/source/powerbi/m_parser.py | 75 +---- .../ingestion/source/powerbi/powerbi.py | 276 ++++++++++-------- .../integration/powerbi/test_m_parser.py | 46 +-- .../tests/integration/powerbi/test_powerbi.py | 5 + 4 files changed, 192 insertions(+), 210 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index e29ffc40167ef..2a9f96742f59b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,79 +1,16 @@ -import logging -from abc import ABC, abstractmethod -from typing import Optional, List, Dict import importlib.resources as pkg_resource +import logging + from lark import Lark, Tree logger = logging.getLogger(__name__) -class Token(ABC): - @abstractmethod - def parse_raw_token(self) -> str: - pass - - -class BaseToken(Token, ABC): - _raw_token: str - _nested_tokens: Optional[List["BaseToken"]] - - def __init__(self, raw_token: str, nested_tokens: Optional[List["BaseToken"]]): - self._raw_token = raw_token - self._nested_tokens = nested_tokens - self.parse_raw_token(self._raw_token) - - -class LetToken(BaseToken): - def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["Token"]]): - super().__init__(raw_token, nested_raw_tokens) - - def parse_raw_token(self) -> str: - pass - - -class TableFuncToken(BaseToken): - def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): - super().__init__(raw_token, nested_raw_tokens) - - def parse_raw_token(self) -> str: - pass - - -class DataAccessToken(BaseToken): - def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): - super().__init__(raw_token, nested_raw_tokens) - - def parse_raw_token(self) -> str: - pass - - -class OracleDataAccessToken(BaseToken): - def __init__(self, raw_token: str, nested_raw_tokens: Optional[List["BaseToken"]]): - super().__init__(raw_token, nested_raw_tokens) - - def parse_raw_token(self) -> str: - pass - - -class Step: - tokens: List[BaseToken] - def __init__(self, tokens: List[BaseToken]): - self.tokens = tokens - - -token_registry: Dict[str, BaseToken] = { - "let": LetToken, - "Table": TableFuncToken, - "PostgreSQL.Database": DataAccessToken, - "DB2.Database": DataAccessToken, - "Sql.Database": DataAccessToken, - "Oracle.Database": OracleDataAccessToken, -} - - def parse_expression(expression: str) -> Tree: - grammar: str = pkg_resource.read_text("datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule") - lark_parser = Lark(grammar, start="let_expression", regex=True) + grammar: str = pkg_resource.read_text( + "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" + ) + lark_parser = Lark(grammar, start="let_expression", regex=True) parse_tree: Tree = lark_parser.parse(expression) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index f709d20637e9e..cfe65a5a99726 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -40,8 +40,6 @@ CorpUserKeyClass, DashboardInfoClass, DashboardKeyClass, - DataPlatformInfoClass, - DatasetKeyClass, DatasetPropertiesClass, OwnerClass, OwnershipClass, @@ -104,6 +102,26 @@ class Constant: HTTP_RESPONSE_STATUS_CODE = "HttpResponseStatusCode" +@dataclass +class PowerBiDashboardSourceReport(SourceReport): + dashboards_scanned: int = 0 + charts_scanned: int = 0 + filtered_dashboards: List[str] = dataclass_field(default_factory=list) + filtered_charts: List[str] = dataclass_field(default_factory=list) + + def report_dashboards_scanned(self, count: int = 1) -> None: + self.dashboards_scanned += count + + def report_charts_scanned(self, count: int = 1) -> None: + self.charts_scanned += count + + def report_dashboards_dropped(self, model: str) -> None: + self.filtered_dashboards.append(model) + + def report_charts_dropped(self, view: str) -> None: + self.filtered_charts.append(view) + + class PowerBiAPIConfig(EnvBasedSourceConfigBase): # Organsation Identifier tenant_id: str = pydantic.Field(description="PowerBI tenant identifier") @@ -172,19 +190,9 @@ class DataSource: PowerBi """ - @dataclass - class MetaData: - """ - MetaData about DataSource - """ - - is_relational: Boolean - id: str type: str - database: Optional[str] - server: Optional[str] - metadata: Any + raw_connection_detail: Dict def __members(self): return (self.id,) @@ -200,19 +208,19 @@ def __hash__(self): # dataclasses for PowerBi Dashboard @dataclass - class Dataset: + class PowerBIDataset: @dataclass class Table: name: str - schema_name: str + full_name: str + data_source: "PowerBiAPI.DataSource" # We are supporting single data_source for the table id: str name: str webUrl: Optional[str] workspace_id: str - datasource: Any # Table in datasets - tables: List[Any] + tables: List["Table"] def get_urn_part(self): return f"datasets.{self.id}" @@ -222,7 +230,7 @@ def __members(self): def __eq__(self, instance): return ( - isinstance(instance, PowerBiAPI.Dataset) + isinstance(instance, PowerBiAPI.PowerBIDataset) and self.__members() == instance.__members() ) @@ -312,7 +320,6 @@ def __hash__(self): def __init__(self, config: PowerBiAPIConfig) -> None: self.__config: PowerBiAPIConfig = config self.__access_token: str = "" - # Power-Bi Auth (Service Principal Auth) self.__msal_client = msal.ConfidentialClientApplication( self.__config.client_id, @@ -542,7 +549,7 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: response_dict = response.json() LOGGER.debug("datasets = {}".format(response_dict)) # PowerBi Always return the webURL, in-case if it is None then setting complete webURL to None instead of None/details - return PowerBiAPI.Dataset( + return PowerBiAPI.PowerBIDataset( id=response_dict.get("id"), name=response_dict.get("name"), webUrl="{}/details".format(response_dict.get("webUrl")) @@ -550,10 +557,11 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: else None, workspace_id=workspace_id, tables=[], - datasource=None, ) - def get_data_source(self, dataset: Dataset) -> Any: + def get_data_sources( + self, dataset: PowerBIDataset + ) -> Dict[str, "PowerBiAPI.DataSource"]: """ Fetch the data source from PowerBi for the given dataset """ @@ -594,43 +602,21 @@ def get_data_source(self, dataset: Dataset) -> Any: return None - if len(value) > 1: - # We are currently supporting data-set having single relational database - LOGGER.warning( - "More than one data-source found for {}({})".format( - dataset.name, dataset.id 
-                )
-            )
-            LOGGER.debug(value)
-            return None
-
-        # Consider only zero index datasource
-        datasource_dict = value[0]
+        data_sources: Dict[str, "PowerBiAPI.DataSource"] = {}
         LOGGER.debug("data-sources = {}".format(value))
-        # Create datasource instance with basic detail available
-        datasource = PowerBiAPI.DataSource(
-            id=datasource_dict.get(
-                "datasourceId"
-            ),  # datasourceId is not available in all cases
-            type=datasource_dict["datasourceType"],
-            server=None,
-            database=None,
-            metadata=None,
-        )
-
-        # Check if datasource is relational as per our relation mapping
-        if self.__config.dataset_type_mapping.get(datasource.type) is not None:
-            # Now set the database detail as it is relational data source
-            datasource.metadata = PowerBiAPI.DataSource.MetaData(is_relational=True)
-            datasource.database = datasource_dict["connectionDetails"]["database"]
-            datasource.server = datasource_dict["connectionDetails"]["server"]
-        else:
-            datasource.metadata = PowerBiAPI.DataSource.MetaData(is_relational=False)
-            LOGGER.warning(
-                "Non relational data-source found = {}".format(datasource_dict)
+        for datasource_dict in value:
+            # Create datasource instance with basic detail available
+            datasource = PowerBiAPI.DataSource(
+                id=datasource_dict.get(
+                    "datasourceId"
+                ),  # datasourceId is not available in all cases
+                type=datasource_dict["datasourceType"],
+                raw_connection_detail=datasource_dict["connectionDetails"],
             )
-        return datasource
+            data_sources[datasource.id] = datasource
+
+        return data_sources
 
     def get_tiles(self, workspace: Workspace, dashboard: Dashboard) -> List[Tile]:
 
@@ -712,10 +698,46 @@ def new_dataset_or_report(tile_instance: Any) -> dict:
 
         return tiles
 
+    def process_extension_table(
+        self, data_source: "PowerBiAPI.DataSource", raw_table: dict
+    ) -> (str, str, str):
+        # All four conditions below must be met to process the Extension data-source type
+        if data_source.type != "Extension":
+            LOGGER.debug(f"data_source ({data_source.id}) type is not Extension")
+            return None, None
+        if data_source.raw_connection_detail.get("connectionDetails") is None:
+            LOGGER.debug(
+                f"data_source ({data_source.id}) type is missing connectionDetails"
+            )
+            return None, None
+        if (
+            data_source.raw_connection_detail["connectionDetails"].get(
+                "extensionDataSourceKind"
+            )
+            is None
+        ):
+            LOGGER.debug(
+                f"data_source ({data_source.id}) type is missing extensionDataSourceKind"
+            )
+            return None, None
+
+        if (
+            data_source.raw_connection_detail["connectionDetails"][
+                "extensionDataSourceKind"
+            ]
+            not in self.__config.dataset_type_mapping
+        ):
+            LOGGER.debug(f"expected platforms are {self.__config.dataset_type_mapping}")
+            return None, None
+        # The placeholder values "foo_db" and "fake_schema" below still need to be derived from the M-Query expression
+        return raw_table["name"], "foo_db.fake_schema.{}".format(raw_table["name"])
+
     # flake8: noqa: C901
-    def get_workspace(self, workspace_id: str) -> Workspace:
+    def get_workspace(
+        self, workspace_id: str, reporter: PowerBiDashboardSourceReport
+    ) -> Workspace:
         """
-        Return Workspace for the given workspace identifier i.e workspace_id
+        Return Workspace for the given workspace identifier i.e. 
workspace_id """ scan_create_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_CREATE] scan_create_endpoint = scan_create_endpoint.format( @@ -839,40 +861,85 @@ def json_to_dataset_map(scan_result: dict) -> dict: return dataset_map for dataset_dict in datasets: - dataset_instance: PowerBiAPI.Dataset = self.get_dataset( + dataset_instance: PowerBiAPI.PowerBIDataset = self.get_dataset( workspace_id=scan_result["id"], dataset_id=dataset_dict["id"], ) - dataset_map[dataset_instance.id] = dataset_instance - # set dataset's DataSource - dataset_instance.datasource = self.get_data_source(dataset_instance) - # Set table only if the datasource is relational and dataset is not created from custom SQL i.e Value.NativeQuery( - # There are dataset which doesn't have DataSource - if ( - dataset_instance.datasource - and dataset_instance.datasource.metadata.is_relational is True - ): - LOGGER.info( - f"Processing tables attribute for dataset {dataset_instance.name}({dataset_instance.id})" + # Map of data-source attached to this dataset + data_source_map: Dict[ + str, PowerBiAPI.DataSource + ] = self.get_data_sources(dataset_instance) + for table in dataset_dict["tables"]: + warning_key_prefix: str = "{}_{}".format( + dataset_dict.get("id") if dataset_dict.get("name") is None else dataset_dict.get("name"), table["name"] ) - for table in dataset_dict["tables"]: - if "Value.NativeQuery(" in table["source"][0]["expression"]: - LOGGER.warning( - f'Table {table["name"]} is created from Custom SQL. Ignoring in processing' - ) + if table.get("source") is None: + reporter.report_warning( + f"{warning_key_prefix}-source", + "table without source is not supported", + ) + continue - continue + if "Value.NativeQuery(" in table["source"][0]["expression"]: + reporter.report_warning( + f"{warning_key_prefix}-native-query", + "NativeQuery is not supported", + ) + continue + if table.get("datasourceUsages") is None: + reporter.report_warning( + f"{warning_key_prefix}-no-source", + "table does not have any source", + ) + continue + + if len(table["datasourceUsages"]) > 1: + reporter.report_warning( + f"{warning_key_prefix}-many-source", + "Multiple data-sources for single table is not supported", + ) + continue + + data_source: PowerBiAPI.DataSource = data_source_map[ + table["datasourceUsages"][0]["datasourceInstanceId"] + ] + table_name: str = None + table_full_name: str = None + if data_source.type == "Extension": + table_name, table_full_name = self.process_extension_table( + data_source, table + ) + elif ( + self.__config.dataset_type_mapping.get(data_source.type) + is not None + ): # PowerBi table name contains schema name and table name. 
Format is <schema_name> <table_name>
-                    schema_and_name = table["name"].split(" ")
-                    dataset_instance.tables.append(
-                        PowerBiAPI.Dataset.Table(
-                            schema_name=schema_and_name[0],
-                            name=schema_and_name[1],
-                        )
+                    table_name = table["name"].split(" ")[1]
+                    table_schema_name: str = table["name"].split(" ")[0]
+                    database_name: str = data_source.raw_connection_detail[
+                        "database"
+                    ]
+                    table_full_name = (
+                        f"{database_name}.{table_schema_name}.{table_name}"
+                    )
+
+                if None in (table_name, table_full_name):
+                    reporter.report_warning(
+                        f"{warning_key_prefix}-extension",
+                        f"The table source ({data_source.id}) does not belong to the supported platforms: {self.__config.dataset_type_mapping}",
                     )
+                    continue
+
+                dataset_instance.tables.append(
+                    PowerBiAPI.PowerBIDataset.Table(
+                        full_name=table_full_name,
+                        name=table_name,
+                        data_source=data_source,
+                    )
+                )
 
         return dataset_map
 
@@ -899,8 +966,8 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None:
 
         # Scan is complete lets take the result
         scan_result = get_scan_result(scan_id=scan_id)
-        LOGGER.debug("scan result = {}".format(scan_result))
         import json
+
         print(json.dumps(scan_result, indent=1))
         workspace = PowerBiAPI.Workspace(
             id=scan_result["id"],
@@ -969,7 +1036,7 @@ def __to_work_unit(
         )
 
     def __to_datahub_dataset(
-        self, dataset: Optional[PowerBiAPI.Dataset]
+        self, dataset: Optional[PowerBiAPI.PowerBIDataset]
     ) -> List[MetadataChangeProposalWrapper]:
         """
         Map PowerBi dataset to datahub dataset. Here we are mapping each table of PowerBi Dataset to Datahub dataset.
@@ -980,26 +1047,15 @@ def __to_datahub_dataset(
         if dataset is None:
             return dataset_mcps
 
-        # We are only supporting relation PowerBi DataSources
-        if (
-            dataset.datasource is None
-            or dataset.datasource.metadata.is_relational is False
-        ):
-            LOGGER.warning(
-                f"Dataset {dataset.name}({dataset.id}) is not created from relational datasource"
-            )
-
-            return dataset_mcps
-
         LOGGER.info(
            f"Converting dataset={dataset.name}(id={dataset.id}) to datahub dataset"
        )

         for table in dataset.tables:
-            # Create an URN for dataset
+            # Create a URN for dataset
             ds_urn = builder.make_dataset_urn(
-                platform=self.__config.dataset_type_mapping[dataset.datasource.type],
-                name=f"{dataset.datasource.database}.{table.schema_name}.{table.name}",
+                platform=self.__config.dataset_type_mapping[table.data_source.type],
+                name=f"{table.full_name}",
                 env=self.__config.env,
             )
 
@@ -1323,26 +1379,6 @@ def to_datahub_work_units(
 
         return deduplicate_list([wu for wu in work_units if wu is not None])
 
-@dataclass
-class PowerBiDashboardSourceReport(SourceReport):
-    dashboards_scanned: int = 0
-    charts_scanned: int = 0
-    filtered_dashboards: List[str] = dataclass_field(default_factory=list)
-    filtered_charts: List[str] = dataclass_field(default_factory=list)
-
-    def report_dashboards_scanned(self, count: int = 1) -> None:
-        self.dashboards_scanned += count
-
-    def report_charts_scanned(self, count: int = 1) -> None:
-        self.charts_scanned += count
-
-    def report_dashboards_dropped(self, model: str) -> None:
-        self.filtered_dashboards.append(model)
-
-    def report_charts_dropped(self, view: str) -> None:
-        self.filtered_charts.append(view)
-
-
 @platform_name("PowerBI")
 @config_class(PowerBiDashboardSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
@@ -1381,7 +1417,9 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         LOGGER.info("PowerBi plugin execution is started")
 
         # Fetch PowerBi workspace for given workspace identifier
-        workspace = self.powerbi_client.get_workspace(self.source_config.workspace_id)
+        workspace = 
self.powerbi_client.get_workspace( + self.source_config.workspace_id, self.reporter + ) for dashboard in workspace.dashboards: diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index d3ed155ffc816..7703b93240fc9 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,8 +1,8 @@ from typing import List -from datahub.ingestion.source.powerbi import m_parser +from lark import Tree -from lark import Visitor, Tree +from datahub.ingestion.source.powerbi import m_parser def get_output_dataset(root: Tree): @@ -15,44 +15,46 @@ def get_token_list_for_any(tree: Tree, rules: List[str]): return [] for tree in root.find_data("in_expression"): - for child1 in get_token_list_for_any(tree, ["letter_character", "quoted_identifier"]): + for child1 in get_token_list_for_any( + tree, ["letter_character", "quoted_identifier"] + ): return child1.children[0].value def test_parse_m_query1(): - expression: str = "let\n Source = Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table" + expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' parse_tree: Tree = m_parser.parse_expression(expression) assert get_output_dataset(parse_tree) == "TESTTABLE_Table" def test_parse_m_query2(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"ADDed Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"" + expression: str = 'let\n Source = 
Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Added Custom2\"" + assert get_output_dataset(parse_tree) == '"Added Custom2"' def test_parse_m_query3(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Has PS Software Quota?\", each if [TIER] = \"Expansion (Medium)\" then \"Yes\" else if [TIER] = \"Acquisition\" then \"Yes\" else \"No\")\nin\n #\"Added Conditional Column\"" + expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", 
each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Added Conditional Column\"" + assert get_output_dataset(parse_tree) == '"Added Conditional Column"' def test_parse_m_query4(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022\"]),\n #\"Added Custom\" = Table.AddColumn(Source, \"OIP in $(*$350)\", each [SALES_INVOICE_AMOUNT] * 350),\n #\"Changed Type\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"OIP in $(*$350)\", type number}})\nin\n #\"Changed Type\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Changed Type\"" + assert get_output_dataset(parse_tree) == '"Changed Type"' def test_parse_m_query5(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"CLIENT_ID\", Int64.Type}}),\n #\"Added Conditional Column\" = Table.AddColumn(#\"Changed Type\", \"PS Software (One-Off)\", each if Text.Contains([REVENUE_TYPE], \"Software\") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], \"Tax Seminar\") then [Inv_Amt] else 0),\n #\"Filtered Rows\" = Table.SelectRows(#\"Added Conditional Column\", each true),\n #\"Duplicated Column\" = Table.DuplicateColumn(#\"Filtered Rows\", \"CLIENT_ID\", \"CLIENT_ID - Copy\"),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Duplicated Column\",{{\"CLIENT_ID - Copy\", type text}}),\n #\"Renamed Columns\" = Table.RenameColumns(#\"Changed Type1\",{{\"CLIENT_ID - Copy\", \"CLIENT_ID for Filter\"}})\nin\n #\"Renamed Columns\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n 
#"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Renamed Columns\"" + assert get_output_dataset(parse_tree) == '"Renamed Columns"' def test_parse_m_query6(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS\"]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_DATE\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([MONTH_DATE]))\nin\n #\"Added Custom\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Added Custom\"" + assert get_output_dataset(parse_tree) == '"Added Custom"' def test_parse_m_query7(): @@ -62,21 +64,21 @@ def test_parse_m_query7(): def test_parse_m_query8(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 
then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Added Custom1\"" + assert get_output_dataset(parse_tree) == '"Added Custom1"' def test_parse_m_query9(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"MONTH_WID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"MONTH_DATE\", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & \"-\" &\nText.Range([MONTH_WID], 4,2)\n)),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom\", \"Month\", each Date.Month([MONTH_DATE])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom2\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Added Custom1\"" + assert get_output_dataset(parse_tree) == '"Added Custom1"' def test_parse_m_query10(): - expression: str = "let\n Source = 
Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"OPERATIONS_ANALYTICS_WAREHOUSE_PROD\",[Role=\"OPERATIONS_ANALYTICS_MEMBER_AD\"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name=\"OPERATIONS_ANALYTICS\",Kind=\"Database\"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name=\"LZ_MIGRATION_DOWNLOAD\",Kind=\"View\"]}[Data],\n #\"Changed Type\" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{\"MIGRATION_MONTH_ID\", type text}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Migration Month\", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & \"-\" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #\"Changed Type1\" = Table.TransformColumnTypes(#\"Added Custom\",{{\"Migration Month\", type date}})\nin\n #\"Changed Type1\"" + expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "\"Changed Type1\"" + assert get_output_dataset(parse_tree) == '"Changed Type1"' def test_parse_m_query11(): @@ -86,6 +88,6 @@ def test_parse_m_query11(): def test_parse_m_query12(): - expression: str = "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,'-',''))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = 'Software' and (NOT(PRODUCT in ('ADV', 'Adv') and left(ACCOUNT_ID,2)='10') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 'Manual Adjustment') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN ('Recurring','0') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"Services\", each if [CLASS] = \"Services\" then [INVOICE_AMOUNT] else 0),\n #\"Added Custom\" = Table.AddColumn(#\"Added Conditional Column\", \"Advanced New Sites\", each if [PRODUCT] = \"ADV\"\nor [PRODUCT] = \"Adv\"\nthen [NEW_SITE]\nelse 0)\nin\n #\"Added Custom\"" + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN 
INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"'
     parse_tree: Tree = m_parser.parse_expression(expression)
     assert get_output_dataset(parse_tree) == '"Added Custom"'
diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
index bbd60f856bd96..46ec1b00081c3 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
@@ -125,6 +125,11 @@ def register_mock_api(request_mock):
                             "expression": "dummy",
                         }
                     ],
+                    "datasourceUsages": [
+                        {
+                            "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3",
+                        }
+                    ]
                 }
             ],
         }

From b4dd785bdc96c18346458f7a603f8049e6ba8e91 Mon Sep 17 00:00:00 2001
From: MohdSiddique Bagwan
Date: Thu, 24 Nov 2022 21:01:15 +0530
Subject: [PATCH 08/53] Current behaviour

---
 .../ingestion/source/powerbi/m_parser.py   |  17 ++
 .../ingestion/source/powerbi/powerbi.py    |   1 +
 .../integration/powerbi/test_m_parser.py   | 171 +++++++++---------
 3 files changed, 101 insertions(+), 88 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py
index 2a9f96742f59b..3d9a5723ad017 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py
@@ -1,11 +1,28 @@
 import importlib.resources as pkg_resource
 import logging
+from typing import List
 
 from lark import Lark, Tree
 
 logger = logging.getLogger(__name__)
 
 
+def get_output_dataset(root: Tree):
+    def get_token_list_for_any(tree: Tree, rules: List[str]):
+        for rule in rules:
+            token_list = [x for x in tree.find_data(rule)]
+            if len(token_list) > 0:
+                return token_list
+
+        return []
+
+    for tree in root.find_data("in_expression"):
+        for child1 in get_token_list_for_any(
+            tree, ["letter_character", "quoted_identifier"]
+        ):
+            return child1.children[0].value
+
+
 def parse_expression(expression: str) -> Tree:
     grammar: str = pkg_resource.read_text(
         "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule"
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
index b2a2a7fe62ad4..ac9782f1dcf7a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
@@ -1095,6 +1095,7 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None:
             )
         # Get workspace dashboards
         workspace.dashboards = self.get_dashboards(workspace)
+        workspace.datasets = json_to_dataset_map(scan_result)
 
         init_dashboard_tiles(workspace)
 
diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
index 7703b93240fc9..15d9261898212 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py
@@ -1,93 +1,88 @@
 from typing import List
- +import lark from lark import Tree from datahub.ingestion.source.powerbi import m_parser - -def get_output_dataset(root: Tree): - def get_token_list_for_any(tree: Tree, rules: List[str]): - for rule in rules: - token_list = [x for x in tree.find_data(rule)] - if len(token_list) > 0: - return token_list - - return [] - - for tree in root.find_data("in_expression"): - for child1 in get_token_list_for_any( - tree, ["letter_character", "quoted_identifier"] - ): - return child1.children[0].value - - -def test_parse_m_query1(): - expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "TESTTABLE_Table" - - -def test_parse_m_query2(): - expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Custom2"' - - -def test_parse_m_query3(): - expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each 
if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Conditional Column"' - - -def test_parse_m_query4(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Changed Type"' - - -def test_parse_m_query5(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Renamed Columns"' - - -def test_parse_m_query6(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Custom"' - - -def test_parse_m_query7(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from 
OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "Source" - - -def test_parse_m_query8(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query9(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query10(): - expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type 
text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Changed Type1"' - - -def test_parse_m_query11(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == "Source" - - -def test_parse_m_query12(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert get_output_dataset(parse_tree) == '"Added Custom"' +# def test_parse_m_query1(): +# expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == "TESTTABLE_Table" +# +# +# def test_parse_m_query2(): +# expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = 
Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Custom2"' +# +# +# def test_parse_m_query3(): +# expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Conditional Column"' +# +# +# def test_parse_m_query4(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Changed Type"' +# +# +# def test_parse_m_query5(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = 
Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Renamed Columns"' +# +# +# def test_parse_m_query6(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query7(): +# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == "Source" +# +# +# def test_parse_m_query8(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query9(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", 
[Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query10(): +# expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Changed Type1"' +# +# +# def test_parse_m_query11(): +# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == "Source" +# +# +# def test_parse_m_query12(): +# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 
\'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert get_output_dataset(parse_tree) == '"Added Custom"' + +def test_find_schema_detail(): + expression: str = "let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n DG_RBAC_ACL_Schema = GSL_TEST_DB_Database{[Name=\"DG_RBAC_ACL\",Kind=\"Schema\"]}[Data],\n SALES_Table = DG_RBAC_ACL_Schema{[Name=\"SALES\",Kind=\"Table\"]}[Data]\nin\n SALES_Table" + full_name: str = m_parser.find_full_name_and_lineage() + +def test_x(): + expression: str = "CALCULATE(SUM('Invoiced Revenue'[Services]),Targets[SERVICE_QUOTA] > 0)" + try: + m_parser.parse_expression(expression) + assert 1 != 1 + except lark.exceptions.UnexpectedCharacters: + pass From 281bc568571038c9d352326c252812e9fde5ba1b Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 24 Nov 2022 22:39:11 +0530 Subject: [PATCH 09/53] new behaviour where data-platform is powerbi --- .../ingestion/source/powerbi/m_parser.py | 10 +- .../ingestion/source/powerbi/powerbi.py | 135 +++------------ .../golden_test_disabled_ownership.json | 10 +- .../powerbi/golden_test_ingest.json | 10 +- .../powerbi/golden_test_report.json | 20 +-- .../integration/powerbi/test_m_parser.py | 155 ++++++++---------- .../tests/integration/powerbi/test_powerbi.py | 2 +- 7 files changed, 124 insertions(+), 218 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 3d9a5723ad017..28d940ed12e37 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,14 +1,14 @@ import importlib.resources as pkg_resource import logging -from typing import List +from typing import List, Optional from lark import Lark, Tree logger = logging.getLogger(__name__) -def get_output_dataset(root: Tree): - def get_token_list_for_any(tree: Tree, rules: List[str]): +def get_output_dataset(root: Tree) -> Optional[str]: + def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: for rule in rules: token_list = [x for x in tree.find_data(rule)] if len(token_list) > 0: @@ -20,7 +20,9 @@ def get_token_list_for_any(tree: Tree, rules: List[str]): for child1 in get_token_list_for_any( tree, ["letter_character", "quoted_identifier"] ): - return child1.children[0].value + return child1.children[0].value # type: ignore + + return None def parse_expression(expression: str) -> Tree: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index ac9782f1dcf7a..ec4f25fa784ee 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -191,7 
+191,7 @@ class Workspace: name: str state: str dashboards: List[Any] - datasets: Dict[str, "PowerBiAPI.Dataset"] + datasets: Dict[str, "PowerBiAPI.PowerBIDataset"] @dataclass class DataSource: @@ -222,7 +222,7 @@ class PowerBIDataset: class Table: name: str full_name: str - data_source: "PowerBiAPI.DataSource" # We are supporting single data_source for the table + expression: Optional[str] id: str name: str @@ -286,7 +286,7 @@ class Report: webUrl: str embedUrl: str description: str - dataset: Optional["PowerBiAPI.Dataset"] + dataset: Optional["PowerBiAPI.PowerBIDataset"] pages: List["PowerBiAPI.Page"] users: List["PowerBiAPI.User"] @@ -304,7 +304,7 @@ class CreatedFrom(Enum): id: str title: str embedUrl: str - dataset: Optional["PowerBiAPI.Dataset"] + dataset: Optional["PowerBiAPI.PowerBIDataset"] report: Optional[Any] createdFrom: CreatedFrom @@ -584,7 +584,7 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: def get_data_sources( self, dataset: PowerBIDataset - ) -> Dict[str, "PowerBiAPI.DataSource"]: + ) -> Optional[Dict[str, "PowerBiAPI.DataSource"]]: """ Fetch the data source from PowerBi for the given dataset """ @@ -721,40 +721,6 @@ def new_dataset_or_report(tile_instance: Any) -> dict: return tiles - def process_extension_table( - self, data_source: "PowerBiAPI.DataSource", raw_table: dict - ) -> (str, str, str): - # All below four condition should meet to process the Extension data-source type - if data_source.type != "Extension": - LOGGER.debug(f"data_source ({data_source.id}) type is not Extension") - return None, None - if data_source.raw_connection_detail.get("connectionDetails") is None: - LOGGER.debug( - f"data_source ({data_source.id}) type is missing connectionDetails" - ) - return None, None - if ( - data_source.raw_connection_detail["connectionDetails"].get( - "extensionDataSourceKind" - ) - is None - ): - LOGGER.debug( - f"data_source ({data_source.id}) type is missing extensionDataSourceKind" - ) - return None, None - - if ( - data_source.raw_connection_detail["connectionDetails"][ - "extensionDataSourceKind" - ] - not in self.__config.dataset_type_mapping - ): - LOGGER.debug(f"expected platforms are {self.__config.dataset_type_mapping}") - return None, None - # fake and foo need to be find out from M-Query - return raw_table["name"], "foo_db.fake_schema.{}".format(raw_table["name"]) - def get_pages_by_report( self, workspace_id: str, report_id: str ) -> List["PowerBiAPI.Page"]: @@ -983,78 +949,27 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_id=dataset_dict["id"], ) dataset_map[dataset_instance.id] = dataset_instance - # Map of data-source attached to this dataset - data_source_map: Dict[ - str, PowerBiAPI.DataSource - ] = self.get_data_sources(dataset_instance) + # set dataset-name + dataset_name: str = ( + dataset_instance.name + if dataset_instance.name is not None + else dataset_instance.id + ) + for table in dataset_dict["tables"]: - warning_key_prefix: str = "{}_{}".format( - dataset_dict.get("id") if dataset_dict.get("name") is None else dataset_dict.get("name"), table["name"] + expression: str = ( + table["source"][0]["expression"] + if table.get("source") is not None and len(table["source"]) > 0 + else None ) - - if table.get("source") is None: - reporter.report_warning( - f"{warning_key_prefix}-source", - "table without source is not supported", - ) - continue - - if "Value.NativeQuery(" in table["source"][0]["expression"]: - reporter.report_warning( - f"{warning_key_prefix}-native-query", - "NativeQuery is not 
supported", - ) - continue - - if table.get("datasourceUsages") is None: - reporter.report_warning( - f"{warning_key_prefix}-no-source", - "table does not have any source", - ) - continue - - if len(table["datasourceUsages"]) > 1: - reporter.report_warning( - f"{warning_key_prefix}-many-source", - "Multiple data-sources for single table is not supported", - ) - continue - - data_source: PowerBiAPI.DataSource = data_source_map[ - table["datasourceUsages"][0]["datasourceInstanceId"] - ] - table_name: str = None - table_full_name: str = None - if data_source.type == "Extension": - table_name, table_full_name = self.process_extension_table( - data_source, table - ) - elif ( - self.__config.dataset_type_mapping.get(data_source.type) - is not None - ): - # PowerBi table name contains schema name and table name. Format is - table_name = table["name"].split(" ")[1] - table_schema_name: str = table["name"].split(" ")[0] - database_name: str = data_source.raw_connection_detail[ - "database" - ] - table_full_name = ( - f"{database_name}.{table_schema_name}.{table_name}" - ) - - if None in (table_name, table_full_name): - reporter.report_warning( - f"{warning_key_prefix}-extension", - f"The table source ({data_source.id}) is not belongs to supported platforms: {self.__config.dataset_type_mapping}", - ) - continue - dataset_instance.tables.append( PowerBiAPI.PowerBIDataset.Table( - full_name=table_full_name, - name=table_name, - data_source=data_source, + name=table["name"], + full_name="{}.{}".format( + dataset_name.replace(" ", "_"), + table["name"].replace(" ", "_"), + ), + expression=expression, ) ) @@ -1172,14 +1087,16 @@ def __to_datahub_dataset( for table in dataset.tables: # Create a URN for dataset ds_urn = builder.make_dataset_urn( - platform=self.__config.dataset_type_mapping[table.data_source.type], + platform=self.__config.platform_name, name=f"{table.full_name}", env=self.__config.env, ) LOGGER.info(f"{Constant.Dataset_URN}={ds_urn}") # Create datasetProperties mcp - ds_properties = DatasetPropertiesClass(description=table.name) + ds_properties = DatasetPropertiesClass( + name=table.name, description=table.name + ) info_mcp = self.new_mcp( entity_type=Constant.DATASET, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index 10aa0d3295e66..2154e4d7c2b56 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -1,11 +1,11 @@ [ { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { - "value": "{\"customProperties\": {}, \"description\": \"issue_history\", \"tags\": []}", + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", "contentType": "application/json" }, "systemMetadata": { @@ -15,7 +15,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -33,7 +33,7 @@ "changeType": 
"UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -125,4 +125,4 @@ "runId": "powerbi-test" } } -] \ No newline at end of file +] diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index 49bdb95b08602..331e4fde518dd 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -1,11 +1,11 @@ [ { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { - "value": "{\"customProperties\": {}, \"description\": \"issue_history\", \"tags\": []}", + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", "contentType": "application/json" }, "systemMetadata": { @@ -15,7 +15,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -117,7 +117,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": 
\"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -223,4 +223,4 @@ "runId": "powerbi-test" } } -] \ No newline at end of file +] diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index 20b51df7734a6..cfafce5d452a5 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -1,11 +1,11 @@ [ { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { - "value": "{\"customProperties\": {}, \"description\": \"issue_history\", \"tags\": []}", + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", "contentType": "application/json" }, "systemMetadata": { @@ -15,7 +15,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -117,7 +117,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -225,11 +225,11 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", 
"aspectName": "datasetProperties", "aspect": { - "value": "{\"customProperties\": {}, \"description\": \"issue_history\", \"tags\": []}", + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", "contentType": "application/json" }, "systemMetadata": { @@ -239,7 +239,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -341,7 +341,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -369,7 +369,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -475,4 +475,4 @@ "runId": "powerbi-test" } } -] \ No newline at end of file +] diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 15d9261898212..29497bdd122c2 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,88 +1,75 @@ -from typing import List -import lark from lark import Tree from datahub.ingestion.source.powerbi import m_parser -# def test_parse_m_query1(): -# expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' -# parse_tree: Tree = 
m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == "TESTTABLE_Table" -# -# -# def test_parse_m_query2(): -# expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Custom2"' -# -# -# def test_parse_m_query3(): -# expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Conditional Column"' -# -# -# def test_parse_m_query4(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Changed 
Type"' -# -# -# def test_parse_m_query5(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Renamed Columns"' -# -# -# def test_parse_m_query6(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query7(): -# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == "Source" -# -# -# def test_parse_m_query8(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each 
Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query9(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query10(): -# expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Changed Type1"' -# -# -# def test_parse_m_query11(): -# expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from 
OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == "Source" -# -# -# def test_parse_m_query12(): -# expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert get_output_dataset(parse_tree) == '"Added Custom"' - -def test_find_schema_detail(): - expression: str = "let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n DG_RBAC_ACL_Schema = GSL_TEST_DB_Database{[Name=\"DG_RBAC_ACL\",Kind=\"Schema\"]}[Data],\n SALES_Table = DG_RBAC_ACL_Schema{[Name=\"SALES\",Kind=\"Table\"]}[Data]\nin\n SALES_Table" - full_name: str = m_parser.find_full_name_and_lineage() - -def test_x(): - expression: str = "CALCULATE(SUM('Invoiced Revenue'[Services]),Targets[SERVICE_QUOTA] > 0)" - try: - m_parser.parse_expression(expression) - assert 1 != 1 - except lark.exceptions.UnexpectedCharacters: - pass + +def test_parse_m_query1(): + expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == "TESTTABLE_Table" + + +def test_parse_m_query2(): + expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed 
Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Custom2"' + + +def test_parse_m_query3(): + expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Conditional Column"' + + +def test_parse_m_query4(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Changed Type"' + + +def test_parse_m_query5(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", 
"CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Renamed Columns"' + + +def test_parse_m_query6(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Custom"' + + +def test_parse_m_query7(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == "Source" + + +def test_parse_m_query8(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query9(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", 
[Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query10(): + expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Changed Type1"' + + +def test_parse_m_query11(): + expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == "Source" + + +def test_parse_m_query12(): + expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = 
\'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == '"Added Custom"' diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index ac608dcce9e9d..40d441b9cbc91 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -153,7 +153,7 @@ def register_mock_api(request_mock): { "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", } - ] + ], } ], } From 3b6a4224a10089786b81392f78d2d9716cf4e81e Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 24 Nov 2022 22:52:15 +0530 Subject: [PATCH 10/53] debug log --- .../src/datahub/ingestion/source/powerbi/powerbi.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index ec4f25fa784ee..ca41c3d0c429f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -998,9 +998,7 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: # Scan is complete lets take the result scan_result = get_scan_result(scan_id=scan_id) - import json - - print(json.dumps(scan_result, indent=1)) + LOGGER.debug(f"scan result = {scan_result}") workspace = PowerBiAPI.Workspace( id=scan_result["id"], name=scan_result["name"], From 90f6870a4393c7be9fe1299e4a4ecc42545b0dc4 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 1 Dec 2022 07:47:09 +0530 Subject: [PATCH 11/53] WIP --- .../data/template/StringArrayArray.class | Bin 4132 -> 0 bytes .../ingestion/source/powerbi/powerbi.py | 38 +++++++++++++++++- .../integration/powerbi/test_m_parser.py | 7 ++++ 3 files changed, 44 insertions(+), 1 deletion(-) delete mode 100644 metadata-events/mxe-schemas/bin/mainGeneratedDataTemplate/com/linkedin/data/template/StringArrayArray.class diff --git a/metadata-events/mxe-schemas/bin/mainGeneratedDataTemplate/com/linkedin/data/template/StringArrayArray.class b/metadata-events/mxe-schemas/bin/mainGeneratedDataTemplate/com/linkedin/data/template/StringArrayArray.class deleted file mode 100644 index 6dfa8dc0081ea527ef6ffa88a3185dea78c8aa4f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4132 zcmbtWYje{^6g``mD3L`IObF%~Xan^lU|xlS1I0L@xHW{w!$1;)(gg)uJ ze}O;1*A`|7Ei;sFo#~J2^zKTwMIs9*%;4y;_uPBWId@n8{QKuK07vkvgf51(W-ex` zM(u`PF>0BL>Zlnvw;aQ)+2e+-&QoF$`67+pzRXt_fRmRHtDJkeNPEMgT30PwcQF|%iDH`(SJdolRFdqx)+d;RcNh{Kd0C+^ zr+b%zHRvI^ibhR8*I1m>tuj}cAyzaswK}C*2EX?nt|hs&8)wzJci%H6JRl*>uz7V} zURu$x9T_R?QILczQ_c*(+Q%@Q2xzJqk`ifJI3M$_ebYy~(mGW4RD zG!veHCj%p1qcsU1G3;I~eULsfx>4kkoMo5^6+eJlJ9v-sO_kVZT;Ej4cnn(uU|RlS zCqE=%9W#7@ES*OrU%77vnN!}}>_!tI z`_+nFh*zjp%^LYW6Brgmw7{T9dE$_Nqh2>HN3XP&avRPo 
zP_e*pC_H80VM=+WD_6XlBbr%X;)t~EU+0KD^okxaD4MBTnm*ZZ>J1`u%zps1)OxkS z$x67%a5!Xf(?V{xylOkGb|ugj8gCQRI(jE8%d})z*u&4cTMBOD4jo$>1y5S#1Qp^b+LLmf>DV%Ln^A1vQxb>Id#0DtwR%U)34;vSc<$`_l$LNjLb0OVqTNM=)Fp z+f@Rc_Zs^$c2bqy^pV*`A3t=&0|RmXWkpY7^xFsCMU?K&(F*==IL01BxeqDw0FmxT zh{XmUV(1CBKF8aSF!DPs>BesQl|=ymo*TfsbkD;R^nVSyl#_H0@nhb@K1LyoUMZ=k zNL^gU=rZTiY$|eR_9PlyFS6f6arA3+VJ8AfveWsq|yyrrUkLCFtCOb=XSZpxb4KXzdQiLsqhaV>YaFs)?6n8hVZ zLeRL3&xyM}T)|Z@?TB~-M5v43XpM{ABc7%N9`^?0lS1z_x{lCYMR0J6uJ~IywTx@e z!HVfNg6pvTWYYsa3S_UUUWzm54cC)4k5J?rfyhS*`B(?!MS&dQbM8Z~i<)nGx{+$J z`{+;qjp$QYF>jX!<&9d@Q-hjIh?W1nJMQ_#7q}{_I9UV{sUQ#(BA+6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index ca41c3d0c429f..a9cf7c40e9d5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -45,7 +45,7 @@ OwnershipClass, OwnershipTypeClass, StatusClass, - SubTypesClass, + SubTypesClass, UpstreamClass, DatasetLineageTypeClass, UpstreamLineageClass, ) from datahub.utilities.dedup_list import deduplicate_list @@ -999,6 +999,8 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: # Scan is complete lets take the result scan_result = get_scan_result(scan_id=scan_id) LOGGER.debug(f"scan result = {scan_result}") + import json + print(json.dumps(scan_result, indent=1)) workspace = PowerBiAPI.Workspace( id=scan_result["id"], name=scan_result["name"], @@ -1110,6 +1112,40 @@ def __to_datahub_dataset( aspect_name=Constant.STATUS, aspect=StatusClass(removed=False), ) + if table.name == 'two_source_table': + upstreams: List[UpstreamClass] = [] + upstream_urn = builder.make_dataset_urn_with_platform_instance( + "snowflake", + "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW", + "GSL_TEST_WH", + ) + + upstream_table = UpstreamClass( + upstream_urn, + DatasetLineageTypeClass.TRANSFORMED, + ) + + upstreams.append(upstream_table) + + upstream_urn2 = builder.make_dataset_urn( + "postgres", + "mics.public.order_date", + ) + upstream_table2 = UpstreamClass( + upstream_urn2, + DatasetLineageTypeClass.TRANSFORMED, + ) + upstreams.append(upstream_table2) + + upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn=ds_urn, + aspect=upstream_lineage, + ) + + dataset_mcps.extend([mcp]) dataset_mcps.extend([info_mcp, status_mcp]) diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 29497bdd122c2..8a2a7ca01b50b 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -73,3 +73,10 @@ def test_parse_m_query12(): expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional 
Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' parse_tree: Tree = m_parser.parse_expression(expression) assert m_parser.get_output_dataset(parse_tree) == '"Added Custom"' + + +def test_parse_m_query13(): + expression: str = 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table' + parse_tree: Tree = m_parser.parse_expression(expression) + assert m_parser.get_output_dataset(parse_tree) == 'two_source_table' + From 43f954a88c2c3ce1942e685126447442677e6a8b Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Fri, 2 Dec 2022 08:54:05 +0530 Subject: [PATCH 12/53] WIP --- .../ingestion/source/powerbi/m_parser.py | 65 ++++++- .../ingestion/source/powerbi/powerbi.py | 139 +++++--------- .../integration/powerbi/test_m_parser.py | 174 ++++++++++-------- 3 files changed, 202 insertions(+), 176 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 28d940ed12e37..a94744194b237 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,13 +1,22 @@ +from dataclasses import dataclass import importlib.resources as pkg_resource +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport import logging -from typing import List, Optional +from typing import List, Optional, Any -from lark import Lark, Tree +from lark import Lark, Tree, Token logger = logging.getLogger(__name__) -def get_output_dataset(root: Tree) -> Optional[str]: +@dataclass +class DataPlatformTable: + name: str + full_name: str + platform_type: str + + +def get_output_variable(root: Tree) -> Optional[str]: def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: for rule in rules: token_list = [x for x in tree.find_data(rule)] @@ -26,14 +35,62 @@ def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: def parse_expression(expression: str) -> Tree: + # Read lexical grammar as text grammar: str = pkg_resource.read_text( "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" ) + + # Create lark parser for the grammar text lark_parser = Lark(grammar, start="let_expression", regex=True) parse_tree: Tree = lark_parser.parse(expression) logger.debug("Parse Tree") - logger.debug(parse_tree.pretty()) + if logger.level == logging.DEBUG: # Guard condition to avoid heavy pretty() function call + logger.debug(parse_tree.pretty()) return parse_tree + + +def get_upstream_tables(expression, reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: + parse_tree = parse_expression(expression) + + output_variable = get_output_variable(parse_tree) + + filter: Any = 
parse_tree.find_data("variable") + + def find_variable(node: Tree, variable: str) -> bool: + for internal_child in node.children: + if isinstance(internal_child, Token): + if internal_child.value == variable: + return True + continue + return find_variable(internal_child, variable) + + return False + + for tree in filter: + if find_variable(tree, output_variable): + print("Mohd1") + print(tree.pretty()) + for node in tree.find_data("field_selection"): + print("Mohd2") + print(node) + + return [ + DataPlatformTable( + name="postgres_table", + full_name="book.public.test", + platform_type="PostgreSql" + ), + DataPlatformTable( + name="oracle_table", + full_name="book.public.test", + platform_type="Oracle" + ), + DataPlatformTable( + name="snowflake_table", + full_name="book.public.test", + platform_type="Snowflake" + ), + ] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index a9cf7c40e9d5d..1a29c933bb580 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -5,19 +5,18 @@ ######################################################### import logging -from dataclasses import dataclass, field as dataclass_field from enum import Enum from time import sleep -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast from xmlrpc.client import Boolean import msal -import pydantic import requests import datahub.emitter.mce_builder as builder +from dataclasses import dataclass from datahub.configuration.common import ConfigurationError -from datahub.configuration.source_common import EnvBasedSourceConfigBase +from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -30,6 +29,7 @@ ) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.powerbi.m_parser import DataPlatformTable from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps from datahub.metadata.schema_classes import ( BrowsePathsClass, @@ -48,6 +48,8 @@ SubTypesClass, UpstreamClass, DatasetLineageTypeClass, UpstreamLineageClass, ) from datahub.utilities.dedup_list import deduplicate_list +from datahub.ingestion.source.powerbi import m_parser +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport, PowerBiDashboardSourceConfig, PowerBiAPIConfig, PlatformDetail # Logger instance LOGGER = logging.getLogger(__name__) @@ -105,61 +107,6 @@ class Constant: HTTP_RESPONSE_STATUS_CODE = "HttpResponseStatusCode" -@dataclass -class PowerBiDashboardSourceReport(SourceReport): - dashboards_scanned: int = 0 - charts_scanned: int = 0 - filtered_dashboards: List[str] = dataclass_field(default_factory=list) - filtered_charts: List[str] = dataclass_field(default_factory=list) - - def report_dashboards_scanned(self, count: int = 1) -> None: - self.dashboards_scanned += count - - def report_charts_scanned(self, count: int = 1) -> None: - self.charts_scanned += count - - def report_dashboards_dropped(self, model: str) -> None: - self.filtered_dashboards.append(model) - - def report_charts_dropped(self, view: str) -> None: - self.filtered_charts.append(view) - - -class 
PowerBiAPIConfig(EnvBasedSourceConfigBase): - # Organsation Identifier - tenant_id: str = pydantic.Field(description="PowerBI tenant identifier") - # PowerBi workspace identifier - workspace_id: str = pydantic.Field(description="PowerBI workspace identifier") - # Dataset type mapping - dataset_type_mapping: Dict[str, str] = pydantic.Field( - description="Mapping of PowerBI datasource type to DataHub supported data-sources. See Quickstart Recipe for mapping" - ) - # Azure app client identifier - client_id: str = pydantic.Field(description="Azure app client identifier") - # Azure app client secret - client_secret: str = pydantic.Field(description="Azure app client secret") - # timeout for meta-data scanning - scan_timeout: int = pydantic.Field( - default=60, description="timeout for PowerBI metadata scanning" - ) - # Enable/Disable extracting ownership information of Dashboard - extract_ownership: bool = pydantic.Field( - default=True, description="Whether ownership should be ingested" - ) - # Enable/Disable extracting report information - extract_reports: bool = pydantic.Field( - default=True, description="Whether reports should be ingested" - ) - - -class PowerBiDashboardSourceConfig(PowerBiAPIConfig): - platform_name: str = "powerbi" - platform_urn: str = builder.make_data_platform_urn(platform=platform_name) - # Not supporting the pattern - # dashboard_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - # chart_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - - class PowerBiAPI: # API endpoints of PowerBi to fetch dashboards, tiles, datasets API_ENDPOINTS = { @@ -215,21 +162,21 @@ def __eq__(self, instance): def __hash__(self): return hash(self.__members()) + @dataclass + class Table: + name: str + full_name: str + expression: Optional[str] + # dataclasses for PowerBi Dashboard @dataclass class PowerBIDataset: - @dataclass - class Table: - name: str - full_name: str - expression: Optional[str] - id: str name: str webUrl: Optional[str] workspace_id: str # Table in datasets - tables: List["Table"] + tables: List["PowerBiAPI.Table"] def get_urn_part(self): return f"datasets.{self.id}" @@ -1034,8 +981,9 @@ def __eq__(self, instance): def __hash__(self): return id(self.id) - def __init__(self, config: PowerBiDashboardSourceConfig): + def __init__(self, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport): self.__config = config + self.__reporter = reporter def new_mcp( self, @@ -1112,40 +1060,43 @@ def __to_datahub_dataset( aspect_name=Constant.STATUS, aspect=StatusClass(removed=False), ) - if table.name == 'two_source_table': - upstreams: List[UpstreamClass] = [] + # Check if upstreams table is available, parse them and create dataset URN for each upstream table + upstreams: List[UpstreamClass] = [] + upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table.expression, self.__reporter) + for upstream_table in upstream_tables: + platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[upstream_table.platform_type] + platform_name: str = None + platform_instance_name: str = None + platform_env: str = DEFAULT_ENV + # Determine if PlatformDetail is provided + if isinstance(platform, PlatformDetail): + platform_name = cast(PlatformDetail, platform).platform + platform_instance_name = cast(PlatformDetail, platform).platform_instance + platform_env = cast(PlatformDetail, platform).env + else: + platform_name = platform + upstream_urn = builder.make_dataset_urn_with_platform_instance( - "snowflake", - 
"GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW", - "GSL_TEST_WH", + platform=platform_name, + platform_instance=platform_instance_name, + env=platform_env, + name=upstream_table.full_name, ) - upstream_table = UpstreamClass( upstream_urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table) - upstream_urn2 = builder.make_dataset_urn( - "postgres", - "mics.public.order_date", - ) - upstream_table2 = UpstreamClass( - upstream_urn2, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table2) - - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) - mcp = MetadataChangeProposalWrapper( - entityType="dataset", - changeType=ChangeTypeClass.UPSERT, - entityUrn=ds_urn, - aspect=upstream_lineage, - ) - - dataset_mcps.extend([mcp]) + if len(upstreams) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn=ds_urn, + aspect=upstream_lineage, + ) + dataset_mcps.extend([mcp]) dataset_mcps.extend([info_mcp, status_mcp]) @@ -1681,7 +1632,7 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): self.reporter = PowerBiDashboardSourceReport() self.auth_token = PowerBiAPI(self.source_config).get_access_token() self.powerbi_client = PowerBiAPI(self.source_config) - self.mapper = Mapper(config) + self.mapper = Mapper(config, self.reporter) @classmethod def create(cls, config_dict, ctx): diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 8a2a7ca01b50b..0ffa9e635f43f 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -2,81 +2,99 @@ from datahub.ingestion.source.powerbi import m_parser - -def test_parse_m_query1(): - expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == "TESTTABLE_Table" - - -def test_parse_m_query2(): - expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 
0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Custom2"' - - -def test_parse_m_query3(): - expression: str = 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Conditional Column"' - - -def test_parse_m_query4(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Changed Type"' - - -def test_parse_m_query5(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"' - parse_tree: Tree = 
m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Renamed Columns"' - - -def test_parse_m_query6(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Custom"' - - -def test_parse_m_query7(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source" - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == "Source" - - -def test_parse_m_query8(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query9(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = 
Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query10(): - expression: str = 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Changed Type1"' - - -def test_parse_m_query11(): - expression: str = "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source" - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == "Source" - - -def test_parse_m_query12(): - expression: str = 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced 
New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == '"Added Custom"' - - -def test_parse_m_query13(): - expression: str = 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table' - parse_tree: Tree = m_parser.parse_expression(expression) - assert m_parser.get_output_dataset(parse_tree) == 'two_source_table' - +M_QUERIES = [ + 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', + 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"', + 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 
\'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"', + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"', + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"', + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT *,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from 
V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"', + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select#(lf)CLIENT_ID,#(lf)PARTNER_ACCOUNT_NAME,#(lf)CM_CLOSING_MNTH_COUNTRY,#(lf)MONTH_WID,#(lf)PS_DELETES,#(lf)CLIENT_MANAGER_CLOSING_MONTH,#(lf)SME_DELETES,#(lf)TPV_AMV_OPENING,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_TPV_LEADERBOARD", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_WID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "MONTH_DATE", each Date.FromText(\nText.Range([MONTH_WID], 0,4) & "-" &\nText.Range([MONTH_WID], 4,2)\n)),\n #"Added Custom2" = Table.AddColumn(#"Added Custom", "Month", each Date.Month([MONTH_DATE])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom2", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"', + 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source", + 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", 
[Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"', + 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', +] + + +# def test_parse_m_query1(): +# expression: str = M_QUERIES[0] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == "TESTTABLE_Table" +# +# +# def test_parse_m_query2(): +# expression: str = M_QUERIES[1] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Custom2"' +# +# +# def test_parse_m_query3(): +# expression: str = M_QUERIES[2] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Conditional Column"' +# +# +# def test_parse_m_query4(): +# expression: str = M_QUERIES[3] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Changed Type"' +# +# +# def test_parse_m_query5(): +# expression: str = M_QUERIES[4] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Renamed Columns"' +# +# +# def test_parse_m_query6(): +# expression: str = M_QUERIES[5] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query7(): +# expression: str = M_QUERIES[6] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query8(): +# expression: str = M_QUERIES[7] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query9(): +# expression: str = M_QUERIES[8] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query10(): +# expression: str = M_QUERIES[9] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Changed Type1"' +# +# +# def 
test_parse_m_query11(): +# expression: str = M_QUERIES[10] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query12(): +# expression: str = M_QUERIES[11] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query13(): +# expression: str = M_QUERIES[12] +# parse_tree: Tree = m_parser.parse_expression(expression) +# assert m_parser.get_output_variable(parse_tree) == 'two_source_table' + +def test_get_upstream(): + m_parser.get_upstream_tables(M_QUERIES[0], None) From fe7c50573baeaed4ee3988d7ab88e5f620b77a75 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 7 Dec 2022 13:46:25 +0530 Subject: [PATCH 13/53] config --- .../ingestion/source/powerbi/config.py | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py new file mode 100644 index 0000000000000..363aedfeef9b9 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -0,0 +1,74 @@ +import pydantic +import datahub.emitter.mce_builder as builder + +from dataclasses import field as dataclass_field +from typing import List + +from dataclasses import dataclass +from datahub.configuration.source_common import EnvBasedSourceConfigBase, DEFAULT_ENV +from typing import Dict, Union +from datahub.ingestion.api.source import SourceReport + + +@dataclass +class PowerBiDashboardSourceReport(SourceReport): + dashboards_scanned: int = 0 + charts_scanned: int = 0 + filtered_dashboards: List[str] = dataclass_field(default_factory=list) + filtered_charts: List[str] = dataclass_field(default_factory=list) + + def report_dashboards_scanned(self, count: int = 1) -> None: + self.dashboards_scanned += count + + def report_charts_scanned(self, count: int = 1) -> None: + self.charts_scanned += count + + def report_dashboards_dropped(self, model: str) -> None: + self.filtered_dashboards.append(model) + + def report_charts_dropped(self, view: str) -> None: + self.filtered_charts.append(view) + + +@dataclass +class PlatformDetail: + platform: str = pydantic.Field(description="DataHub platform name. Example postgres or oracle or snowflake") + platform_instance: str = pydantic.Field(default=None, description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source") + env: str = pydantic.Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) + + +class PowerBiAPIConfig(EnvBasedSourceConfigBase): + # Organisation Identifier + tenant_id: str = pydantic.Field(description="PowerBI tenant identifier") + # PowerBi workspace identifier + workspace_id: str = pydantic.Field(description="PowerBI workspace identifier") + # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI + # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is + # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on. 
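    # A minimal sketch of the field declared just below, assuming its simpler
    # Dict[str, str] shape: PowerBI datasource kinds on the left, DataHub
    # platform names on the right. The richer Dict[str, PlatformDetail] shape
    # maps each key to a PlatformDetail(platform=..., platform_instance=...,
    # env=...) entry instead. The variable name here is hypothetical.
    example_dataset_type_mapping = {
        "PostgreSQL": "postgres",
        "Oracle": "oracle",
        "Snowflake": "snowflake",
    }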
+ dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = pydantic.Field( + description="Mapping of PowerBI datasource type to DataHub supported data-sources. See Quickstart Recipe for mapping" + ) + # Azure app client identifier + client_id: str = pydantic.Field(description="Azure app client identifier") + # Azure app client secret + client_secret: str = pydantic.Field(description="Azure app client secret") + # timeout for meta-data scanning + scan_timeout: int = pydantic.Field( + default=60, description="timeout for PowerBI metadata scanning" + ) + # Enable/Disable extracting ownership information of Dashboard + extract_ownership: bool = pydantic.Field( + default=True, description="Whether ownership should be ingested" + ) + # Enable/Disable extracting report information + extract_reports: bool = pydantic.Field( + default=True, description="Whether reports should be ingested" + ) + + +class PowerBiDashboardSourceConfig(PowerBiAPIConfig): + platform_name: str = "powerbi" + platform_urn: str = builder.make_data_platform_urn(platform=platform_name) From f31c2e46273e6134316d4ae0aecc8f47bccdf8e9 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 7 Dec 2022 21:52:18 +0530 Subject: [PATCH 14/53] WIP --- .../ingestion/source/powerbi/config.py | 51 + .../ingestion/source/powerbi/m_parser.py | 91 +- .../ingestion/source/powerbi/powerbi.py | 918 +----------------- .../integration/powerbi/test_m_parser.py | 8 +- 4 files changed, 129 insertions(+), 939 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 363aedfeef9b9..164dfb63ed612 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -9,6 +9,57 @@ from typing import Dict, Union from datahub.ingestion.api.source import SourceReport +class Constant: + """ + keys used in powerbi plugin + """ + + PBIAccessToken = "PBIAccessToken" + DASHBOARD_LIST = "DASHBOARD_LIST" + TILE_LIST = "TILE_LIST" + REPORT_LIST = "REPORT_LIST" + PAGE_BY_REPORT = "PAGE_BY_REPORT" + DATASET_GET = "DATASET_GET" + REPORT_GET = "REPORT_GET" + DATASOURCE_GET = "DATASOURCE_GET" + TILE_GET = "TILE_GET" + ENTITY_USER_LIST = "ENTITY_USER_LIST" + SCAN_CREATE = "SCAN_CREATE" + SCAN_GET = "SCAN_GET" + SCAN_RESULT_GET = "SCAN_RESULT_GET" + Authorization = "Authorization" + WorkspaceId = "WorkspaceId" + DashboardId = "DashboardId" + DatasetId = "DatasetId" + ReportId = "ReportId" + SCAN_ID = "ScanId" + Dataset_URN = "DatasetURN" + CHART_URN = "ChartURN" + CHART = "chart" + CORP_USER = "corpuser" + CORP_USER_INFO = "corpUserInfo" + CORP_USER_KEY = "corpUserKey" + CHART_INFO = "chartInfo" + STATUS = "status" + CHART_ID = "powerbi.linkedin.com/charts/{}" + CHART_KEY = "chartKey" + DASHBOARD_ID = "powerbi.linkedin.com/dashboards/{}" + DASHBOARD = "dashboard" + DASHBOARD_KEY = "dashboardKey" + OWNERSHIP = "ownership" + BROWSERPATH = "browsePaths" + DASHBOARD_INFO = "dashboardInfo" + DATAPLATFORM_INSTANCE = "dataPlatformInstance" + DATASET = "dataset" + DATASET_ID = "powerbi.linkedin.com/datasets/{}" + DATASET_KEY = "datasetKey" + DATASET_PROPERTIES = "datasetProperties" + VALUE = "value" + ENTITY = "ENTITY" + ID = "ID" + HTTP_RESPONSE_TEXT = "HttpResponseText" + HTTP_RESPONSE_STATUS_CODE = "HttpResponseStatusCode" + @dataclass class PowerBiDashboardSourceReport(SourceReport): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index a94744194b237..04d6dd5da69f3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,8 +1,11 @@ +from abc import ABC + from dataclasses import dataclass import importlib.resources as pkg_resource from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI import logging -from typing import List, Optional, Any +from typing import List, Optional, Any, Dict from lark import Lark, Tree, Token @@ -16,6 +19,45 @@ class DataPlatformTable: platform_type: str +class AbstractMQueryResolver(ABC): + pass + + +class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): + pass + + +class PostgresMQueryResolver(AbstractDataAccessMQueryResolver): + pass + + +class OracleMQueryResolver(AbstractDataAccessMQueryResolver): + pass + + +class SnowflakeMQueryResolver(AbstractDataAccessMQueryResolver): + pass + + +class AbstractTableAccessMQueryResolver(AbstractDataAccessMQueryResolver, ABC): + pass + + +class TableCombineMQueryResolver(AbstractTableAccessMQueryResolver): + pass + + +DATA_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { + "PostgreSQL.Database": PostgresMQueryResolver, + "Oracle.Database": OracleMQueryResolver, + "Snowflake.Database": SnowflakeMQueryResolver, +} + +TABLE_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { + "Table.Combine": TableCombineMQueryResolver, +} + + def get_output_variable(root: Tree) -> Optional[str]: def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: for rule in rules: @@ -52,30 +94,35 @@ def parse_expression(expression: str) -> Tree: return parse_tree -def get_upstream_tables(expression, reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: - parse_tree = parse_expression(expression) +def get_upstream_tables(table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: + parse_tree = parse_expression(table.expression) output_variable = get_output_variable(parse_tree) - filter: Any = parse_tree.find_data("variable") - - def find_variable(node: Tree, variable: str) -> bool: - for internal_child in node.children: - if isinstance(internal_child, Token): - if internal_child.value == variable: - return True - continue - return find_variable(internal_child, variable) - - return False - - for tree in filter: - if find_variable(tree, output_variable): - print("Mohd1") - print(tree.pretty()) - for node in tree.find_data("field_selection"): - print("Mohd2") - print(node) + filter: Any = parse_tree.find_data("invoke_expression") + tokens: List[Any] = list(filter) + print("Length = {}".format(len(tokens))) + for tree in tokens: + print(tree.pretty()) + + # filter: Any = parse_tree.find_data("variable") + # def find_variable(node: Tree, variable: str) -> bool: + # for internal_child in node.children: + # if isinstance(internal_child, Token): + # if internal_child.value == variable: + # return True + # continue + # return find_variable(internal_child, variable) + # + # return False + # + # for tree in filter: + # if find_variable(tree, output_variable): + # print("Mohd1") + # print(tree.pretty()) + # for node in tree.find_data("field_selection"): + # print("Mohd2") + # print(node) return [ DataPlatformTable( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py 
b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 1a29c933bb580..e11525c6f0cdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -5,17 +5,9 @@ ######################################################### import logging -from enum import Enum -from time import sleep from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast -from xmlrpc.client import Boolean - -import msal -import requests import datahub.emitter.mce_builder as builder -from dataclasses import dataclass -from datahub.configuration.common import ConfigurationError from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -49,919 +41,13 @@ ) from datahub.utilities.dedup_list import deduplicate_list from datahub.ingestion.source.powerbi import m_parser -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport, PowerBiDashboardSourceConfig, PowerBiAPIConfig, PlatformDetail - +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport, PowerBiDashboardSourceConfig, PlatformDetail +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI # Logger instance LOGGER = logging.getLogger(__name__) -class Constant: - """ - keys used in powerbi plugin - """ - - PBIAccessToken = "PBIAccessToken" - DASHBOARD_LIST = "DASHBOARD_LIST" - TILE_LIST = "TILE_LIST" - REPORT_LIST = "REPORT_LIST" - PAGE_BY_REPORT = "PAGE_BY_REPORT" - DATASET_GET = "DATASET_GET" - REPORT_GET = "REPORT_GET" - DATASOURCE_GET = "DATASOURCE_GET" - TILE_GET = "TILE_GET" - ENTITY_USER_LIST = "ENTITY_USER_LIST" - SCAN_CREATE = "SCAN_CREATE" - SCAN_GET = "SCAN_GET" - SCAN_RESULT_GET = "SCAN_RESULT_GET" - Authorization = "Authorization" - WorkspaceId = "WorkspaceId" - DashboardId = "DashboardId" - DatasetId = "DatasetId" - ReportId = "ReportId" - SCAN_ID = "ScanId" - Dataset_URN = "DatasetURN" - CHART_URN = "ChartURN" - CHART = "chart" - CORP_USER = "corpuser" - CORP_USER_INFO = "corpUserInfo" - CORP_USER_KEY = "corpUserKey" - CHART_INFO = "chartInfo" - STATUS = "status" - CHART_ID = "powerbi.linkedin.com/charts/{}" - CHART_KEY = "chartKey" - DASHBOARD_ID = "powerbi.linkedin.com/dashboards/{}" - DASHBOARD = "dashboard" - DASHBOARD_KEY = "dashboardKey" - OWNERSHIP = "ownership" - BROWSERPATH = "browsePaths" - DASHBOARD_INFO = "dashboardInfo" - DATAPLATFORM_INSTANCE = "dataPlatformInstance" - DATASET = "dataset" - DATASET_ID = "powerbi.linkedin.com/datasets/{}" - DATASET_KEY = "datasetKey" - DATASET_PROPERTIES = "datasetProperties" - VALUE = "value" - ENTITY = "ENTITY" - ID = "ID" - HTTP_RESPONSE_TEXT = "HttpResponseText" - HTTP_RESPONSE_STATUS_CODE = "HttpResponseStatusCode" - - -class PowerBiAPI: - # API endpoints of PowerBi to fetch dashboards, tiles, datasets - API_ENDPOINTS = { - Constant.DASHBOARD_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/dashboards", - Constant.ENTITY_USER_LIST: "{POWERBI_ADMIN_BASE_URL}/{ENTITY}/{ENTITY_ID}/users", - Constant.TILE_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/dashboards/{DASHBOARD_ID}/tiles", - Constant.DATASET_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/datasets/{DATASET_ID}", - Constant.DATASOURCE_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/datasets/{DATASET_ID}/datasources", - Constant.REPORT_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}", - Constant.REPORT_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports", - 
Constant.SCAN_GET: "{POWERBI_ADMIN_BASE_URL}/workspaces/scanStatus/{SCAN_ID}", - Constant.SCAN_RESULT_GET: "{POWERBI_ADMIN_BASE_URL}/workspaces/scanResult/{SCAN_ID}", - Constant.SCAN_CREATE: "{POWERBI_ADMIN_BASE_URL}/workspaces/getInfo", - Constant.PAGE_BY_REPORT: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}/pages", - } - - SCOPE: str = "https://analysis.windows.net/powerbi/api/.default" - BASE_URL: str = "https://api.powerbi.com/v1.0/myorg/groups" - ADMIN_BASE_URL: str = "https://api.powerbi.com/v1.0/myorg/admin" - AUTHORITY: str = "https://login.microsoftonline.com/" - - @dataclass - class Workspace: - """ - PowerBi Workspace - """ - - id: str - name: str - state: str - dashboards: List[Any] - datasets: Dict[str, "PowerBiAPI.PowerBIDataset"] - - @dataclass - class DataSource: - """ - PowerBi - """ - - id: str - type: str - raw_connection_detail: Dict - - def __members(self): - return (self.id,) - - def __eq__(self, instance): - return ( - isinstance(instance, PowerBiAPI.DataSource) - and self.__members() == instance.__members() - ) - - def __hash__(self): - return hash(self.__members()) - - @dataclass - class Table: - name: str - full_name: str - expression: Optional[str] - - # dataclasses for PowerBi Dashboard - @dataclass - class PowerBIDataset: - id: str - name: str - webUrl: Optional[str] - workspace_id: str - # Table in datasets - tables: List["PowerBiAPI.Table"] - - def get_urn_part(self): - return f"datasets.{self.id}" - - def __members(self): - return (self.id,) - - def __eq__(self, instance): - return ( - isinstance(instance, PowerBiAPI.PowerBIDataset) - and self.__members() == instance.__members() - ) - - def __hash__(self): - return hash(self.__members()) - - @dataclass - class Page: - id: str - displayName: str - name: str - order: int - - def get_urn_part(self): - return f"pages.{self.id}" - - @dataclass - class User: - id: str - displayName: str - emailAddress: str - graphId: str - principalType: str - - def get_urn_part(self): - return f"users.{self.id}" - - def __members(self): - return (self.id,) - - def __eq__(self, instance): - return ( - isinstance(instance, PowerBiAPI.User) - and self.__members() == instance.__members() - ) - - def __hash__(self): - return hash(self.__members()) - - @dataclass - class Report: - id: str - name: str - webUrl: str - embedUrl: str - description: str - dataset: Optional["PowerBiAPI.PowerBIDataset"] - pages: List["PowerBiAPI.Page"] - users: List["PowerBiAPI.User"] - - def get_urn_part(self): - return f"reports.{self.id}" - - @dataclass - class Tile: - class CreatedFrom(Enum): - REPORT = "Report" - DATASET = "Dataset" - VISUALIZATION = "Visualization" - UNKNOWN = "UNKNOWN" - - id: str - title: str - embedUrl: str - dataset: Optional["PowerBiAPI.PowerBIDataset"] - report: Optional[Any] - createdFrom: CreatedFrom - - def get_urn_part(self): - return f"charts.{self.id}" - - @dataclass - class Dashboard: - id: str - displayName: str - embedUrl: str - webUrl: str - isReadOnly: Any - workspace_id: str - workspace_name: str - tiles: List["PowerBiAPI.Tile"] - users: List["PowerBiAPI.User"] - - def get_urn_part(self): - return f"dashboards.{self.id}" - - def __members(self): - return (self.id,) - - def __eq__(self, instance): - return ( - isinstance(instance, PowerBiAPI.Dashboard) - and self.__members() == instance.__members() - ) - - def __hash__(self): - return hash(self.__members()) - - def __init__(self, config: PowerBiAPIConfig) -> None: - self.__config: PowerBiAPIConfig = config - self.__access_token: str = "" - # Power-Bi Auth 
(Service Principal Auth) - self.__msal_client = msal.ConfidentialClientApplication( - self.__config.client_id, - client_credential=self.__config.client_secret, - authority=PowerBiAPI.AUTHORITY + self.__config.tenant_id, - ) - - # Test connection by generating a access token - LOGGER.info("Trying to connect to {}".format(self.__get_authority_url())) - self.get_access_token() - LOGGER.info("Able to connect to {}".format(self.__get_authority_url())) - - def __get_authority_url(self): - return "{}{}".format(PowerBiAPI.AUTHORITY, self.__config.tenant_id) - - def __get_users(self, workspace_id: str, entity: str, id: str) -> List[User]: - """ - Get user for the given PowerBi entity - """ - users: List[PowerBiAPI.User] = [] - if self.__config.extract_ownership is False: - LOGGER.info( - "ExtractOwnership capabilities is disabled from configuration and hence returning empty users list" - ) - return users - - user_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.ENTITY_USER_LIST] - # Replace place holders - user_list_endpoint = user_list_endpoint.format( - POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, - ENTITY=entity, - ENTITY_ID=id, - ) - # Hit PowerBi - LOGGER.info(f"Request to URL={user_list_endpoint}") - response = requests.get( - user_list_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - LOGGER.warning( - f"Failed to fetch user list from power-bi for, http_status={response.status_code}, message={response.text}" - ) - - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.ENTITY}={entity}") - LOGGER.info(f"{Constant.ID}={id}") - raise ConnectionError("Failed to fetch the user list from the power-bi") - - users_dict: List[Any] = response.json()[Constant.VALUE] - - # Iterate through response and create a list of PowerBiAPI.Dashboard - users = [ - PowerBiAPI.User( - id=instance.get("identifier"), - displayName=instance.get("displayName"), - emailAddress=instance.get("emailAddress"), - graphId=instance.get("graphId"), - principalType=instance.get("principalType"), - ) - for instance in users_dict - ] - - return users - - def __get_report(self, workspace_id: str, report_id: str) -> "PowerBiAPI.Report": - """ - Fetch the report from PowerBi for the given report identifier - """ - if workspace_id is None or report_id is None: - LOGGER.info("Input values are None") - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.ReportId}={report_id}") - return None - - report_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_GET] - # Replace place holders - report_get_endpoint = report_get_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=workspace_id, - REPORT_ID=report_id, - ) - # Hit PowerBi - LOGGER.info(f"Request to report URL={report_get_endpoint}") - response = requests.get( - report_get_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - message: str = "Failed to fetch report from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.warning(f"{Constant.ReportId}={report_id}") - raise ConnectionError(message) - - response_dict = response.json() - - return PowerBiAPI.Report( - id=response_dict.get("id"), - name=response_dict.get("name"), - webUrl=response_dict.get("webUrl"), - embedUrl=response_dict.get("embedUrl"), - 
description=response_dict.get("description"), - users=[], - pages=[], - dataset=self.get_dataset( - workspace_id=workspace_id, dataset_id=response_dict.get("datasetId") - ), - ) - - def get_access_token(self): - if self.__access_token != "": - LOGGER.info("Returning the cached access token") - return self.__access_token - - LOGGER.info("Generating PowerBi access token") - - auth_response = self.__msal_client.acquire_token_for_client( - scopes=[PowerBiAPI.SCOPE] - ) - - if not auth_response.get("access_token"): - LOGGER.warn( - "Failed to generate the PowerBi access token. Please check input configuration" - ) - raise ConfigurationError( - "Powerbi authorization failed . Please check your input configuration." - ) - - LOGGER.info("Generated PowerBi access token") - - self.__access_token = "Bearer {}".format(auth_response.get("access_token")) - - LOGGER.debug(f"{Constant.PBIAccessToken}={self.__access_token}") - - return self.__access_token - - def get_dashboard_users(self, dashboard: Dashboard) -> List[User]: - """ - Return list of dashboard users - """ - return self.__get_users( - workspace_id=dashboard.workspace_id, entity="dashboards", id=dashboard.id - ) - - def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: - """ - Get the list of dashboard from PowerBi for the given workspace identifier - - TODO: Pagination. As per REST API doc (https://docs.microsoft.com/en-us/rest/api/power-bi/dashboards/get-dashboards), there is no information available on pagination - """ - dashboard_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DASHBOARD_LIST] - # Replace place holders - dashboard_list_endpoint = dashboard_list_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, WORKSPACE_ID=workspace.id - ) - # Hit PowerBi - LOGGER.info(f"Request to URL={dashboard_list_endpoint}") - response = requests.get( - dashboard_list_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - LOGGER.warning("Failed to fetch dashboard list from power-bi for") - LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") - raise ConnectionError( - "Failed to fetch the dashboard list from the power-bi" - ) - - dashboards_dict: List[Any] = response.json()[Constant.VALUE] - - # Iterate through response and create a list of PowerBiAPI.Dashboard - dashboards: List[PowerBiAPI.Dashboard] = [ - PowerBiAPI.Dashboard( - id=instance.get("id"), - isReadOnly=instance.get("isReadOnly"), - displayName=instance.get("displayName"), - embedUrl=instance.get("embedUrl"), - webUrl=instance.get("webUrl"), - workspace_id=workspace.id, - workspace_name=workspace.name, - tiles=[], - users=[], - ) - for instance in dashboards_dict - if instance is not None - ] - - return dashboards - - def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: - """ - Fetch the dataset from PowerBi for the given dataset identifier - """ - if workspace_id is None or dataset_id is None: - LOGGER.info("Input values are None") - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.DatasetId}={dataset_id}") - return None - - dataset_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASET_GET] - # Replace place holders - dataset_get_endpoint = dataset_get_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=workspace_id, - DATASET_ID=dataset_id, - ) - # Hit PowerBi - LOGGER.info(f"Request to dataset URL={dataset_get_endpoint}") - response = requests.get( - dataset_get_endpoint, - 
headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - message: str = "Failed to fetch dataset from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.warning(f"{Constant.DatasetId}={dataset_id}") - raise ConnectionError(message) - - response_dict = response.json() - LOGGER.debug("datasets = {}".format(response_dict)) - # PowerBi Always return the webURL, in-case if it is None then setting complete webURL to None instead of None/details - return PowerBiAPI.PowerBIDataset( - id=response_dict.get("id"), - name=response_dict.get("name"), - webUrl="{}/details".format(response_dict.get("webUrl")) - if response_dict.get("webUrl") is not None - else None, - workspace_id=workspace_id, - tables=[], - ) - - def get_data_sources( - self, dataset: PowerBIDataset - ) -> Optional[Dict[str, "PowerBiAPI.DataSource"]]: - """ - Fetch the data source from PowerBi for the given dataset - """ - - datasource_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASOURCE_GET] - # Replace place holders - datasource_get_endpoint = datasource_get_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=dataset.workspace_id, - DATASET_ID=dataset.id, - ) - # Hit PowerBi - LOGGER.info(f"Request to datasource URL={datasource_get_endpoint}") - response = requests.get( - datasource_get_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - message: str = "Failed to fetch datasource from power-bi for" - LOGGER.warning(message) - LOGGER.warning("{}={}".format(Constant.WorkspaceId, dataset.workspace_id)) - LOGGER.warning("{}={}".format(Constant.DatasetId, dataset.id)) - LOGGER.warning("{}={}".format(Constant.HTTP_RESPONSE_TEXT, response.text)) - LOGGER.warning( - "{}={}".format(Constant.HTTP_RESPONSE_STATUS_CODE, response.status_code) - ) - - raise ConnectionError(message) - - res = response.json() - value = res["value"] - if len(value) == 0: - LOGGER.info( - f"datasource is not found for dataset {dataset.name}({dataset.id})" - ) - - return None - - data_sources: Dict[str, "PowerBiAPI.DataSource"] = {} - LOGGER.debug("data-sources = {}".format(value)) - for datasource_dict in value: - # Create datasource instance with basic detail available - datasource = PowerBiAPI.DataSource( - id=datasource_dict.get( - "datasourceId" - ), # datasourceId is not available in all cases - type=datasource_dict["datasourceType"], - raw_connection_detail=datasource_dict["connectionDetails"], - ) - - data_sources[datasource.id] = datasource - - return data_sources - - def get_tiles(self, workspace: Workspace, dashboard: Dashboard) -> List[Tile]: - - """ - Get the list of tiles from PowerBi for the given workspace identifier - - TODO: Pagination. As per REST API doc (https://docs.microsoft.com/en-us/rest/api/power-bi/dashboards/get-tiles), there is no information available on pagination - """ - - def new_dataset_or_report(tile_instance: Any) -> dict: - """ - Find out which is the data source for tile. 
It is either REPORT or DATASET - """ - report_fields = { - "dataset": ( - workspace.datasets[tile_instance.get("datasetId")] - if tile_instance.get("datasetId") is not None - else None - ), - "report": ( - self.__get_report( - workspace_id=workspace.id, - report_id=tile_instance.get("reportId"), - ) - if tile_instance.get("reportId") is not None - else None - ), - "createdFrom": PowerBiAPI.Tile.CreatedFrom.UNKNOWN, - } - - # Tile is either created from report or dataset or from custom visualization - if report_fields["report"] is not None: - report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.REPORT - elif report_fields["dataset"] is not None: - report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.DATASET - else: - report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.VISUALIZATION - - LOGGER.info( - f'Tile {tile_instance.get("title")}({tile_instance.get("id")}) is created from {report_fields["createdFrom"]}' - ) - - return report_fields - - tile_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.TILE_LIST] - # Replace place holders - tile_list_endpoint = tile_list_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=dashboard.workspace_id, - DASHBOARD_ID=dashboard.id, - ) - # Hit PowerBi - LOGGER.info("Request to URL={}".format(tile_list_endpoint)) - response = requests.get( - tile_list_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - LOGGER.warning("Failed to fetch tiles list from power-bi for") - LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace.id)) - LOGGER.warning("{}={}".format(Constant.DashboardId, dashboard.id)) - raise ConnectionError("Failed to fetch the tile list from the power-bi") - - # Iterate through response and create a list of PowerBiAPI.Dashboard - tile_dict: List[Any] = response.json()[Constant.VALUE] - LOGGER.debug("Tile Dict = {}".format(tile_dict)) - tiles: List[PowerBiAPI.Tile] = [ - PowerBiAPI.Tile( - id=instance.get("id"), - title=instance.get("title"), - embedUrl=instance.get("embedUrl"), - **new_dataset_or_report(instance), - ) - for instance in tile_dict - if instance is not None - ] - - return tiles - - def get_pages_by_report( - self, workspace_id: str, report_id: str - ) -> List["PowerBiAPI.Page"]: - """ - Fetch the report from PowerBi for the given report identifier - """ - if workspace_id is None or report_id is None: - LOGGER.info("workspace_id or report_id is None") - return [] - - pages_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.PAGE_BY_REPORT] - # Replace place holders - pages_endpoint = pages_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=workspace_id, - REPORT_ID=report_id, - ) - # Hit PowerBi - LOGGER.info(f"Request to pages URL={pages_endpoint}") - response = requests.get( - pages_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - message: str = "Failed to fetch reports from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") - raise ConnectionError(message) - - response_dict = response.json() - return [ - PowerBiAPI.Page( - id="{}.{}".format(report_id, raw_instance["name"].replace(" ", "_")), - name=raw_instance["name"], - displayName=raw_instance.get("displayName"), - order=raw_instance.get("order"), - ) - for raw_instance in response_dict["value"] - ] - - def get_reports( - self, workspace: 
"PowerBiAPI.Workspace" - ) -> List["PowerBiAPI.Report"]: - """ - Fetch the report from PowerBi for the given report identifier - """ - if workspace is None: - LOGGER.info("workspace is None") - LOGGER.info(f"{Constant.WorkspaceId}={workspace.id}") - return [] - - report_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_LIST] - # Replace place holders - report_list_endpoint = report_list_endpoint.format( - POWERBI_BASE_URL=PowerBiAPI.BASE_URL, - WORKSPACE_ID=workspace.id, - ) - # Hit PowerBi - LOGGER.info(f"Request to report URL={report_list_endpoint}") - response = requests.get( - report_list_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - - # Check if we got response from PowerBi - if response.status_code != 200: - message: str = "Failed to fetch reports from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") - raise ConnectionError(message) - - response_dict = response.json() - reports: List["PowerBiAPI.Report"] = [ - PowerBiAPI.Report( - id=raw_instance["id"], - name=raw_instance.get("name"), - webUrl=raw_instance.get("webUrl"), - embedUrl=raw_instance.get("embedUrl"), - description=raw_instance.get("description"), - pages=self.get_pages_by_report( - workspace_id=workspace.id, report_id=raw_instance["id"] - ), - users=self.__get_users( - workspace_id=workspace.id, entity="reports", id=raw_instance["id"] - ), - dataset=workspace.datasets.get(raw_instance.get("datasetId")), - ) - for raw_instance in response_dict["value"] - ] - - return reports - - # flake8: noqa: C901 - def get_workspace( - self, workspace_id: str, reporter: PowerBiDashboardSourceReport - ) -> Workspace: - """ - Return Workspace for the given workspace identifier i.e. workspace_id - """ - scan_create_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_CREATE] - scan_create_endpoint = scan_create_endpoint.format( - POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL - ) - - def create_scan_job(): - """ - Create scan job on PowerBi for the workspace - """ - request_body = {"workspaces": [workspace_id]} - - res = requests.post( - scan_create_endpoint, - data=request_body, - params={ - "datasetExpressions": True, - "datasetSchema": True, - "datasourceDetails": True, - "getArtifactUsers": True, - "lineage": True, - }, - headers={Constant.Authorization: self.get_access_token()}, - ) - - if res.status_code not in (200, 202): - message = f"API({scan_create_endpoint}) return error code {res.status_code} for workspace id({workspace_id})" - - LOGGER.warning(message) - - raise ConnectionError(message) - # Return Id of Scan created for the given workspace - id = res.json()["id"] - LOGGER.info("Scan id({})".format(id)) - return id - - def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Boolean: - """ - Poll the PowerBi service for workspace scan to complete - """ - minimum_sleep = 3 - if timeout < minimum_sleep: - LOGGER.info( - f"Setting timeout to minimum_sleep time {minimum_sleep} seconds" - ) - timeout = minimum_sleep - - max_trial = timeout // minimum_sleep - LOGGER.info(f"Max trial {max_trial}") - scan_get_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_GET] - scan_get_endpoint = scan_get_endpoint.format( - POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id - ) - - LOGGER.info(f"Hitting URL={scan_get_endpoint}") - - trail = 1 - while True: - LOGGER.info(f"Trial = {trail}") - res = requests.get( - scan_get_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - if res.status_code != 200: - message = 
f"API({scan_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - - LOGGER.warning(message) - - raise ConnectionError(message) - - if res.json()["status"].upper() == "Succeeded".upper(): - LOGGER.info(f"Scan result is available for scan id({scan_id})") - return True - - if trail == max_trial: - break - LOGGER.info(f"Sleeping for {minimum_sleep} seconds") - sleep(minimum_sleep) - trail += 1 - - # Result is not available - return False - - def get_scan_result(scan_id: str) -> dict: - LOGGER.info("Fetching scan result") - LOGGER.info(f"{Constant.SCAN_ID}={scan_id}") - scan_result_get_endpoint = PowerBiAPI.API_ENDPOINTS[ - Constant.SCAN_RESULT_GET - ] - scan_result_get_endpoint = scan_result_get_endpoint.format( - POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id - ) - - LOGGER.info(f"Hitting URL={scan_result_get_endpoint}") - res = requests.get( - scan_result_get_endpoint, - headers={Constant.Authorization: self.get_access_token()}, - ) - if res.status_code != 200: - message = f"API({scan_result_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - - LOGGER.warning(message) - - raise ConnectionError(message) - - return res.json()["workspaces"][0] - - def json_to_dataset_map(scan_result: dict) -> dict: - """ - Filter out "dataset" from scan_result and return PowerBiAPI.Dataset instance set - """ - datasets: Optional[Any] = scan_result.get("datasets") - dataset_map: dict = {} - - if datasets is None or len(datasets) == 0: - LOGGER.warning( - f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets' - ) - - LOGGER.info("Returning empty datasets") - return dataset_map - - for dataset_dict in datasets: - dataset_instance: PowerBiAPI.PowerBIDataset = self.get_dataset( - workspace_id=scan_result["id"], - dataset_id=dataset_dict["id"], - ) - dataset_map[dataset_instance.id] = dataset_instance - # set dataset-name - dataset_name: str = ( - dataset_instance.name - if dataset_instance.name is not None - else dataset_instance.id - ) - - for table in dataset_dict["tables"]: - expression: str = ( - table["source"][0]["expression"] - if table.get("source") is not None and len(table["source"]) > 0 - else None - ) - dataset_instance.tables.append( - PowerBiAPI.PowerBIDataset.Table( - name=table["name"], - full_name="{}.{}".format( - dataset_name.replace(" ", "_"), - table["name"].replace(" ", "_"), - ), - expression=expression, - ) - ) - - return dataset_map - - def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: - for dashboard in workspace.dashboards: - dashboard.tiles = self.get_tiles(workspace, dashboard=dashboard) - - return None - - LOGGER.info("Creating scan job for workspace") - LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.info("Hitting URL={}".format(scan_create_endpoint)) - scan_id = create_scan_job() - LOGGER.info("Waiting for scan to complete") - if ( - wait_for_scan_to_complete( - scan_id=scan_id, timeout=self.__config.scan_timeout - ) - is False - ): - raise ValueError( - "Workspace detail is not available. Please increase scan_timeout to wait." 
- ) - - # Scan is complete lets take the result - scan_result = get_scan_result(scan_id=scan_id) - LOGGER.debug(f"scan result = {scan_result}") - import json - print(json.dumps(scan_result, indent=1)) - workspace = PowerBiAPI.Workspace( - id=scan_result["id"], - name=scan_result["name"], - state=scan_result["state"], - datasets={}, - dashboards=[], - ) - # Get workspace dashboards - workspace.dashboards = self.get_dashboards(workspace) - - workspace.datasets = json_to_dataset_map(scan_result) - init_dashboard_tiles(workspace) - return workspace class Mapper: diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 0ffa9e635f43f..341906eae48d9 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,6 +1,7 @@ from lark import Tree from datahub.ingestion.source.powerbi import m_parser +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -97,4 +98,9 @@ # assert m_parser.get_output_variable(parse_tree) == 'two_source_table' def test_get_upstream(): - m_parser.get_upstream_tables(M_QUERIES[0], None) + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=M_QUERIES[0], + name="table-name", + full_name="db-name.schema-name.table-name", + ) + m_parser.get_upstream_tables(table, None) From 46dcafd15949f966ec8480b52b9021778b38c819 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 8 Dec 2022 00:04:00 +0530 Subject: [PATCH 15/53] WIP --- .../ingestion/source/powerbi/m_parser.py | 58 +++++++++++++++---- .../integration/powerbi/test_m_parser.py | 3 +- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 04d6dd5da69f3..75cc48172e95c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -5,11 +5,11 @@ from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.proxy import PowerBiAPI import logging -from typing import List, Optional, Any, Dict +from typing import List, Optional, Any, Dict, Union from lark import Lark, Tree, Token -logger = logging.getLogger(__name__) +LOGGER = logging.getLogger(__name__) @dataclass @@ -87,23 +87,61 @@ def parse_expression(expression: str) -> Tree: parse_tree: Tree = lark_parser.parse(expression) - logger.debug("Parse Tree") - if logger.level == logging.DEBUG: # Guard condition to avoid heavy pretty() function call - logger.debug(parse_tree.pretty()) + LOGGER.debug("Parse Tree") + if LOGGER.level == logging.DEBUG: # Guard condition to avoid heavy pretty() function call + LOGGER.debug(parse_tree.pretty()) return parse_tree +def get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: + #import pdb; pdb.set_trace() + + _filter: Any = parse_tree.find_data("invoke_expression") + + def print_leaf(node: Tree): + print(node.pretty()) + # if isinstance(node, Tree) and 
isinstance(tree.children[0], Token): + # print("It is token") + # return + # + # for child in tree.children: + # print_leaf(child) + + #print(next(next(_filter).children[0].find_data("letter_character"))) + _filter = next(_filter).children[0].find_data("letter_character") + for node in _filter: + print('======') + print(node) + print('======') + + + return None + + def get_upstream_tables(table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: parse_tree = parse_expression(table.expression) output_variable = get_output_variable(parse_tree) - filter: Any = parse_tree.find_data("invoke_expression") - tokens: List[Any] = list(filter) - print("Length = {}".format(len(tokens))) - for tree in tokens: - print(tree.pretty()) + _filter: Any = parse_tree.find_data("invoke_expression") + trees: List[Tree] = list(_filter) + if len(trees) > 1: + reporter.report_warning(table.full_name, f"{table.full_name} has more than one invoke expression") + return [] + + #print(trees[0]) + + resolver: AbstractMQueryResolver = get_resolver(parse_tree) + if resolver is None: + LOGGER.debug("Table full-name = %s", table.full_name) + LOGGER.debug("Expression = %s", table.expression) + reporter.report_warning( + table.full_name, + f"{table.full_name} M-Query resolver not found for the table expression" + ) + return [] + # filter: Any = parse_tree.find_data("variable") # def find_variable(node: Tree, variable: str) -> bool: diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 341906eae48d9..e5c2eb4b864ec 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -2,6 +2,7 @@ from datahub.ingestion.source.powerbi import m_parser from datahub.ingestion.source.powerbi.proxy import PowerBiAPI +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -103,4 +104,4 @@ def test_get_upstream(): name="table-name", full_name="db-name.schema-name.table-name", ) - m_parser.get_upstream_tables(table, None) + m_parser.get_upstream_tables(table, PowerBiDashboardSourceReport()) From c5c5acecdc545ad9e5da578e36c649b672a90a60 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Fri, 9 Dec 2022 23:20:10 +0530 Subject: [PATCH 16/53] working code for postgres --- .../ingestion/source/powerbi/m_parser.py | 363 +++++++++++++----- .../ingestion/source/powerbi/powerbi.py | 10 +- .../integration/powerbi/test_m_parser.py | 69 ++-- 3 files changed, 300 insertions(+), 142 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 75cc48172e95c..de091127660ae 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,11 +1,12 @@ -from abc import ABC +from abc import ABC, abstractmethod +from enum import Enum from dataclasses import dataclass import importlib.resources as pkg_resource from 
datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.proxy import PowerBiAPI import logging -from typing import List, Optional, Any, Dict, Union +from typing import List, Optional, Any, Dict, Union, cast from lark import Lark, Tree, Token @@ -19,64 +20,280 @@ class DataPlatformTable: platform_type: str +class SupportedDataPlatform(Enum): + POSTGRES_SQL = "PostgreSQL" + ORACLE = "Oracle" + MY_SQL = "MySql" + SNOWFLAKE = "Snowflake" + + +def _get_output_variable(root: Tree) -> Optional[str]: + def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: + for rule in rules: + token_list = [x for x in tree.find_data(rule)] + if len(token_list) > 0: + return token_list + + return [] + + for tree in root.find_data("in_expression"): + for child1 in get_token_list_for_any( + tree, ["letter_character", "quoted_identifier"] + ): + return child1.children[0].value # type: ignore + + return None + + +def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: + _filter = parse_tree.find_data("variable") + # filter will return statement of the form = + # We are searching for Tree where variable-name is matching with provided variable + for tree in _filter: + values: List[str] = _token_values(tree.children[0]) + if len(values) > 1: + # Rare chances to happen as PowerBI Grammar only have one identifier in variable-name rule + LOGGER.info("Found more than one value in variable_name rule") + return None + + if variable == values[0]: + return tree + + LOGGER.info("Provided variable(%s) not found in variable rule", variable) + + return None + + +def _get_first_rule(tree: Tree, rule: str) -> Optional[Tree]: + """ + Lark library doesn't have advance search function. + This function will return the first tree of provided rule + :param tree: Tree to search for the expression rule + :return: Tree + """ + def internal(node: Union[Tree, Token]) -> Optional[Tree]: + if isinstance(node, Tree) and node.data == rule: + return node + if isinstance(node, Token): + return None + + for child in cast(Tree, node).children: + node = internal(child) + if node is not None: + return node + + expression_tree: Optional[Tree] = internal(tree) + + return expression_tree + + +def _token_values(tree: Tree) -> List[str]: + """ + + :param tree: Tree to traverse + :return: List of leaf token data + """ + values: List[str] = [] + + def internal(node: Union[Tree, Token]): + if isinstance(node, Token): + values.append(cast(Token, node).value) + return + + for child in node.children: + internal(child) + + internal(tree) + + return values + + +def _remove_whitespaces_from_list(values: List[str]) -> List[str]: + result: List[str] = [] + for item in values: + if item.strip() not in ('', '\n', '\t'): + result.append(item) + + return result + + +def _strip_char_from_list(values: List[str], char: str) -> List[str]: + result: List[str] = [] + for item in values: + result.append(item.strip(char)) + + return result + + +def _make_function_name(tree: Tree) -> str: + values: List[str] = _token_values(tree) + return ".".join(values) + + class AbstractMQueryResolver(ABC): pass class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): - pass - + table: PowerBiAPI.Table + parse_tree: Tree + reporter: PowerBiDashboardSourceReport + + def __init__(self, table: PowerBiAPI.Table, parse_tree: Tree, reporter: PowerBiDashboardSourceReport): + self.table = table + self.parse_tree = parse_tree + self.reporter = reporter + + @abstractmethod + def 
resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + pass + + +class RelationalMQueryResolver(AbstractDataAccessMQueryResolver, ABC): + + def get_item_selector_tokens(self, variable_statement: Tree) -> (str, List[str]): + expression_tree: Tree = _get_first_rule(variable_statement, "expression") + item_selector: Tree = _get_first_rule(expression_tree, "item_selector") + identifier_tree: Tree = _get_first_rule(expression_tree, "identifier") + # remove whitespaces and quotes from token + tokens: List[str] = _strip_char_from_list(_remove_whitespaces_from_list(_token_values(item_selector)), "\"") + identifier: List[str] = _token_values(identifier_tree) + # convert tokens to dict + iterator = iter(tokens) + return identifier[0], dict(zip(iterator, iterator)) + + def get_argument_list(self, variable_statement: Tree) -> List[str]: + expression_tree: Tree = _get_first_rule(variable_statement, "expression") + argument_list: Tree = _get_first_rule(expression_tree, "argument_list") + # remove whitespaces and quotes from token + tokens: List[str] = _strip_char_from_list(_remove_whitespaces_from_list(_token_values(argument_list)), "\"") + return tokens + + def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + data_platform_tables: List[DataPlatformTable] = [] + # Look for output variable + output_variable: str = _get_output_variable(self.parse_tree) + if output_variable is None: + self.reporter.warnings( + f"{self.table.full_name}-output-variable", + "output-variable not found in table expression", + ) + return data_platform_tables + + full_table_name: str = self.get_full_table_name(output_variable) + if full_table_name is None: + LOGGER.debug("Fail to form full_table_name for PowerBI DataSet table %s", self.table.full_name) + return data_platform_tables + + return [ + DataPlatformTable( + name=full_table_name.split(".")[-1], + full_name=full_table_name, + platform_type=self.get_platform() + ), + ] + + @abstractmethod + def get_platform(self) -> str: + pass + + @abstractmethod + def get_full_table_name(self, output_variable: str) -> str: + pass + + +class PostgresMQueryResolver(RelationalMQueryResolver): + def get_full_table_name(self, output_variable: str) -> Optional[str]: + variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + if variable_statement is None: + self.reporter.warnings( + f"{self.table.full_name}-variable-statement", + "output variable statement not found in table expression", + ) + return None + source, tokens = self.get_item_selector_tokens(variable_statement) + schema_name: str = tokens["Schema"] + table_name: str = tokens["Item"] + # Look for database-name + variable_statement = _get_variable_statement(self.parse_tree, source) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-statement", + "source variable statement not found in table expression", + ) + return None + tokens = self.get_argument_list(variable_statement) + if len(tokens) < 1: + self.reporter.report_warning( + f"{self.table.full_name}-database-arg-list", + "Number of expected tokens in argument list are not present in table expression", + ) + return None + + database_name: str = tokens[1] # 1st token is database name + return f"{database_name}.{schema_name}.{table_name}" + + def get_platform(self) -> str: + return SupportedDataPlatform.POSTGRES_SQL.value -class PostgresMQueryResolver(AbstractDataAccessMQueryResolver): - pass class OracleMQueryResolver(AbstractDataAccessMQueryResolver): - pass + def 
resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + return [ + DataPlatformTable( + name="postgres_table", + full_name="book.public.test", + platform_type="Oracle" + ), + ] class SnowflakeMQueryResolver(AbstractDataAccessMQueryResolver): - pass + def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + return [ + DataPlatformTable( + name="postgres_table", + full_name="book.public.test", + platform_type="Snowflake" + ), + ] -class AbstractTableAccessMQueryResolver(AbstractDataAccessMQueryResolver, ABC): - pass - - -class TableCombineMQueryResolver(AbstractTableAccessMQueryResolver): - pass +def _get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: + _filter: Any = parse_tree.find_data("invoke_expression") -DATA_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { - "PostgreSQL.Database": PostgresMQueryResolver, - "Oracle.Database": OracleMQueryResolver, - "Snowflake.Database": SnowflakeMQueryResolver, -} + letter_tree: Tree = next(_filter).children[0] + data_access_func: str = _make_function_name(letter_tree) -TABLE_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { - "Table.Combine": TableCombineMQueryResolver, -} + LOGGER.debug( + "Looking for data-access(%s) resolver in data-access-function registry %s", + data_access_func, + DATA_ACCESS_RESOLVER, + ) + if DATA_ACCESS_RESOLVER.get(data_access_func) is None: + LOGGER.info("Resolver not found for %s", data_access_func) + return None -def get_output_variable(root: Tree) -> Optional[str]: - def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: - for rule in rules: - token_list = [x for x in tree.find_data(rule)] - if len(token_list) > 0: - return token_list + return DATA_ACCESS_RESOLVER[data_access_func] - return [] - for tree in root.find_data("in_expression"): - for child1 in get_token_list_for_any( - tree, ["letter_character", "quoted_identifier"] - ): - return child1.children[0].value # type: ignore +# Register M-Query resolver for specific database platform +DATA_ACCESS_RESOLVER: Dict[str, AbstractDataAccessMQueryResolver.__class__] = { + f"{SupportedDataPlatform.POSTGRES_SQL.value}.Database": PostgresMQueryResolver, + f"{SupportedDataPlatform.ORACLE.value}.Database": OracleMQueryResolver, + f"{SupportedDataPlatform.SNOWFLAKE.value}.Databases": SnowflakeMQueryResolver, +} - return None +# Register M-Query resolver for function call to resolve function arguments +TABLE_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { + "Table.Combine": None, +} -def parse_expression(expression: str) -> Tree: +def _parse_expression(expression: str) -> Tree: # Read lexical grammar as text grammar: str = pkg_resource.read_text( "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" @@ -94,45 +311,15 @@ def parse_expression(expression: str) -> Tree: return parse_tree -def get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: - #import pdb; pdb.set_trace() - - _filter: Any = parse_tree.find_data("invoke_expression") - - def print_leaf(node: Tree): - print(node.pretty()) - # if isinstance(node, Tree) and isinstance(tree.children[0], Token): - # print("It is token") - # return - # - # for child in tree.children: - # print_leaf(child) - - #print(next(next(_filter).children[0].find_data("letter_character"))) - _filter = next(_filter).children[0].find_data("letter_character") - for node in _filter: - print('======') - print(node) - print('======') - - - return None - - def get_upstream_tables(table: PowerBiAPI.Table, 
reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: - parse_tree = parse_expression(table.expression) - - output_variable = get_output_variable(parse_tree) + parse_tree = _parse_expression(table.expression) - _filter: Any = parse_tree.find_data("invoke_expression") - trees: List[Tree] = list(_filter) + trees: List[Tree] = list(parse_tree.find_data("invoke_expression")) if len(trees) > 1: reporter.report_warning(table.full_name, f"{table.full_name} has more than one invoke expression") return [] - #print(trees[0]) - - resolver: AbstractMQueryResolver = get_resolver(parse_tree) + resolver: AbstractDataAccessMQueryResolver = _get_resolver(parse_tree) if resolver is None: LOGGER.debug("Table full-name = %s", table.full_name) LOGGER.debug("Expression = %s", table.expression) @@ -142,40 +329,4 @@ def get_upstream_tables(table: PowerBiAPI.Table, reporter: PowerBiDashboardSourc ) return [] - - # filter: Any = parse_tree.find_data("variable") - # def find_variable(node: Tree, variable: str) -> bool: - # for internal_child in node.children: - # if isinstance(internal_child, Token): - # if internal_child.value == variable: - # return True - # continue - # return find_variable(internal_child, variable) - # - # return False - # - # for tree in filter: - # if find_variable(tree, output_variable): - # print("Mohd1") - # print(tree.pretty()) - # for node in tree.find_data("field_selection"): - # print("Mohd2") - # print(node) - - return [ - DataPlatformTable( - name="postgres_table", - full_name="book.public.test", - platform_type="PostgreSql" - ), - DataPlatformTable( - name="oracle_table", - full_name="book.public.test", - platform_type="Oracle" - ), - DataPlatformTable( - name="snowflake_table", - full_name="book.public.test", - platform_type="Snowflake" - ), - ] + return resolver(table, parse_tree, reporter).resolve_to_data_platform_table_list() diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index e11525c6f0cdd..7d484cf757b00 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -41,15 +41,17 @@ ) from datahub.utilities.dedup_list import deduplicate_list from datahub.ingestion.source.powerbi import m_parser -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport, PowerBiDashboardSourceConfig, PlatformDetail +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceReport, + PowerBiDashboardSourceConfig, + PlatformDetail, + Constant +) from datahub.ingestion.source.powerbi.proxy import PowerBiAPI # Logger instance LOGGER = logging.getLogger(__name__) - - - class Mapper: """ Transfrom PowerBi concepts Dashboard, Dataset and Tile to DataHub concepts Dashboard, Dataset and Chart diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index e5c2eb4b864ec..3c08c9faea59b 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -18,90 +18,95 @@ "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) 
as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source", 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"', 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', + 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', ] # def test_parse_m_query1(): # expression: str = M_QUERIES[0] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == "TESTTABLE_Table" +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" # # # def test_parse_m_query2(): # expression: str = M_QUERIES[1] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Custom2"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' # # # def test_parse_m_query3(): # expression: str = M_QUERIES[2] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Conditional Column"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' # # # def test_parse_m_query4(): # expression: str = M_QUERIES[3] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Changed Type"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' # # # def test_parse_m_query5(): # expression: str = M_QUERIES[4] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Renamed Columns"' +# parse_tree: Tree = 
m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' # # # def test_parse_m_query6(): # expression: str = M_QUERIES[5] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Custom"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' # # # def test_parse_m_query7(): # expression: str = M_QUERIES[6] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == "Source" +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "Source" # # # def test_parse_m_query8(): # expression: str = M_QUERIES[7] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Custom1"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' # # # def test_parse_m_query9(): # expression: str = M_QUERIES[8] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Custom1"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' # # # def test_parse_m_query10(): # expression: str = M_QUERIES[9] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Changed Type1"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' # # # def test_parse_m_query11(): # expression: str = M_QUERIES[10] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == "Source" +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "Source" # # # def test_parse_m_query12(): # expression: str = M_QUERIES[11] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == '"Added Custom"' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' # # # def test_parse_m_query13(): # expression: str = M_QUERIES[12] -# parse_tree: Tree = m_parser.parse_expression(expression) -# assert m_parser.get_output_variable(parse_tree) == 'two_source_table' +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == 'two_source_table' def test_get_upstream(): - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=M_QUERIES[0], - name="table-name", - full_name="db-name.schema-name.table-name", - ) - m_parser.get_upstream_tables(table, PowerBiDashboardSourceReport()) + qs = [M_QUERIES[0], M_QUERIES[-1]] + for q in qs: + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="table-name", + full_name="db-name.schema-name.table-name", + ) + reporter = PowerBiDashboardSourceReport() + print(m_parser.get_upstream_tables(table, reporter)) + From c86b23f19099135a4b80a12c944c6277ce530771 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Sat, 10 Dec 2022 22:37:30 +0530 Subject: [PATCH 17/53] WIP --- .../ingestion/source/powerbi/config.py | 23 +- .../ingestion/source/powerbi/m_parser.py | 188 +++++++++++---- 
.../ingestion/source/powerbi/powerbi.py | 72 +++--- .../integration/powerbi/test_m_parser.py | 217 +++++++++++------- 4 files changed, 326 insertions(+), 174 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 164dfb63ed612..b4a1422a11ada 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -1,4 +1,7 @@ import pydantic + +from pydantic import validator + import datahub.emitter.mce_builder as builder from dataclasses import field as dataclass_field @@ -8,6 +11,7 @@ from datahub.configuration.source_common import EnvBasedSourceConfigBase, DEFAULT_ENV from typing import Dict, Union from datahub.ingestion.api.source import SourceReport +from datahub.ingestion.source.powerbi import m_parser class Constant: """ @@ -83,7 +87,6 @@ def report_charts_dropped(self, view: str) -> None: @dataclass class PlatformDetail: - platform: str = pydantic.Field(description="DataHub platform name. Example postgres or oracle or snowflake") platform_instance: str = pydantic.Field(default=None, description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source") env: str = pydantic.Field( default=DEFAULT_ENV, @@ -114,11 +117,29 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): extract_ownership: bool = pydantic.Field( default=True, description="Whether ownership should be ingested" ) + # Enable/Disable extracting lineage information of PowerBI Dataset + extract_lineage: bool = pydantic.Field( + default=True, description="Whether lineage should be ingested" + ) # Enable/Disable extracting report information extract_reports: bool = pydantic.Field( default=True, description="Whether reports should be ingested" ) + @validator("dataset_type_mapping") + @classmethod + def check_dataset_type_mapping(cls, value): + # For backward compatibility map input PostgreSql to PostgreSQL + if "PostgreSql" in value.keys(): + platform_name = value["PostgreSql"] + del value["PostgreSql"] + value["PostgreSQL"] = platform_name + + for key in value.keys(): + if key not in m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING.keys(): + raise ValueError(f"DataPlatform {key} is not supported") + return value + class PowerBiDashboardSourceConfig(PowerBiAPIConfig): platform_name: str = "powerbi" diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index de091127660ae..7a69aad810a01 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from enum import Enum +from functools import partial from dataclasses import dataclass import importlib.resources as pkg_resource @@ -23,26 +24,22 @@ class DataPlatformTable: class SupportedDataPlatform(Enum): POSTGRES_SQL = "PostgreSQL" ORACLE = "Oracle" - MY_SQL = "MySql" SNOWFLAKE = "Snowflake" -def _get_output_variable(root: Tree) -> Optional[str]: - def get_token_list_for_any(tree: Tree, rules: List[str]) -> List[Tree]: - for rule in rules: - token_list = [x for x in tree.find_data(rule)] - if len(token_list) > 0: - return token_list - - return [] +POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING: Dict[str, str] = { + SupportedDataPlatform.POSTGRES_SQL.value: "postgres", + 
SupportedDataPlatform.ORACLE.value: "oracle", + SupportedDataPlatform.SNOWFLAKE.value: "snowflake", +} - for tree in root.find_data("in_expression"): - for child1 in get_token_list_for_any( - tree, ["letter_character", "quoted_identifier"] - ): - return child1.children[0].value # type: ignore - return None +def _get_output_variable(root: Tree) -> Optional[str]: + in_expression_tree: Tree = _get_first_rule(root, "in_expression") + # Get list of terminal value + # Remove any whitespaces + # Remove any spaces + return "".join(_strip_char_from_list(_remove_whitespaces_from_list(_token_values(in_expression_tree)), " ")) def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: @@ -51,12 +48,11 @@ def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: # We are searching for Tree where variable-name is matching with provided variable for tree in _filter: values: List[str] = _token_values(tree.children[0]) - if len(values) > 1: - # Rare chances to happen as PowerBI Grammar only have one identifier in variable-name rule - LOGGER.info("Found more than one value in variable_name rule") - return None + actual_value: str = "".join(_strip_char_from_list(values, " ")) + LOGGER.info("Actual Value = %s", actual_value) + LOGGER.info("Expected Value = %s", variable) - if variable == values[0]: + if actual_value == variable: return tree LOGGER.info("Provided variable(%s) not found in variable rule", variable) @@ -143,18 +139,24 @@ def __init__(self, table: PowerBiAPI.Table, parse_tree: Tree, reporter: PowerBiD self.table = table self.parse_tree = parse_tree self.reporter = reporter + self.first_expression_func = partial(_get_first_rule, rule="expression") + self.first_item_selector_func = partial(_get_first_rule, rule="item_selector") + self.first_arg_list_func = partial(_get_first_rule, rule="argument_list") + self.first_identifier_func = partial(_get_first_rule, rule="identifier") + + @abstractmethod def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: pass -class RelationalMQueryResolver(AbstractDataAccessMQueryResolver, ABC): +class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): def get_item_selector_tokens(self, variable_statement: Tree) -> (str, List[str]): - expression_tree: Tree = _get_first_rule(variable_statement, "expression") - item_selector: Tree = _get_first_rule(expression_tree, "item_selector") - identifier_tree: Tree = _get_first_rule(expression_tree, "identifier") + expression_tree: Tree = self.first_expression_func(variable_statement) + item_selector: Tree = self.first_item_selector_func(expression_tree) + identifier_tree: Tree = self.first_identifier_func(expression_tree) # remove whitespaces and quotes from token tokens: List[str] = _strip_char_from_list(_remove_whitespaces_from_list(_token_values(item_selector)), "\"") identifier: List[str] = _token_values(identifier_tree) @@ -163,8 +165,8 @@ def get_item_selector_tokens(self, variable_statement: Tree) -> (str, List[str]) return identifier[0], dict(zip(iterator, iterator)) def get_argument_list(self, variable_statement: Tree) -> List[str]: - expression_tree: Tree = _get_first_rule(variable_statement, "expression") - argument_list: Tree = _get_first_rule(expression_tree, "argument_list") + expression_tree: Tree = self.first_expression_func(variable_statement) + argument_list: Tree = self.first_arg_list_func(expression_tree) # remove whitespaces and quotes from token tokens: List[str] = 
_strip_char_from_list(_remove_whitespaces_from_list(_token_values(argument_list)), "\"") return tokens @@ -202,13 +204,13 @@ def get_full_table_name(self, output_variable: str) -> str: pass -class PostgresMQueryResolver(RelationalMQueryResolver): +class PostgresMQueryResolver(BaseMQueryResolver): def get_full_table_name(self, output_variable: str) -> Optional[str]: variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) if variable_statement is None: - self.reporter.warnings( + self.reporter.report_warning( f"{self.table.full_name}-variable-statement", - "output variable statement not found in table expression", + f"output variable ({output_variable}) statement not found in table expression", ) return None source, tokens = self.get_item_selector_tokens(variable_statement) @@ -219,14 +221,14 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: if variable_statement is None: self.reporter.report_warning( f"{self.table.full_name}-source-statement", - "source variable statement not found in table expression", + f"source variable {source} statement not found in table expression", ) return None tokens = self.get_argument_list(variable_statement) if len(tokens) < 1: self.reporter.report_warning( f"{self.table.full_name}-database-arg-list", - "Number of expected tokens in argument list are not present in table expression", + "Expected number of argument not found in data-access function of table expression", ) return None @@ -237,27 +239,115 @@ def get_platform(self) -> str: return SupportedDataPlatform.POSTGRES_SQL.value +class OracleMQueryResolver(BaseMQueryResolver): + def get_platform(self) -> str: + return SupportedDataPlatform.ORACLE.value -class OracleMQueryResolver(AbstractDataAccessMQueryResolver): - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: - return [ - DataPlatformTable( - name="postgres_table", - full_name="book.public.test", - platform_type="Oracle" - ), - ] + def _get_db_name(self, value: str) -> Optional[str]: + error_message: str = f"The target argument ({value}) should in the format of :/[.]" + splitter_result: List[str] = value.split("/") + if len(splitter_result) != 2: + self.reporter.report_warning( + f"{self.table.full_name}-oracle-target", + error_message + ) + return None + db_name = splitter_result[1].split(".")[0] -class SnowflakeMQueryResolver(AbstractDataAccessMQueryResolver): - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: - return [ - DataPlatformTable( - name="postgres_table", - full_name="book.public.test", - platform_type="Snowflake" - ), - ] + return db_name + + def get_full_table_name(self, output_variable: str) -> str: + # Find step for the output variable + variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({output_variable}) statement not found in table expression", + ) + return None + + schema_variable, tokens = self.get_item_selector_tokens(variable_statement) + table_name: str = tokens["Name"] + + # Find step for the schema variable + variable_statement = _get_variable_statement(self.parse_tree, schema_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-schema-variable-statement", + f"schema variable ({schema_variable}) statement not found in table expression", + ) + return None + + source_variable, tokens = 
self.get_item_selector_tokens(variable_statement) + schema_name: str = tokens["Schema"] + + # Find step for the database access variable + variable_statement = _get_variable_statement(self.parse_tree, source_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-variable-statement", + f"schema variable ({source_variable}) statement not found in table expression", + ) + return None + tokens = self.get_argument_list(variable_statement) + if len(tokens) < 1: + self.reporter.report_warning( + f"{self.table.full_name}-database-arg-list", + "Expected number of argument not found in data-access function of table expression", + ) + return None + # The first argument has database name. format localhost:1521/salesdb.GSLAB.COM + db_name: Optional[str] = self._get_db_name(tokens[0]) + if db_name is None: + LOGGER.debug(f"Fail to extract db name from the target {tokens[0]}") + + return f"{db_name}.{schema_name}.{table_name}" + + +class SnowflakeMQueryResolver(BaseMQueryResolver): + def get_platform(self) -> str: + return SupportedDataPlatform.SNOWFLAKE.value + + def get_full_table_name(self, output_variable: str) -> str: + # Find step for the output variable + variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({output_variable}) statement not found in table expression", + ) + return None + + schema_variable, tokens = self.get_item_selector_tokens(variable_statement) + table_name: str = tokens["Name"] + + # Find step for the schema variable + variable_statement = _get_variable_statement(self.parse_tree, schema_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-schema-variable-statement", + f"schema variable ({schema_variable}) statement not found in table expression", + ) + return None + + source_variable, tokens = self.get_item_selector_tokens(variable_statement) + schema_name: str = tokens["Name"] + + # Find step for the database access variable + variable_statement = _get_variable_statement(self.parse_tree, source_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-variable-statement", + f"schema variable ({source_variable}) statement not found in table expression", + ) + return None + _, tokens = self.get_item_selector_tokens(variable_statement) + db_name: str = tokens["Name"] + + return f"{db_name}.{schema_name}.{table_name}" def _get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 7d484cf757b00..2878ffdc5680a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -148,45 +148,43 @@ def __to_datahub_dataset( aspect_name=Constant.STATUS, aspect=StatusClass(removed=False), ) + dataset_mcps.extend([info_mcp, status_mcp]) + # Check if upstreams table is available, parse them and create dataset URN for each upstream table - upstreams: List[UpstreamClass] = [] - upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table.expression, self.__reporter) - for upstream_table in upstream_tables: - platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[upstream_table.platform_type] 
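# [Editor's note] Illustrative sketch only, not part of the patch. The hunk around
# this point replaces the old PlatformDetail.platform lookup with the static
# POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING and emits an UpstreamLineage aspect per
# dataset table. A minimal, self-contained version of that flow, assuming a single
# resolved PostgreSQL upstream; the URN, env, and table names below are example
# values, not taken from the patch.
import datahub.emitter.mce_builder as builder
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import (
    ChangeTypeClass,
    DatasetLineageTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

# Assumed example values; in the source they come from the parsed M-Query,
# dataset_type_mapping, and the PowerBI dataset table being processed.
upstream_full_name = "mics.public.order_date"
datahub_platform = "postgres"  # POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING["PostgreSQL"]
powerbi_dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:powerbi,demo.table,PROD)"

# Build the upstream dataset URN, then wrap it in an UpstreamLineage aspect and
# attach that aspect to the PowerBI dataset via a MetadataChangeProposalWrapper.
upstream_urn = builder.make_dataset_urn_with_platform_instance(
    platform=datahub_platform,
    platform_instance=None,
    env="PROD",
    name=upstream_full_name,
)
lineage_mcp = MetadataChangeProposalWrapper(
    entityType="dataset",
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=powerbi_dataset_urn,
    aspect=UpstreamLineageClass(
        upstreams=[UpstreamClass(upstream_urn, DatasetLineageTypeClass.TRANSFORMED)]
    ),
)
# [Editor's note] End of sketch; the original diff continues below.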
- platform_name: str = None - platform_instance_name: str = None - platform_env: str = DEFAULT_ENV - # Determine if PlatformDetail is provided - if isinstance(platform, PlatformDetail): - platform_name = cast(PlatformDetail, platform).platform - platform_instance_name = cast(PlatformDetail, platform).platform_instance - platform_env = cast(PlatformDetail, platform).env - else: - platform_name = platform - - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=platform_name, - platform_instance=platform_instance_name, - env=platform_env, - name=upstream_table.full_name, - ) - upstream_table = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table) - - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) - mcp = MetadataChangeProposalWrapper( - entityType="dataset", - changeType=ChangeTypeClass.UPSERT, - entityUrn=ds_urn, - aspect=upstream_lineage, + if self.__config.extract_lineage is True: + upstreams: List[UpstreamClass] = [] + upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table.expression, self.__reporter) + for upstream_table in upstream_tables: + platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[upstream_table.platform_type] + platform_name: str = m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING[upstream_table.platform_type] + platform_instance_name: str = None + platform_env: str = DEFAULT_ENV + # Determine if PlatformDetail is provided + if isinstance(platform, PlatformDetail): + platform_instance_name = cast(PlatformDetail, platform).platform_instance + platform_env = cast(PlatformDetail, platform).env + + upstream_urn = builder.make_dataset_urn_with_platform_instance( + platform=platform_name, + platform_instance=platform_instance_name, + env=platform_env, + name=upstream_table.full_name, ) - dataset_mcps.extend([mcp]) - - dataset_mcps.extend([info_mcp, status_mcp]) + upstream_table = UpstreamClass( + upstream_urn, + DatasetLineageTypeClass.TRANSFORMED, + ) + upstreams.append(upstream_table) + + if len(upstreams) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn=ds_urn, + aspect=upstream_lineage, + ) + dataset_mcps.extend([mcp]) return dataset_mcps diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 3c08c9faea59b..9128b5912b86b 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,6 +1,9 @@ +from typing import List + from lark import Tree from datahub.ingestion.source.powerbi import m_parser +from datahub.ingestion.source.powerbi.m_parser import DataPlatformTable, SupportedDataPlatform from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport @@ -19,94 +22,134 @@ 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE 
IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"', 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', + 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1' ] -# def test_parse_m_query1(): -# expression: str = M_QUERIES[0] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" -# -# -# def test_parse_m_query2(): -# expression: str = M_QUERIES[1] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' -# -# -# def test_parse_m_query3(): -# expression: str = M_QUERIES[2] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' -# -# -# def test_parse_m_query4(): -# expression: str = M_QUERIES[3] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' -# -# -# def test_parse_m_query5(): -# expression: str = M_QUERIES[4] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' -# -# -# def test_parse_m_query6(): -# expression: str = M_QUERIES[5] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query7(): -# expression: str = M_QUERIES[6] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query8(): -# expression: str = M_QUERIES[7] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query9(): -# expression: str = M_QUERIES[8] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query10(): -# expression: str = M_QUERIES[9] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' -# -# -# def test_parse_m_query11(): -# expression: str = 
M_QUERIES[10] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query12(): -# expression: str = M_QUERIES[11] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query13(): -# expression: str = M_QUERIES[12] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == 'two_source_table' - -def test_get_upstream(): - qs = [M_QUERIES[0], M_QUERIES[-1]] - for q in qs: - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="table-name", - full_name="db-name.schema-name.table-name", - ) - reporter = PowerBiDashboardSourceReport() - print(m_parser.get_upstream_tables(table, reporter)) +def test_parse_m_query1(): + expression: str = M_QUERIES[0] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" + + +def test_parse_m_query2(): + expression: str = M_QUERIES[1] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' + + +def test_parse_m_query3(): + expression: str = M_QUERIES[2] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' + + +def test_parse_m_query4(): + expression: str = M_QUERIES[3] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' + + +def test_parse_m_query5(): + expression: str = M_QUERIES[4] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' + + +def test_parse_m_query6(): + expression: str = M_QUERIES[5] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query7(): + expression: str = M_QUERIES[6] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query8(): + expression: str = M_QUERIES[7] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query9(): + expression: str = M_QUERIES[8] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query10(): + expression: str = M_QUERIES[9] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' + + +def test_parse_m_query11(): + expression: str = M_QUERIES[10] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query12(): + expression: str = M_QUERIES[11] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query13(): + expression: str = M_QUERIES[12] + parse_tree: Tree = m_parser._parse_expression(expression) + assert m_parser._get_output_variable(parse_tree) == 'two_source_table' + + +def test_postgres_regular_case(): + q: str = M_QUERIES[13] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + 
name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "order_date" + assert data_platform_tables[0].full_name == "mics.public.order_date" + assert data_platform_tables[0].platform_type == SupportedDataPlatform.POSTGRES_SQL.value + + +def test_oracle_regular_case(): + q: str = M_QUERIES[14] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "EMPLOYEES" + assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" + assert data_platform_tables[0].platform_type == SupportedDataPlatform.ORACLE.value + + +def test_snowflake_regular_case(): + q: str = M_QUERIES[0] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "TESTTABLE" + assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" + assert data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value From d7c046448b9a061f6895468408c1b3da3cb2bffb Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 12 Dec 2022 13:55:36 +0530 Subject: [PATCH 18/53] lint fix --- .../ingestion/source/powerbi/config.py | 22 +- .../ingestion/source/powerbi/m_parser.py | 246 +++++++++++++----- .../ingestion/source/powerbi/powerbi.py | 69 ++--- .../integration/powerbi/test_m_parser.py | 34 ++- .../tests/integration/powerbi/test_powerbi.py | 1 + 5 files changed, 253 insertions(+), 119 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index b4a1422a11ada..db0f5858d997e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -1,18 +1,15 @@ -import pydantic +from dataclasses import dataclass, field as dataclass_field +from typing import Dict, List, Union +import pydantic from pydantic import validator import datahub.emitter.mce_builder as builder - -from dataclasses import field as dataclass_field -from typing import List - -from dataclasses import dataclass -from datahub.configuration.source_common import EnvBasedSourceConfigBase, DEFAULT_ENV -from typing import Dict, Union +from datahub.configuration.source_common import DEFAULT_ENV, EnvBasedSourceConfigBase from datahub.ingestion.api.source import SourceReport from datahub.ingestion.source.powerbi import m_parser + class Constant: """ keys used in powerbi plugin @@ -87,7 +84,10 @@ def report_charts_dropped(self, view: str) -> None: @dataclass class PlatformDetail: - platform_instance: str = pydantic.Field(default=None, description="DataHub platform instance name. 
It should be same as you have used in ingestion receipe of DataHub platform ingestion source") + platform_instance: str = pydantic.Field( + default=None, + description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source", + ) env: str = pydantic.Field( default=DEFAULT_ENV, description="The environment that all assets produced by DataHub platform ingestion source belong to", @@ -102,7 +102,9 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): # Dataset type mapping PowerBI support many type of data-sources. Here user need to define what type of PowerBI # DataSource need to be mapped to corresponding DataHub Platform DataSource. For example PowerBI `Snowflake` is # mapped to DataHub `snowflake` PowerBI `PostgreSQL` is mapped to DataHub `postgres` and so on. - dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]] = pydantic.Field( + dataset_type_mapping: Union[ + Dict[str, str], Dict[str, PlatformDetail] + ] = pydantic.Field( description="Mapping of PowerBI datasource type to DataHub supported data-sources. See Quickstart Recipe for mapping" ) # Azure app client identifier diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 7a69aad810a01..0b40639f8aadf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -1,15 +1,16 @@ +import importlib.resources as pkg_resource +import logging from abc import ABC, abstractmethod +from dataclasses import dataclass from enum import Enum from functools import partial +from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast + +import lark +from lark import Lark, Token, Tree -from dataclasses import dataclass -import importlib.resources as pkg_resource from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -import logging -from typing import List, Optional, Any, Dict, Union, cast - -from lark import Lark, Tree, Token LOGGER = logging.getLogger(__name__) @@ -35,11 +36,17 @@ class SupportedDataPlatform(Enum): def _get_output_variable(root: Tree) -> Optional[str]: - in_expression_tree: Tree = _get_first_rule(root, "in_expression") + in_expression_tree: Optional[Tree] = _get_first_rule(root, "in_expression") + if in_expression_tree is None: + return None # Get list of terminal value # Remove any whitespaces # Remove any spaces - return "".join(_strip_char_from_list(_remove_whitespaces_from_list(_token_values(in_expression_tree)), " ")) + return "".join( + _strip_char_from_list( + _remove_whitespaces_from_list(_token_values(in_expression_tree)), " " + ) + ) def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: @@ -49,8 +56,8 @@ def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: for tree in _filter: values: List[str] = _token_values(tree.children[0]) actual_value: str = "".join(_strip_char_from_list(values, " ")) - LOGGER.info("Actual Value = %s", actual_value) - LOGGER.info("Expected Value = %s", variable) + LOGGER.debug("Actual Value = %s", actual_value) + LOGGER.debug("Expected Value = %s", variable) if actual_value == variable: return tree @@ -67,6 +74,7 @@ def _get_first_rule(tree: Tree, rule: str) -> Optional[Tree]: :param tree: Tree to search for the expression rule :return: Tree """ + def internal(node: 
Union[Tree, Token]) -> Optional[Tree]: if isinstance(node, Tree) and node.data == rule: return node @@ -74,9 +82,11 @@ def internal(node: Union[Tree, Token]) -> Optional[Tree]: return None for child in cast(Tree, node).children: - node = internal(child) - if node is not None: - return node + child_node: Optional[Tree] = internal(child) + if child_node is not None: + return child_node + + return None expression_tree: Optional[Tree] = internal(tree) @@ -91,7 +101,7 @@ def _token_values(tree: Tree) -> List[str]: """ values: List[str] = [] - def internal(node: Union[Tree, Token]): + def internal(node: Union[Tree, Token]) -> None: if isinstance(node, Token): values.append(cast(Token, node).value) return @@ -107,7 +117,7 @@ def internal(node: Union[Tree, Token]): def _remove_whitespaces_from_list(values: List[str]) -> List[str]: result: List[str] = [] for item in values: - if item.strip() not in ('', '\n', '\t'): + if item.strip() not in ("", "\n", "\t"): result.append(item) return result @@ -135,7 +145,12 @@ class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): parse_tree: Tree reporter: PowerBiDashboardSourceReport - def __init__(self, table: PowerBiAPI.Table, parse_tree: Tree, reporter: PowerBiDashboardSourceReport): + def __init__( + self, + table: PowerBiAPI.Table, + parse_tree: Tree, + reporter: PowerBiDashboardSourceReport, + ): self.table = table self.parse_tree = parse_tree self.reporter = reporter @@ -144,54 +159,87 @@ def __init__(self, table: PowerBiAPI.Table, parse_tree: Tree, reporter: PowerBiD self.first_arg_list_func = partial(_get_first_rule, rule="argument_list") self.first_identifier_func = partial(_get_first_rule, rule="identifier") - - @abstractmethod def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: pass class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): + def get_item_selector_tokens( + self, variable_statement: Tree + ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: + expression_tree: Optional[Tree] = self.first_expression_func(variable_statement) + if expression_tree is None: + LOGGER.debug("Expression tree not found") + LOGGER.debug(variable_statement.pretty()) + return None, None + + item_selector: Optional[Tree] = self.first_item_selector_func(expression_tree) + if item_selector is None: + LOGGER.debug("Item Selector not found in tree") + LOGGER.debug(variable_statement.pretty()) + return None, None + + identifier_tree: Optional[Tree] = self.first_identifier_func(expression_tree) + if identifier_tree is None: + LOGGER.debug("Identifier not found in tree") + LOGGER.debug(variable_statement.pretty()) + return None, None - def get_item_selector_tokens(self, variable_statement: Tree) -> (str, List[str]): - expression_tree: Tree = self.first_expression_func(variable_statement) - item_selector: Tree = self.first_item_selector_func(expression_tree) - identifier_tree: Tree = self.first_identifier_func(expression_tree) # remove whitespaces and quotes from token - tokens: List[str] = _strip_char_from_list(_remove_whitespaces_from_list(_token_values(item_selector)), "\"") - identifier: List[str] = _token_values(identifier_tree) + tokens: List[str] = _strip_char_from_list( + _remove_whitespaces_from_list(_token_values(cast(Tree, item_selector))), + '"', + ) + identifier: List[str] = _token_values( + cast(Tree, identifier_tree) + ) # type :ignore # convert tokens to dict iterator = iter(tokens) + # cast to satisfy lint return identifier[0], dict(zip(iterator, iterator)) - def get_argument_list(self, variable_statement: Tree) 
-> List[str]: - expression_tree: Tree = self.first_expression_func(variable_statement) - argument_list: Tree = self.first_arg_list_func(expression_tree) + def get_argument_list(self, variable_statement: Tree) -> Optional[List[str]]: + expression_tree: Optional[Tree] = self.first_expression_func(variable_statement) + if expression_tree is None: + LOGGER.debug("First expression rule not found in input tree") + return None + + argument_list: Optional[Tree] = self.first_arg_list_func(expression_tree) + if argument_list is None: + LOGGER.debug("First argument-list rule not found in input tree") + return None + # remove whitespaces and quotes from token - tokens: List[str] = _strip_char_from_list(_remove_whitespaces_from_list(_token_values(argument_list)), "\"") + tokens: List[str] = _strip_char_from_list( + _remove_whitespaces_from_list(_token_values(argument_list)), '"' + ) return tokens def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] # Look for output variable - output_variable: str = _get_output_variable(self.parse_tree) + output_variable: Optional[str] = _get_output_variable(self.parse_tree) if output_variable is None: - self.reporter.warnings( + self.reporter.report_warning( f"{self.table.full_name}-output-variable", "output-variable not found in table expression", ) return data_platform_tables - full_table_name: str = self.get_full_table_name(output_variable) + full_table_name: Optional[str] = self.get_full_table_name(output_variable) if full_table_name is None: - LOGGER.debug("Fail to form full_table_name for PowerBI DataSet table %s", self.table.full_name) + LOGGER.debug( + "Fail to form full_table_name for PowerBI DataSet table %s", + self.table.full_name, + ) return data_platform_tables return [ DataPlatformTable( name=full_table_name.split(".")[-1], full_name=full_table_name, - platform_type=self.get_platform() + platform_type=self.get_platform(), ), ] @@ -200,20 +248,29 @@ def get_platform(self) -> str: pass @abstractmethod - def get_full_table_name(self, output_variable: str) -> str: + def get_full_table_name(self, output_variable: str) -> Optional[str]: pass class PostgresMQueryResolver(BaseMQueryResolver): def get_full_table_name(self, output_variable: str) -> Optional[str]: - variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + variable_statement: Optional[Tree] = _get_variable_statement( + self.parse_tree, output_variable + ) if variable_statement is None: self.reporter.report_warning( f"{self.table.full_name}-variable-statement", f"output variable ({output_variable}) statement not found in table expression", ) return None - source, tokens = self.get_item_selector_tokens(variable_statement) + source, tokens = self.get_item_selector_tokens(cast(Tree, variable_statement)) + if source is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Schema detail not found in table expression", + ) + return None + schema_name: str = tokens["Schema"] table_name: str = tokens["Item"] # Look for database-name @@ -224,16 +281,16 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: f"source variable {source} statement not found in table expression", ) return None - tokens = self.get_argument_list(variable_statement) - if len(tokens) < 1: + arg_list = self.get_argument_list(cast(Tree, variable_statement)) + if arg_list is None or len(arg_list) < 1: self.reporter.report_warning( 
f"{self.table.full_name}-database-arg-list", "Expected number of argument not found in data-access function of table expression", ) return None - database_name: str = tokens[1] # 1st token is database name - return f"{database_name}.{schema_name}.{table_name}" + database_name: str = cast(List[str], arg_list)[1] # 1st token is database name + return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") def get_platform(self) -> str: return SupportedDataPlatform.POSTGRES_SQL.value @@ -248,8 +305,7 @@ def _get_db_name(self, value: str) -> Optional[str]: splitter_result: List[str] = value.split("/") if len(splitter_result) != 2: self.reporter.report_warning( - f"{self.table.full_name}-oracle-target", - error_message + f"{self.table.full_name}-oracle-target", error_message ) return None @@ -257,9 +313,11 @@ def _get_db_name(self, value: str) -> Optional[str]: return db_name - def get_full_table_name(self, output_variable: str) -> str: + def get_full_table_name(self, output_variable: str) -> Optional[str]: # Find step for the output variable - variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + variable_statement: Optional[Tree] = _get_variable_statement( + self.parse_tree, output_variable + ) if variable_statement is None: self.reporter.report_warning( @@ -268,11 +326,22 @@ def get_full_table_name(self, output_variable: str) -> str: ) return None - schema_variable, tokens = self.get_item_selector_tokens(variable_statement) + schema_variable, tokens = self.get_item_selector_tokens( + cast(Tree, variable_statement) + ) + if schema_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "table name not found in table expression", + ) + return None + table_name: str = tokens["Name"] # Find step for the schema variable - variable_statement = _get_variable_statement(self.parse_tree, schema_variable) + variable_statement = _get_variable_statement( + self.parse_tree, cast(str, schema_variable) + ) if variable_statement is None: self.reporter.report_warning( f"{self.table.full_name}-schema-variable-statement", @@ -281,6 +350,13 @@ def get_full_table_name(self, output_variable: str) -> str: return None source_variable, tokens = self.get_item_selector_tokens(variable_statement) + if source_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Schema not found in table expression", + ) + return None + schema_name: str = tokens["Schema"] # Find step for the database access variable @@ -291,17 +367,17 @@ def get_full_table_name(self, output_variable: str) -> str: f"schema variable ({source_variable}) statement not found in table expression", ) return None - tokens = self.get_argument_list(variable_statement) - if len(tokens) < 1: + arg_list = self.get_argument_list(variable_statement) + if arg_list is None or len(arg_list) < 1: self.reporter.report_warning( f"{self.table.full_name}-database-arg-list", "Expected number of argument not found in data-access function of table expression", ) return None # The first argument has database name. 
format localhost:1521/salesdb.GSLAB.COM - db_name: Optional[str] = self._get_db_name(tokens[0]) + db_name: Optional[str] = self._get_db_name(arg_list[0]) if db_name is None: - LOGGER.debug(f"Fail to extract db name from the target {tokens[0]}") + LOGGER.debug(f"Fail to extract db name from the target {arg_list}") return f"{db_name}.{schema_name}.{table_name}" @@ -310,9 +386,11 @@ class SnowflakeMQueryResolver(BaseMQueryResolver): def get_platform(self) -> str: return SupportedDataPlatform.SNOWFLAKE.value - def get_full_table_name(self, output_variable: str) -> str: + def get_full_table_name(self, output_variable: str) -> Optional[str]: # Find step for the output variable - variable_statement: Tree = _get_variable_statement(self.parse_tree, output_variable) + variable_statement: Optional[Tree] = _get_variable_statement( + self.parse_tree, output_variable + ) if variable_statement is None: self.reporter.report_warning( @@ -322,6 +400,13 @@ def get_full_table_name(self, output_variable: str) -> str: return None schema_variable, tokens = self.get_item_selector_tokens(variable_statement) + if schema_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "table name not found in table expression", + ) + return None + table_name: str = tokens["Name"] # Find step for the schema variable @@ -334,6 +419,13 @@ def get_full_table_name(self, output_variable: str) -> str: return None source_variable, tokens = self.get_item_selector_tokens(variable_statement) + if source_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "schema name not found in table expression", + ) + return None + schema_name: str = tokens["Name"] # Find step for the database access variable @@ -345,12 +437,19 @@ def get_full_table_name(self, output_variable: str) -> str: ) return None _, tokens = self.get_item_selector_tokens(variable_statement) + if tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "database name not found in table expression", + ) + return None + db_name: str = tokens["Name"] return f"{db_name}.{schema_name}.{table_name}" -def _get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: +def _get_resolver(parse_tree: Tree) -> Optional[Type["BaseMQueryResolver"]]: _filter: Any = parse_tree.find_data("invoke_expression") @@ -371,16 +470,11 @@ def _get_resolver(parse_tree: Tree) -> Optional[AbstractMQueryResolver]: # Register M-Query resolver for specific database platform -DATA_ACCESS_RESOLVER: Dict[str, AbstractDataAccessMQueryResolver.__class__] = { +DATA_ACCESS_RESOLVER = { f"{SupportedDataPlatform.POSTGRES_SQL.value}.Database": PostgresMQueryResolver, f"{SupportedDataPlatform.ORACLE.value}.Database": OracleMQueryResolver, f"{SupportedDataPlatform.SNOWFLAKE.value}.Databases": SnowflakeMQueryResolver, -} - -# Register M-Query resolver for function call to resolve function arguments -TABLE_ACCESS_RESOLVER: Dict[str, AbstractMQueryResolver.__class__] = { - "Table.Combine": None, -} +} # type :ignore def _parse_expression(expression: str) -> Tree: @@ -395,28 +489,46 @@ def _parse_expression(expression: str) -> Tree: parse_tree: Tree = lark_parser.parse(expression) LOGGER.debug("Parse Tree") - if LOGGER.level == logging.DEBUG: # Guard condition to avoid heavy pretty() function call + if ( + LOGGER.level == logging.DEBUG + ): # Guard condition to avoid heavy pretty() function call LOGGER.debug(parse_tree.pretty()) return parse_tree -def 
get_upstream_tables(table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport) -> List[DataPlatformTable]: - parse_tree = _parse_expression(table.expression) +def get_upstream_tables( + table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport +) -> List[DataPlatformTable]: + if table.expression is None: + reporter.report_warning(table.full_name, "Expression is none") + return [] + + try: + parse_tree: Tree = _parse_expression(table.expression) + except lark.exceptions.UnexpectedCharacters: + reporter.report_warning( + table.full_name, f"UnSupported expression = {table.expression}" + ) + return [] trees: List[Tree] = list(parse_tree.find_data("invoke_expression")) if len(trees) > 1: - reporter.report_warning(table.full_name, f"{table.full_name} has more than one invoke expression") + reporter.report_warning( + table.full_name, f"{table.full_name} has more than one invoke expression" + ) return [] - resolver: AbstractDataAccessMQueryResolver = _get_resolver(parse_tree) + resolver: Optional[Type[BaseMQueryResolver]] = _get_resolver(parse_tree) if resolver is None: LOGGER.debug("Table full-name = %s", table.full_name) LOGGER.debug("Expression = %s", table.expression) reporter.report_warning( table.full_name, - f"{table.full_name} M-Query resolver not found for the table expression" + f"{table.full_name} M-Query resolver not found for the table expression", ) return [] - return resolver(table, parse_tree, reporter).resolve_to_data_platform_table_list() + return resolver( + table, parse_tree, reporter + ).resolve_to_data_platform_table_list() # type: ignore diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 2878ffdc5680a..f172cc37cc721 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -5,7 +5,7 @@ ######################################################### import logging -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Iterable, List, Optional, Tuple, Union, cast import datahub.emitter.mce_builder as builder from datahub.configuration.source_common import DEFAULT_ENV @@ -21,7 +21,15 @@ ) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.powerbi import m_parser +from datahub.ingestion.source.powerbi.config import ( + Constant, + PlatformDetail, + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) from datahub.ingestion.source.powerbi.m_parser import DataPlatformTable +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps from datahub.metadata.schema_classes import ( BrowsePathsClass, @@ -32,22 +40,18 @@ CorpUserKeyClass, DashboardInfoClass, DashboardKeyClass, + DatasetLineageTypeClass, DatasetPropertiesClass, OwnerClass, OwnershipClass, OwnershipTypeClass, StatusClass, - SubTypesClass, UpstreamClass, DatasetLineageTypeClass, UpstreamLineageClass, + SubTypesClass, + UpstreamClass, + UpstreamLineageClass, ) from datahub.utilities.dedup_list import deduplicate_list -from datahub.ingestion.source.powerbi import m_parser -from datahub.ingestion.source.powerbi.config import ( - PowerBiDashboardSourceReport, - PowerBiDashboardSourceConfig, - PlatformDetail, - Constant -) -from datahub.ingestion.source.powerbi.proxy import PowerBiAPI + # 
Logger instance LOGGER = logging.getLogger(__name__) @@ -69,7 +73,11 @@ def __eq__(self, instance): def __hash__(self): return id(self.id) - def __init__(self, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport): + def __init__( + self, + config: PowerBiDashboardSourceConfig, + reporter: PowerBiDashboardSourceReport, + ): self.__config = config self.__reporter = reporter @@ -150,18 +158,28 @@ def __to_datahub_dataset( ) dataset_mcps.extend([info_mcp, status_mcp]) - # Check if upstreams table is available, parse them and create dataset URN for each upstream table if self.__config.extract_lineage is True: + # Check if upstreams table is available, parse them and create dataset URN for each upstream table upstreams: List[UpstreamClass] = [] - upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table.expression, self.__reporter) + upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + table, self.__reporter + ) for upstream_table in upstream_tables: - platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[upstream_table.platform_type] - platform_name: str = m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING[upstream_table.platform_type] - platform_instance_name: str = None + platform: Union[ + str, PlatformDetail + ] = self.__config.dataset_type_mapping[upstream_table.platform_type] + platform_name: str = ( + m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING[ + upstream_table.platform_type + ] + ) + platform_instance_name: Optional[str] = None platform_env: str = DEFAULT_ENV # Determine if PlatformDetail is provided if isinstance(platform, PlatformDetail): - platform_instance_name = cast(PlatformDetail, platform).platform_instance + platform_instance_name = cast( + PlatformDetail, platform + ).platform_instance platform_env = cast(PlatformDetail, platform).env upstream_urn = builder.make_dataset_urn_with_platform_instance( @@ -170,11 +188,11 @@ def __to_datahub_dataset( env=platform_env, name=upstream_table.full_name, ) - upstream_table = UpstreamClass( + upstream_table_class = UpstreamClass( upstream_urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table) + upstreams.append(upstream_table_class) if len(upstreams) > 0: upstream_lineage = UpstreamLineageClass(upstreams=upstreams) @@ -538,19 +556,6 @@ def to_chart_mcps( aspect=StatusClass(removed=False), ) - # ChartKey status - chart_key_instance = ChartKeyClass( - dashboardTool=self.__config.platform_name, - chartId=Constant.CHART_ID.format(page.id), - ) - - chartkey_mcp = self.new_mcp( - entity_type=Constant.CHART, - entity_urn=chart_urn, - aspect_name=Constant.CHART_KEY, - aspect=chart_key_instance, - ) - return [info_mcp, status_mcp] for page in pages: diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 9128b5912b86b..3b5f9dbd515c3 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -3,9 +3,12 @@ from lark import Tree from datahub.ingestion.source.powerbi import m_parser -from datahub.ingestion.source.powerbi.m_parser import DataPlatformTable, SupportedDataPlatform -from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.m_parser import ( + DataPlatformTable, + SupportedDataPlatform, +) +from 
datahub.ingestion.source.powerbi.proxy import PowerBiAPI M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -20,9 +23,9 @@ 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","OPERATIONS_ANALYTICS_WAREHOUSE_PROD",[Role="OPERATIONS_ANALYTICS_MEMBER_AD"]),\n OPERATIONS_ANALYTICS_Database = Source{[Name="OPERATIONS_ANALYTICS",Kind="Database"]}[Data],\n TEST_Schema = OPERATIONS_ANALYTICS_Database{[Name="TEST",Kind="Schema"]}[Data],\n LZ_MIGRATION_DOWNLOAD_View = TEST_Schema{[Name="LZ_MIGRATION_DOWNLOAD",Kind="View"]}[Data],\n #"Changed Type" = Table.TransformColumnTypes(LZ_MIGRATION_DOWNLOAD_View,{{"MIGRATION_MONTH_ID", type text}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Migration Month", each Date.FromText(\nText.Range([MIGRATION_MONTH_ID], 0,4) & "-" & \nText.Range([MIGRATION_MONTH_ID], 4,2) \n)),\n #"Changed Type1" = Table.TransformColumnTypes(#"Added Custom",{{"Migration Month", type date}})\nin\n #"Changed Type1"', "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source", 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"', - 'let\n Source = Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]),\n Source2 = PostgreSQL.Database(\"localhost\", \"mics\"),\n public_order_date = Source2{[Schema=\"public\",Item=\"order_date\"]}[Data],\n GSL_TEST_DB_Database = Source{[Name=\"GSL_TEST_DB\",Kind=\"Database\"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name=\"PUBLIC\",Kind=\"Schema\"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name=\"SALES_ANALYST_VIEW\",Kind=\"View\"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', + 'let\n Source = 
Snowflake.Databases("xaa48144.snowflakecomputing.com","GSL_TEST_WH",[Role="ACCOUNTADMIN"]),\n Source2 = PostgreSQL.Database("localhost", "mics"),\n public_order_date = Source2{[Schema="public",Item="order_date"]}[Data],\n GSL_TEST_DB_Database = Source{[Name="GSL_TEST_DB",Kind="Database"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name="PUBLIC",Kind="Schema"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name="SALES_ANALYST_VIEW",Kind="View"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', - 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1' + 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1', ] @@ -101,7 +104,7 @@ def test_parse_m_query12(): def test_parse_m_query13(): expression: str = M_QUERIES[12] parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == 'two_source_table' + assert m_parser._get_output_variable(parse_tree) == "two_source_table" def test_postgres_regular_case(): @@ -113,12 +116,17 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + table, reporter + ) assert len(data_platform_tables) == 1 assert data_platform_tables[0].name == "order_date" assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].platform_type == SupportedDataPlatform.POSTGRES_SQL.value + assert ( + data_platform_tables[0].platform_type + == SupportedDataPlatform.POSTGRES_SQL.value + ) def test_oracle_regular_case(): @@ -130,7 +138,9 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + table, reporter + ) assert len(data_platform_tables) == 1 assert data_platform_tables[0].name == "EMPLOYEES" @@ -147,9 +157,13 @@ def test_snowflake_regular_case(): ) reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables(table, reporter) + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + table, reporter + ) assert len(data_platform_tables) == 1 assert data_platform_tables[0].name == "TESTTABLE" assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" - assert data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value + assert ( + data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value + ) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 40d441b9cbc91..acaa2fb77307c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -220,6 +220,7 @@ def default_source_config(): 
"client_secret": "bar", "tenant_id": "0B0C960B-FCDF-4D0F-8C45-2E03BB59DDEB", "workspace_id": "64ED5CAD-7C10-4684-8180-826122881108", + "extract_lineage": False, "dataset_type_mapping": { "PostgreSql": "postgres", "Oracle": "oracle", From 75d5b6bf477b2cc5d371add21d355281b9be2623 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 12 Dec 2022 14:02:40 +0530 Subject: [PATCH 19/53] PowerBI API --- .../datahub/ingestion/source/powerbi/proxy.py | 885 ++++++++++++++++++ 1 file changed, 885 insertions(+) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py new file mode 100644 index 0000000000000..1b644a4fb4265 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -0,0 +1,885 @@ +import json +import logging +from dataclasses import dataclass +from enum import Enum +from time import sleep +from typing import Any, Dict, List, Optional + +import msal +import requests as requests + +from datahub.configuration.common import ConfigurationError +from datahub.ingestion.source.powerbi.config import ( + Constant, + PowerBiAPIConfig, + PowerBiDashboardSourceReport, +) + +# Logger instance +LOGGER = logging.getLogger(__name__) + + +class PowerBiAPI: + # API endpoints of PowerBi to fetch dashboards, tiles, datasets + API_ENDPOINTS = { + Constant.DASHBOARD_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/dashboards", + Constant.ENTITY_USER_LIST: "{POWERBI_ADMIN_BASE_URL}/{ENTITY}/{ENTITY_ID}/users", + Constant.TILE_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/dashboards/{DASHBOARD_ID}/tiles", + Constant.DATASET_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/datasets/{DATASET_ID}", + Constant.DATASOURCE_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/datasets/{DATASET_ID}/datasources", + Constant.REPORT_GET: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}", + Constant.REPORT_LIST: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports", + Constant.SCAN_GET: "{POWERBI_ADMIN_BASE_URL}/workspaces/scanStatus/{SCAN_ID}", + Constant.SCAN_RESULT_GET: "{POWERBI_ADMIN_BASE_URL}/workspaces/scanResult/{SCAN_ID}", + Constant.SCAN_CREATE: "{POWERBI_ADMIN_BASE_URL}/workspaces/getInfo", + Constant.PAGE_BY_REPORT: "{POWERBI_BASE_URL}/{WORKSPACE_ID}/reports/{REPORT_ID}/pages", + } + + SCOPE: str = "https://analysis.windows.net/powerbi/api/.default" + BASE_URL: str = "https://api.powerbi.com/v1.0/myorg/groups" + ADMIN_BASE_URL: str = "https://api.powerbi.com/v1.0/myorg/admin" + AUTHORITY: str = "https://login.microsoftonline.com/" + + @dataclass + class Workspace: + """ + PowerBi Workspace + """ + + id: str + name: str + state: str + dashboards: List[Any] + datasets: Dict[str, "PowerBiAPI.PowerBIDataset"] + + @dataclass + class DataSource: + """ + PowerBi + """ + + id: str + type: str + raw_connection_detail: Dict + + def __members(self): + return (self.id,) + + def __eq__(self, instance): + return ( + isinstance(instance, PowerBiAPI.DataSource) + and self.__members() == instance.__members() + ) + + def __hash__(self): + return hash(self.__members()) + + @dataclass + class Table: + name: str + full_name: str + expression: Optional[str] + + # dataclasses for PowerBi Dashboard + @dataclass + class PowerBIDataset: + id: str + name: str + webUrl: Optional[str] + workspace_id: str + # Table in datasets + tables: List["PowerBiAPI.Table"] + + def get_urn_part(self): + return f"datasets.{self.id}" + + def __members(self): + return (self.id,) + + def 
__eq__(self, instance): + return ( + isinstance(instance, PowerBiAPI.PowerBIDataset) + and self.__members() == instance.__members() + ) + + def __hash__(self): + return hash(self.__members()) + + @dataclass + class Page: + id: str + displayName: str + name: str + order: int + + def get_urn_part(self): + return f"pages.{self.id}" + + @dataclass + class User: + id: str + displayName: str + emailAddress: str + graphId: str + principalType: str + + def get_urn_part(self): + return f"users.{self.id}" + + def __members(self): + return (self.id,) + + def __eq__(self, instance): + return ( + isinstance(instance, PowerBiAPI.User) + and self.__members() == instance.__members() + ) + + def __hash__(self): + return hash(self.__members()) + + @dataclass + class Report: + id: str + name: str + webUrl: str + embedUrl: str + description: str + dataset: Optional["PowerBiAPI.PowerBIDataset"] + pages: List["PowerBiAPI.Page"] + users: List["PowerBiAPI.User"] + + def get_urn_part(self): + return f"reports.{self.id}" + + @dataclass + class Tile: + class CreatedFrom(Enum): + REPORT = "Report" + DATASET = "Dataset" + VISUALIZATION = "Visualization" + UNKNOWN = "UNKNOWN" + + id: str + title: str + embedUrl: str + dataset: Optional["PowerBiAPI.PowerBIDataset"] + report: Optional[Any] + createdFrom: CreatedFrom + + def get_urn_part(self): + return f"charts.{self.id}" + + @dataclass + class Dashboard: + id: str + displayName: str + embedUrl: str + webUrl: str + isReadOnly: Any + workspace_id: str + workspace_name: str + tiles: List["PowerBiAPI.Tile"] + users: List["PowerBiAPI.User"] + + def get_urn_part(self): + return f"dashboards.{self.id}" + + def __members(self): + return (self.id,) + + def __eq__(self, instance): + return ( + isinstance(instance, PowerBiAPI.Dashboard) + and self.__members() == instance.__members() + ) + + def __hash__(self): + return hash(self.__members()) + + def __init__(self, config: PowerBiAPIConfig) -> None: + self.__config: PowerBiAPIConfig = config + self.__access_token: str = "" + # Power-Bi Auth (Service Principal Auth) + self.__msal_client = msal.ConfidentialClientApplication( + self.__config.client_id, + client_credential=self.__config.client_secret, + authority=PowerBiAPI.AUTHORITY + self.__config.tenant_id, + ) + + # Test connection by generating a access token + LOGGER.info("Trying to connect to {}".format(self.__get_authority_url())) + self.get_access_token() + LOGGER.info("Able to connect to {}".format(self.__get_authority_url())) + + def __get_authority_url(self): + return "{}{}".format(PowerBiAPI.AUTHORITY, self.__config.tenant_id) + + def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: + """ + Get user for the given PowerBi entity + """ + users: List[PowerBiAPI.User] = [] + if self.__config.extract_ownership is False: + LOGGER.info( + "ExtractOwnership capabilities is disabled from configuration and hence returning empty users list" + ) + return users + + user_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.ENTITY_USER_LIST] + # Replace place holders + user_list_endpoint = user_list_endpoint.format( + POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, + ENTITY=entity, + ENTITY_ID=_id, + ) + # Hit PowerBi + LOGGER.info(f"Request to URL={user_list_endpoint}") + response = requests.get( + user_list_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + LOGGER.warning( + "Failed to fetch user list from power-bi. http_status=%s. 
message=%s", + response.status_code, + response.text, + ) + + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.ENTITY}={entity}") + LOGGER.info(f"{Constant.ID}={_id}") + raise ConnectionError("Failed to fetch the user list from the power-bi") + + users_dict: List[Any] = response.json()[Constant.VALUE] + + # Iterate through response and create a list of PowerBiAPI.Dashboard + users = [ + PowerBiAPI.User( + id=instance.get("identifier"), + displayName=instance.get("displayName"), + emailAddress=instance.get("emailAddress"), + graphId=instance.get("graphId"), + principalType=instance.get("principalType"), + ) + for instance in users_dict + ] + + return users + + def __get_report( + self, workspace_id: str, report_id: str + ) -> Optional["PowerBiAPI.Report"]: + """ + Fetch the report from PowerBi for the given report identifier + """ + if workspace_id is None or report_id is None: + LOGGER.info("Input values are None") + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.ReportId}={report_id}") + return None + + report_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_GET] + # Replace place holders + report_get_endpoint = report_get_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=workspace_id, + REPORT_ID=report_id, + ) + # Hit PowerBi + LOGGER.info(f"Request to report URL={report_get_endpoint}") + response = requests.get( + report_get_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + message: str = "Failed to fetch report from power-bi for" + LOGGER.warning(message) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.warning(f"{Constant.ReportId}={report_id}") + raise ConnectionError(message) + + response_dict = response.json() + + return PowerBiAPI.Report( + id=response_dict.get("id"), + name=response_dict.get("name"), + webUrl=response_dict.get("webUrl"), + embedUrl=response_dict.get("embedUrl"), + description=response_dict.get("description"), + users=[], + pages=[], + dataset=self.get_dataset( + workspace_id=workspace_id, dataset_id=response_dict.get("datasetId") + ), + ) + + def get_access_token(self): + if self.__access_token != "": + LOGGER.info("Returning the cached access token") + return self.__access_token + + LOGGER.info("Generating PowerBi access token") + + auth_response = self.__msal_client.acquire_token_for_client( + scopes=[PowerBiAPI.SCOPE] + ) + + if not auth_response.get("access_token"): + LOGGER.warning( + "Failed to generate the PowerBi access token. Please check input configuration" + ) + raise ConfigurationError( + "Powerbi authorization failed . Please check your input configuration." + ) + + LOGGER.info("Generated PowerBi access token") + + self.__access_token = "Bearer {}".format(auth_response.get("access_token")) + + LOGGER.debug(f"{Constant.PBIAccessToken}={self.__access_token}") + + return self.__access_token + + def get_dashboard_users(self, dashboard: Dashboard) -> List[User]: + """ + Return list of dashboard users + """ + return self.__get_users( + workspace_id=dashboard.workspace_id, entity="dashboards", _id=dashboard.id + ) + + def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: + """ + Get the list of dashboard from PowerBi for the given workspace identifier + + TODO: Pagination. 
As per REST API doc (https://docs.microsoft.com/en-us/rest/api/power-bi/dashboards/get + -dashboards), there is no information available on pagination + """ + dashboard_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DASHBOARD_LIST] + # Replace place holders + dashboard_list_endpoint = dashboard_list_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, WORKSPACE_ID=workspace.id + ) + # Hit PowerBi + LOGGER.info(f"Request to URL={dashboard_list_endpoint}") + response = requests.get( + dashboard_list_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + LOGGER.warning("Failed to fetch dashboard list from power-bi for") + LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") + raise ConnectionError( + "Failed to fetch the dashboard list from the power-bi" + ) + + dashboards_dict: List[Any] = response.json()[Constant.VALUE] + + # Iterate through response and create a list of PowerBiAPI.Dashboard + dashboards: List[PowerBiAPI.Dashboard] = [ + PowerBiAPI.Dashboard( + id=instance.get("id"), + isReadOnly=instance.get("isReadOnly"), + displayName=instance.get("displayName"), + embedUrl=instance.get("embedUrl"), + webUrl=instance.get("webUrl"), + workspace_id=workspace.id, + workspace_name=workspace.name, + tiles=[], + users=[], + ) + for instance in dashboards_dict + if instance is not None + ] + + return dashboards + + def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: + """ + Fetch the dataset from PowerBi for the given dataset identifier + """ + if workspace_id is None or dataset_id is None: + LOGGER.info("Input values are None") + LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.info(f"{Constant.DatasetId}={dataset_id}") + return None + + dataset_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASET_GET] + # Replace place holders + dataset_get_endpoint = dataset_get_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=workspace_id, + DATASET_ID=dataset_id, + ) + # Hit PowerBi + LOGGER.info(f"Request to dataset URL={dataset_get_endpoint}") + response = requests.get( + dataset_get_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + message: str = "Failed to fetch dataset from power-bi for" + LOGGER.warning(message) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + LOGGER.warning(f"{Constant.DatasetId}={dataset_id}") + raise ConnectionError(message) + + response_dict = response.json() + LOGGER.debug("datasets = {}".format(response_dict)) + # PowerBi Always return the webURL, in-case if it is None then setting complete webURL to None instead of + # None/details + return PowerBiAPI.PowerBIDataset( + id=response_dict.get("id"), + name=response_dict.get("name"), + webUrl="{}/details".format(response_dict.get("webUrl")) + if response_dict.get("webUrl") is not None + else None, + workspace_id=workspace_id, + tables=[], + ) + + def get_data_sources( + self, dataset: PowerBIDataset + ) -> Optional[Dict[str, "PowerBiAPI.DataSource"]]: + """ + Fetch the data source from PowerBi for the given dataset + """ + + datasource_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASOURCE_GET] + # Replace place holders + datasource_get_endpoint = datasource_get_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=dataset.workspace_id, + DATASET_ID=dataset.id, + ) + # Hit PowerBi + LOGGER.info(f"Request to 
datasource URL={datasource_get_endpoint}") + response = requests.get( + datasource_get_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + message: str = "Failed to fetch datasource from power-bi for" + LOGGER.warning(message) + LOGGER.warning("{}={}".format(Constant.WorkspaceId, dataset.workspace_id)) + LOGGER.warning("{}={}".format(Constant.DatasetId, dataset.id)) + LOGGER.warning("{}={}".format(Constant.HTTP_RESPONSE_TEXT, response.text)) + LOGGER.warning( + "{}={}".format(Constant.HTTP_RESPONSE_STATUS_CODE, response.status_code) + ) + + raise ConnectionError(message) + + res = response.json() + value = res["value"] + if len(value) == 0: + LOGGER.info( + f"datasource is not found for dataset {dataset.name}({dataset.id})" + ) + + return None + + data_sources: Dict[str, "PowerBiAPI.DataSource"] = {} + LOGGER.debug("data-sources = {}".format(value)) + for datasource_dict in value: + # Create datasource instance with basic detail available + datasource = PowerBiAPI.DataSource( + id=datasource_dict.get( + "datasourceId" + ), # datasourceId is not available in all cases + type=datasource_dict["datasourceType"], + raw_connection_detail=datasource_dict["connectionDetails"], + ) + + data_sources[datasource.id] = datasource + + return data_sources + + def get_tiles(self, workspace: Workspace, dashboard: Dashboard) -> List[Tile]: + + """ + Get the list of tiles from PowerBi for the given workspace identifier + + TODO: Pagination. As per REST API doc (https://docs.microsoft.com/en-us/rest/api/power-bi/dashboards/get + -tiles), there is no information available on pagination + + """ + + def new_dataset_or_report(tile_instance: Any) -> dict: + """ + Find out which is the data source for tile. 
It is either REPORT or DATASET + """ + report_fields = { + "dataset": ( + workspace.datasets[tile_instance.get("datasetId")] + if tile_instance.get("datasetId") is not None + else None + ), + "report": ( + self.__get_report( + workspace_id=workspace.id, + report_id=tile_instance.get("reportId"), + ) + if tile_instance.get("reportId") is not None + else None + ), + "createdFrom": PowerBiAPI.Tile.CreatedFrom.UNKNOWN, + } + + # Tile is either created from report or dataset or from custom visualization + if report_fields["report"] is not None: + report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.REPORT + elif report_fields["dataset"] is not None: + report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.DATASET + else: + report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.VISUALIZATION + + LOGGER.info( + "Tile %s(%s) is created from %s", + tile_instance.get("title"), + tile_instance.get("id"), + report_fields["createdFrom"], + ) + + return report_fields + + tile_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.TILE_LIST] + # Replace place holders + tile_list_endpoint = tile_list_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=dashboard.workspace_id, + DASHBOARD_ID=dashboard.id, + ) + # Hit PowerBi + LOGGER.info("Request to URL={}".format(tile_list_endpoint)) + response = requests.get( + tile_list_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + LOGGER.warning("Failed to fetch tiles list from power-bi for") + LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace.id)) + LOGGER.warning("{}={}".format(Constant.DashboardId, dashboard.id)) + raise ConnectionError("Failed to fetch the tile list from the power-bi") + + # Iterate through response and create a list of PowerBiAPI.Dashboard + tile_dict: List[Any] = response.json()[Constant.VALUE] + LOGGER.debug("Tile Dict = {}".format(tile_dict)) + tiles: List[PowerBiAPI.Tile] = [ + PowerBiAPI.Tile( + id=instance.get("id"), + title=instance.get("title"), + embedUrl=instance.get("embedUrl"), + **new_dataset_or_report(instance), + ) + for instance in tile_dict + if instance is not None + ] + + return tiles + + def get_pages_by_report( + self, workspace_id: str, report_id: str + ) -> List["PowerBiAPI.Page"]: + """ + Fetch the report from PowerBi for the given report identifier + """ + if workspace_id is None or report_id is None: + LOGGER.info("workspace_id or report_id is None") + return [] + + pages_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.PAGE_BY_REPORT] + # Replace place holders + pages_endpoint = pages_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=workspace_id, + REPORT_ID=report_id, + ) + # Hit PowerBi + LOGGER.info(f"Request to pages URL={pages_endpoint}") + response = requests.get( + pages_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + message: str = "Failed to fetch reports from power-bi for" + LOGGER.warning(message) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + raise ConnectionError(message) + + response_dict = response.json() + return [ + PowerBiAPI.Page( + id="{}.{}".format(report_id, raw_instance["name"].replace(" ", "_")), + name=raw_instance["name"], + displayName=raw_instance.get("displayName"), + order=raw_instance.get("order"), + ) + for raw_instance in response_dict["value"] + ] + + def get_reports( + self, workspace: 
"PowerBiAPI.Workspace" + ) -> List["PowerBiAPI.Report"]: + """ + Fetch the report from PowerBi for the given report identifier + """ + if workspace is None: + LOGGER.info("workspace is None") + return [] + + report_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_LIST] + # Replace place holders + report_list_endpoint = report_list_endpoint.format( + POWERBI_BASE_URL=PowerBiAPI.BASE_URL, + WORKSPACE_ID=workspace.id, + ) + # Hit PowerBi + LOGGER.info(f"Request to report URL={report_list_endpoint}") + response = requests.get( + report_list_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + + # Check if we got response from PowerBi + if response.status_code != 200: + message: str = "Failed to fetch reports from power-bi for" + LOGGER.warning(message) + LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") + raise ConnectionError(message) + + response_dict = response.json() + reports: List["PowerBiAPI.Report"] = [ + PowerBiAPI.Report( + id=raw_instance["id"], + name=raw_instance.get("name"), + webUrl=raw_instance.get("webUrl"), + embedUrl=raw_instance.get("embedUrl"), + description=raw_instance.get("description"), + pages=self.get_pages_by_report( + workspace_id=workspace.id, report_id=raw_instance["id"] + ), + users=self.__get_users( + workspace_id=workspace.id, entity="reports", _id=raw_instance["id"] + ), + dataset=workspace.datasets.get(raw_instance.get("datasetId")), + ) + for raw_instance in response_dict["value"] + ] + + return reports + + # flake8: noqa: C901 + def get_workspace( + self, workspace_id: str, reporter: PowerBiDashboardSourceReport + ) -> Workspace: + """ + Return Workspace for the given workspace identifier i.e. workspace_id + """ + scan_create_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_CREATE] + scan_create_endpoint = scan_create_endpoint.format( + POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL + ) + + def create_scan_job(): + """ + Create scan job on PowerBi for the workspace + """ + request_body = {"workspaces": [workspace_id]} + + res = requests.post( + scan_create_endpoint, + data=request_body, + params={ + "datasetExpressions": True, + "datasetSchema": True, + "datasourceDetails": True, + "getArtifactUsers": True, + "lineage": True, + }, + headers={Constant.Authorization: self.get_access_token()}, + ) + + if res.status_code not in (200, 202): + message = f"API({scan_create_endpoint}) return error code {res.status_code} for workspace id({workspace_id})" + + LOGGER.warning(message) + + raise ConnectionError(message) + # Return Id of Scan created for the given workspace + id = res.json()["id"] + LOGGER.info("Scan id({})".format(id)) + return id + + def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: + """ + Poll the PowerBi service for workspace scan to complete + """ + minimum_sleep = 3 + if timeout < minimum_sleep: + LOGGER.info( + f"Setting timeout to minimum_sleep time {minimum_sleep} seconds" + ) + timeout = minimum_sleep + + max_trial = timeout // minimum_sleep + LOGGER.info(f"Max trial {max_trial}") + scan_get_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_GET] + scan_get_endpoint = scan_get_endpoint.format( + POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id + ) + + LOGGER.info(f"Hitting URL={scan_get_endpoint}") + + trail = 1 + while True: + LOGGER.info(f"Trial = {trail}") + res = requests.get( + scan_get_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + if res.status_code != 200: + message = f"API({scan_get_endpoint}) return error code 
{res.status_code} for scan id({scan_id})" + + LOGGER.warning(message) + + raise ConnectionError(message) + + if res.json()["status"].upper() == "Succeeded".upper(): + LOGGER.info(f"Scan result is available for scan id({scan_id})") + return True + + if trail == max_trial: + break + LOGGER.info(f"Sleeping for {minimum_sleep} seconds") + sleep(minimum_sleep) + trail += 1 + + # Result is not available + return False + + def get_scan_result(scan_id: str) -> dict: + LOGGER.info("Fetching scan result") + LOGGER.info(f"{Constant.SCAN_ID}={scan_id}") + scan_result_get_endpoint = PowerBiAPI.API_ENDPOINTS[ + Constant.SCAN_RESULT_GET + ] + scan_result_get_endpoint = scan_result_get_endpoint.format( + POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id + ) + + LOGGER.info(f"Hitting URL={scan_result_get_endpoint}") + res = requests.get( + scan_result_get_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + if res.status_code != 200: + message = f"API({scan_result_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" + + LOGGER.warning(message) + + raise ConnectionError(message) + + return res.json()["workspaces"][0] + + def json_to_dataset_map(scan_result: dict) -> dict: + """ + Filter out "dataset" from scan_result and return PowerBiAPI.Dataset instance set + """ + datasets: Optional[Any] = scan_result.get("datasets") + dataset_map: dict = {} + + if datasets is None or len(datasets) == 0: + LOGGER.warning( + f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets' + ) + + LOGGER.info("Returning empty datasets") + return dataset_map + + for dataset_dict in datasets: + dataset_instance: PowerBiAPI.PowerBIDataset = self.get_dataset( + workspace_id=scan_result["id"], + dataset_id=dataset_dict["id"], + ) + dataset_map[dataset_instance.id] = dataset_instance + # set dataset-name + dataset_name: str = ( + dataset_instance.name + if dataset_instance.name is not None + else dataset_instance.id + ) + + for table in dataset_dict["tables"]: + expression: str = ( + table["source"][0]["expression"] + if table.get("source") is not None and len(table["source"]) > 0 + else None + ) + dataset_instance.tables.append( + PowerBiAPI.Table( + name=table["name"], + full_name="{}.{}".format( + dataset_name.replace(" ", "_"), + table["name"].replace(" ", "_"), + ), + expression=expression, + ) + ) + + return dataset_map + + def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: + for dashboard in workspace.dashboards: + dashboard.tiles = self.get_tiles(workspace, dashboard=dashboard) + + return None + + LOGGER.info("Creating scan job for workspace") + LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) + LOGGER.info("Hitting URL={}".format(scan_create_endpoint)) + scan_id = create_scan_job() + LOGGER.info("Waiting for scan to complete") + if ( + wait_for_scan_to_complete( + scan_id=scan_id, timeout=self.__config.scan_timeout + ) + is False + ): + raise ValueError( + "Workspace detail is not available. Please increase scan_timeout to wait." 
+ ) + + # Scan is complete lets take the result + scan_result = get_scan_result(scan_id=scan_id) + + LOGGER.debug(f"scan result = %s", json.dumps(scan_result, indent=1)) + workspace = PowerBiAPI.Workspace( + id=scan_result["id"], + name=scan_result["name"], + state=scan_result["state"], + datasets={}, + dashboards=[], + ) + # Get workspace dashboards + workspace.dashboards = self.get_dashboards(workspace) + + workspace.datasets = json_to_dataset_map(scan_result) + init_dashboard_tiles(workspace) + + return workspace From 61c1d2def810ba4c60ef29b7b9b307d7e49eb4e8 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 12 Dec 2022 21:58:07 +0530 Subject: [PATCH 20/53] mssql server support --- .../ingestion/source/powerbi/m_parser.py | 23 ++++++++++- .../integration/powerbi/test_m_parser.py | 39 ++++++++++++++----- 2 files changed, 51 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index 0b40639f8aadf..b12f804c00b73 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -26,6 +26,7 @@ class SupportedDataPlatform(Enum): POSTGRES_SQL = "PostgreSQL" ORACLE = "Oracle" SNOWFLAKE = "Snowflake" + MS_SQL = "Sql" POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING: Dict[str, str] = { @@ -252,7 +253,16 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: pass -class PostgresMQueryResolver(BaseMQueryResolver): +class DefaultTwoStepDataAccessSources(BaseMQueryResolver, ABC): + """ + These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + """ + def get_full_table_name(self, output_variable: str) -> Optional[str]: variable_statement: Optional[Tree] = _get_variable_statement( self.parse_tree, output_variable @@ -292,10 +302,17 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: database_name: str = cast(List[str], arg_list)[1] # 1st token is database name return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") + +class PostgresMQueryResolver(DefaultTwoStepDataAccessSources): def get_platform(self) -> str: return SupportedDataPlatform.POSTGRES_SQL.value +class MSSqlMQueryResolver(DefaultTwoStepDataAccessSources): + def get_platform(self) -> str: + return SupportedDataPlatform.MS_SQL.value + + class OracleMQueryResolver(BaseMQueryResolver): def get_platform(self) -> str: return SupportedDataPlatform.ORACLE.value @@ -474,6 +491,7 @@ def _get_resolver(parse_tree: Tree) -> Optional[Type["BaseMQueryResolver"]]: f"{SupportedDataPlatform.POSTGRES_SQL.value}.Database": PostgresMQueryResolver, f"{SupportedDataPlatform.ORACLE.value}.Database": OracleMQueryResolver, f"{SupportedDataPlatform.SNOWFLAKE.value}.Databases": SnowflakeMQueryResolver, + f"{SupportedDataPlatform.MS_SQL.value}.Database": MSSqlMQueryResolver, } # type :ignore @@ -506,7 +524,8 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) - except lark.exceptions.UnexpectedCharacters: + except lark.exceptions.UnexpectedCharacters as e: + LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) reporter.report_warning( table.full_name, f"UnSupported expression = {table.expression}" ) diff --git 
a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 3b5f9dbd515c3..4a0c0bcf17f7b 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -24,8 +24,9 @@ "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select *,#(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS Agent,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Industries'#(lf)and TARGET_TEAM = 'Enterprise'\", null, [EnableFolding=true])\nin\n Source", 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select#(lf)*,#(lf)concat((UPPER(REPLACE(SALES_SPECIALIST,\'-\',\'\'))),#(lf)LEFT(CAST(INVOICE_DATE AS DATE),4)+LEFT(RIGHT(CAST(INVOICE_DATE AS DATE),5),2)) AS AGENT_KEY,#(lf)CASE#(lf) WHEN CLASS = \'Software\' and (NOT(PRODUCT in (\'ADV\', \'Adv\') and left(ACCOUNT_ID,2)=\'10\') #(lf) or V_ENTERPRISE_INVOICED_REVENUE.TYPE = \'Manual Adjustment\') THEN INVOICE_AMOUNT#(lf) WHEN V_ENTERPRISE_INVOICED_REVENUE.TYPE IN (\'Recurring\',\'0\') THEN INVOICE_AMOUNT#(lf) ELSE 0#(lf)END as SOFTWARE_INV#(lf)#(lf)from V_ENTERPRISE_INVOICED_REVENUE", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Services", each if [CLASS] = "Services" then [INVOICE_AMOUNT] else 0),\n #"Added Custom" = Table.AddColumn(#"Added Conditional Column", "Advanced New Sites", each if [PRODUCT] = "ADV"\nor [PRODUCT] = "Adv"\nthen [NEW_SITE]\nelse 0)\nin\n #"Added Custom"', 'let\n Source = Snowflake.Databases("xaa48144.snowflakecomputing.com","GSL_TEST_WH",[Role="ACCOUNTADMIN"]),\n Source2 = PostgreSQL.Database("localhost", "mics"),\n public_order_date = Source2{[Schema="public",Item="order_date"]}[Data],\n GSL_TEST_DB_Database = Source{[Name="GSL_TEST_DB",Kind="Database"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name="PUBLIC",Kind="Schema"]}[Data],\n SALES_ANALYST_VIEW_View = PUBLIC_Schema{[Name="SALES_ANALYST_VIEW",Kind="View"]}[Data],\n two_source_table = Table.Combine({public_order_date, SALES_ANALYST_VIEW_View})\n in\n two_source_table', - 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', - 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1', + 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', + 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1', + 'let\n Source = Sql.Database("localhost", "library"),\n dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]\n in dbo_book_issue', ] @@ -107,6 +108,27 @@ def test_parse_m_query13(): assert m_parser._get_output_variable(parse_tree) == "two_source_table" +def test_snowflake_regular_case(): + q: str = M_QUERIES[0] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + 
name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "TESTTABLE" + assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" + assert ( + data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value + ) + + def test_postgres_regular_case(): q: str = M_QUERIES[13] table: PowerBiAPI.Table = PowerBiAPI.Table( @@ -148,8 +170,8 @@ def test_oracle_regular_case(): assert data_platform_tables[0].platform_type == SupportedDataPlatform.ORACLE.value -def test_snowflake_regular_case(): - q: str = M_QUERIES[0] +def test_mssql_regular_case(): + q: str = M_QUERIES[15] table: PowerBiAPI.Table = PowerBiAPI.Table( expression=q, name="virtual_order_table", @@ -157,13 +179,12 @@ def test_snowflake_regular_case(): ) reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( table, reporter ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" - assert ( - data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value - ) + assert data_platform_tables[0].name == "book_issue" + assert data_platform_tables[0].full_name == "library.dbo.book_issue" + assert data_platform_tables[0].platform_type == SupportedDataPlatform.MS_SQL.value From aad6f2946adaaa1623065914aab849b08626ee30 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 14 Dec 2022 15:58:31 +0530 Subject: [PATCH 21/53] WIP --- .../powerbi/{m_parser.py => m_parser2.py} | 27 ++ .../integration/powerbi/test_m_parser.py | 316 +++++++++--------- 2 files changed, 190 insertions(+), 153 deletions(-) rename metadata-ingestion/src/datahub/ingestion/source/powerbi/{m_parser.py => m_parser2.py} (94%) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py similarity index 94% rename from metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py rename to metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py index b12f804c00b73..2979e181ca248 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py @@ -515,6 +515,31 @@ def _parse_expression(expression: str) -> Tree: return parse_tree +def _validate_parse_tree(supported_funcs: List[str], tree: Tree) -> Tuple[bool, str]: + """ + :param tree: tree to validate as per functions supported by m_parser module + :return: first argument is False if validation is failed and second argument would contain the error message. + in-case of valid tree the first argument is True and second argument would be None. 
+ """ + _filter: List[Tree] = tree.find_data("invoke_expression") + + valid: bool = False + message: Optional[str] = None + + for node in _filter: + primary_expression_node: Optional[Tree] = _get_first_rule(node, "primary_expression") + if primary_expression_node is None: + continue + identifier_node: Optional[Tree] = _get_first_rule(primary_expression_node, "identifier") + if identifier_node is None: + continue + + function_name: str = _make_function_name(identifier_node) + # This function should be in our supported function list + if function_name not in supported_funcs: + return False, f"function {function_name} is not supported" + + def get_upstream_tables( table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport ) -> List[DataPlatformTable]: @@ -524,6 +549,8 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) + _validate_parse_tree([], parse_tree) + exit() except lark.exceptions.UnexpectedCharacters as e: LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) reporter.report_warning( diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 4a0c0bcf17f7b..917532b783976 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -13,7 +13,7 @@ M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"ADDed Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"', - 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS 
Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"', + 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20 658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = \'Accounting\'#(lf)and TARGET_TEAM = \'Enterprise\'", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "Has PS Software Quota?", each if [TIER] = "Expansion (Medium)" then "Yes" else if [TIER] = "Acquisition" then "Yes" else "No")\nin\n #"Added Conditional Column"', 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *#(lf),concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), MONTHID) as AGENT_KEY#(lf),concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY#(lf)#(lf)from V_OIP_ENT_2022"]),\n #"Added Custom" = Table.AddColumn(Source, "OIP in $(*$350)", each [SALES_INVOICE_AMOUNT] * 350),\n #"Changed Type" = Table.TransformColumnTypes(#"Added Custom",{{"OIP in $(*$350)", type number}})\nin\n #"Changed Type"', 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="Select *,#(lf)#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_QUOTED,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), #(lf)LEFT(CAST(DTE AS DATE),4)+LEFT(RIGHT(CAST(DTE AS DATE),5),2)) AS CD_AGENT_KEY#(lf)#(lf)from V_INVOICE_BOOKING_2022"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"CLIENT_ID", Int64.Type}}),\n #"Added Conditional Column" = Table.AddColumn(#"Changed Type", "PS Software (One-Off)", each if Text.Contains([REVENUE_TYPE], "Software") then [Inv_Amt] else if Text.Contains([REVENUE_TYPE], "Tax Seminar") then [Inv_Amt] else 0),\n #"Filtered Rows" = Table.SelectRows(#"Added Conditional Column", each true),\n #"Duplicated Column" = Table.DuplicateColumn(#"Filtered Rows", "CLIENT_ID", "CLIENT_ID - Copy"),\n #"Changed Type1" = Table.TransformColumnTypes(#"Duplicated Column",{{"CLIENT_ID - Copy", type text}}),\n #"Renamed Columns" = Table.RenameColumns(#"Changed Type1",{{"CLIENT_ID - Copy", "CLIENT_ID for Filter"}})\nin\n #"Renamed Columns"', 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="SELECT 
*,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), #(lf)LEFT(CAST(MONTH_DATE AS DATE),4)+LEFT(RIGHT(CAST(MONTH_DATE AS DATE),5),2)) AS AGENT_KEY#(lf)#(lf)FROM dbo.V_ARR_ADDS"]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"MONTH_DATE", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([MONTH_DATE]))\nin\n #"Added Custom"', @@ -30,161 +30,171 @@ ] -def test_parse_m_query1(): - expression: str = M_QUERIES[0] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" +# def test_parse_m_query1(): +# expression: str = M_QUERIES[0] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" +# +# +# def test_parse_m_query2(): +# expression: str = M_QUERIES[1] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' +# +# +# def test_parse_m_query3(): +# expression: str = M_QUERIES[2] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' +# +# +# def test_parse_m_query4(): +# expression: str = M_QUERIES[3] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' +# +# +# def test_parse_m_query5(): +# expression: str = M_QUERIES[4] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' +# +# +# def test_parse_m_query6(): +# expression: str = M_QUERIES[5] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query7(): +# expression: str = M_QUERIES[6] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query8(): +# expression: str = M_QUERIES[7] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query9(): +# expression: str = M_QUERIES[8] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query10(): +# expression: str = M_QUERIES[9] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' +# +# +# def test_parse_m_query11(): +# expression: str = M_QUERIES[10] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query12(): +# expression: str = M_QUERIES[11] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query13(): +# expression: str = M_QUERIES[12] +# parse_tree: Tree = m_parser._parse_expression(expression) +# assert m_parser._get_output_variable(parse_tree) == "two_source_table" +# +# +# def test_snowflake_regular_case(): +# q: str = M_QUERIES[0] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# data_platform_tables: 
List[DataPlatformTable] = m_parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "TESTTABLE" +# assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" +# assert ( +# data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value +# ) +# +# +# def test_postgres_regular_case(): +# q: str = M_QUERIES[13] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "order_date" +# assert data_platform_tables[0].full_name == "mics.public.order_date" +# assert ( +# data_platform_tables[0].platform_type +# == SupportedDataPlatform.POSTGRES_SQL.value +# ) +# +# +# def test_oracle_regular_case(): +# q: str = M_QUERIES[14] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "EMPLOYEES" +# assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" +# assert data_platform_tables[0].platform_type == SupportedDataPlatform.ORACLE.value +# +# +# def test_mssql_regular_case(): +# q: str = M_QUERIES[15] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# +# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "book_issue" +# assert data_platform_tables[0].full_name == "library.dbo.book_issue" +# assert data_platform_tables[0].platform_type == SupportedDataPlatform.MS_SQL.value + +def test_advance_use_case(): - -def test_parse_m_query2(): - expression: str = M_QUERIES[1] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' - - -def test_parse_m_query3(): - expression: str = M_QUERIES[2] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' - - -def test_parse_m_query4(): - expression: str = M_QUERIES[3] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' - - -def test_parse_m_query5(): - expression: str = M_QUERIES[4] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' - - -def test_parse_m_query6(): - expression: str = M_QUERIES[5] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' - - -def test_parse_m_query7(): - expression: str = M_QUERIES[6] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == "Source" - - -def test_parse_m_query8(): - expression: str = M_QUERIES[7] - parse_tree: Tree = 
m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query9(): - expression: str = M_QUERIES[8] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query10(): - expression: str = M_QUERIES[9] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' - - -def test_parse_m_query11(): - expression: str = M_QUERIES[10] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == "Source" - - -def test_parse_m_query12(): - expression: str = M_QUERIES[11] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' - - -def test_parse_m_query13(): - expression: str = M_QUERIES[12] - parse_tree: Tree = m_parser._parse_expression(expression) - assert m_parser._get_output_variable(parse_tree) == "two_source_table" - - -def test_snowflake_regular_case(): - q: str = M_QUERIES[0] table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, + expression=M_QUERIES[1], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) + m_parser.get_upstream_tables(table, PowerBiDashboardSourceReport()) - reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" - assert ( - data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value - ) - - -def test_postgres_regular_case(): - q: str = M_QUERIES[13] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert ( - data_platform_tables[0].platform_type - == SupportedDataPlatform.POSTGRES_SQL.value - ) - - -def test_oracle_regular_case(): - q: str = M_QUERIES[14] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].platform_type == SupportedDataPlatform.ORACLE.value - - -def test_mssql_regular_case(): - q: str = M_QUERIES[15] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - - data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - 
assert data_platform_tables[0].platform_type == SupportedDataPlatform.MS_SQL.value From 33a31506bb603753d8b27f346d0468575d534964 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 14 Dec 2022 22:32:28 +0530 Subject: [PATCH 22/53] mssql key --- .../src/datahub/ingestion/source/powerbi/m_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py index b12f804c00b73..41ccdeffd1940 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser.py @@ -33,6 +33,7 @@ class SupportedDataPlatform(Enum): SupportedDataPlatform.POSTGRES_SQL.value: "postgres", SupportedDataPlatform.ORACLE.value: "oracle", SupportedDataPlatform.SNOWFLAKE.value: "snowflake", + SupportedDataPlatform.MS_SQL.value: "mssql", } From aecb69585310be774895d2f81ef2b1dda6cc8aa1 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 15 Dec 2022 17:12:51 +0530 Subject: [PATCH 23/53] WIP --- .../ingestion/source/powerbi/config.py | 9 +- .../source/powerbi/m_query/__init__.py | 0 .../source/powerbi/m_query/parser.py | 67 +++ .../source/powerbi/m_query/resolver.py | 406 ++++++++++++++++++ .../source/powerbi/m_query/tree_function.py | 143 ++++++ .../source/powerbi/m_query/validator.py | 43 ++ .../ingestion/source/powerbi/powerbi.py | 27 +- .../integration/powerbi/test_m_parser.py | 329 +++++++------- 8 files changed, 845 insertions(+), 179 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/__init__.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index db0f5858d997e..5d6c3dc0529d7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -7,7 +7,6 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.source_common import DEFAULT_ENV, EnvBasedSourceConfigBase from datahub.ingestion.api.source import SourceReport -from datahub.ingestion.source.powerbi import m_parser class Constant: @@ -130,16 +129,14 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): @validator("dataset_type_mapping") @classmethod - def check_dataset_type_mapping(cls, value): - # For backward compatibility map input PostgreSql to PostgreSQL + def map_data_platform(cls, value): + # For backward compatibility convert input PostgreSql to PostgreSQL + # PostgreSQL is name of the data-platform in M-Query if "PostgreSql" in value.keys(): platform_name = value["PostgreSql"] del value["PostgreSql"] value["PostgreSQL"] = platform_name - for key in value.keys(): - if key not in m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING.keys(): - raise ValueError(f"DataPlatform {key} is not supported") return value diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/__init__.py new file mode 100644 index 
0000000000000..e69de29bb2d1d diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py new file mode 100644 index 0000000000000..aa929b475529a --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -0,0 +1,67 @@ +import importlib.resources as pkg_resource +import logging +from typing import List, Optional + +import lark +from lark import Lark, Tree + +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI +from datahub.ingestion.source.powerbi.m_query import validator +from datahub.ingestion.source.powerbi.m_query import resolver + +LOGGER = logging.getLogger(__name__) + + +def _parse_expression(expression: str) -> Tree: + # Read lexical grammar as text + grammar: str = pkg_resource.read_text( + "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" + ) + + # Create lark parser for the grammar text + lark_parser = Lark(grammar, start="let_expression", regex=True) + + parse_tree: Tree = lark_parser.parse(expression) + + LOGGER.debug("Parse Tree") + if ( + LOGGER.level == logging.DEBUG + ): # Guard condition to avoid heavy pretty() function call + LOGGER.debug(parse_tree.pretty()) + + return parse_tree + + +def get_upstream_tables( + table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport +) -> List[resolver.DataPlatformTable]: + if table.expression is None: + reporter.report_warning(table.full_name, "Expression is none") + return [] + + try: + parse_tree: Tree = _parse_expression(table.expression) + except lark.exceptions.UnexpectedCharacters as e: + LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) + reporter.report_warning( + table.full_name, f"UnSupported expression = {table.expression}" + ) + return [] + + resolver_enum: Optional[resolver.SupportedDataPlatform] = resolver.get_resolver(parse_tree) + if resolver_enum is None: + LOGGER.debug("Table full-name = %s", table.full_name) + LOGGER.debug("Expression = %s", table.expression) + reporter.report_warning( + table.full_name, + f"{table.full_name} M-Query resolver not found for the table expression", + ) + return [] + + return resolver_enum.get_m_query_resolver()( + table=table, + parse_tree=parse_tree, + data_platform_pair=resolver_enum.get_data_platform_pair(), + reporter=reporter, + ).resolve_to_data_platform_table_list() # type: ignore diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py new file mode 100644 index 0000000000000..a71db5f6b1145 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -0,0 +1,406 @@ +import logging +from abc import ABC, abstractmethod +from typing import Dict, Optional, List, cast, Tuple, Type, Any + +from lark import Tree + +from dataclasses import dataclass +from enum import Enum + +from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.proxy import PowerBiAPI + +from datahub.ingestion.source.powerbi.m_query import tree_function + +LOGGER = logging.getLogger(__name__) + + +@dataclass +class DataPlatformPair: + datahub_data_platform_name: str + powerbi_data_platform_name: str + + +@dataclass +class DataPlatformTable: + name: str + full_name: str + data_platform_pair: DataPlatformPair + + +class 
AbstractMQueryResolver(ABC): + pass + + +class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): + table: PowerBiAPI.Table + parse_tree: Tree + reporter: PowerBiDashboardSourceReport + data_platform_pair: DataPlatformPair + + def __init__( + self, + table: PowerBiAPI.Table, + parse_tree: Tree, + data_platform_pair: DataPlatformPair, + reporter: PowerBiDashboardSourceReport, + ): + self.table = table + self.parse_tree = parse_tree + self.reporter = reporter + self.data_platform_pair = data_platform_pair + + @abstractmethod + def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + pass + + +class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): + def get_item_selector_tokens( + self, variable_statement: Tree + ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: + expression_tree: Optional[Tree] = tree_function.first_expression_func(variable_statement) + if expression_tree is None: + LOGGER.debug("Expression tree not found") + LOGGER.debug(variable_statement.pretty()) + return None, None + + item_selector: Optional[Tree] = tree_function.first_item_selector_func(expression_tree) + if item_selector is None: + LOGGER.debug("Item Selector not found in tree") + LOGGER.debug(variable_statement.pretty()) + return None, None + + identifier_tree: Optional[Tree] = tree_function.first_identifier_func(expression_tree) + if identifier_tree is None: + LOGGER.debug("Identifier not found in tree") + LOGGER.debug(variable_statement.pretty()) + return None, None + + # remove whitespaces and quotes from token + tokens: List[str] = tree_function.strip_char_from_list( + tree_function.remove_whitespaces_from_list(tree_function.token_values(cast(Tree, item_selector))), + '"', + ) + identifier: List[str] = tree_function.token_values( + cast(Tree, identifier_tree) + ) # type :ignore + # convert tokens to dict + iterator = iter(tokens) + # cast to satisfy lint + return identifier[0], dict(zip(iterator, iterator)) + + def get_argument_list(self, variable_statement: Tree) -> Optional[List[str]]: + expression_tree: Optional[Tree] = tree_function.first_expression_func(variable_statement) + if expression_tree is None: + LOGGER.debug("First expression rule not found in input tree") + return None + + argument_list: Optional[Tree] = tree_function.first_arg_list_func(expression_tree) + if argument_list is None: + LOGGER.debug("First argument-list rule not found in input tree") + return None + + # remove whitespaces and quotes from token + tokens: List[str] = tree_function.strip_char_from_list( + tree_function.remove_whitespaces_from_list(tree_function.token_values(argument_list)), '"' + ) + return tokens + + def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + data_platform_tables: List[DataPlatformTable] = [] + # Look for output variable + output_variable: Optional[str] = tree_function.get_output_variable(self.parse_tree) + if output_variable is None: + self.reporter.report_warning( + f"{self.table.full_name}-output-variable", + "output-variable not found in table expression", + ) + return data_platform_tables + + full_table_name: Optional[str] = self.get_full_table_name(output_variable) + if full_table_name is None: + LOGGER.debug( + "Fail to form full_table_name for PowerBI DataSet table %s", + self.table.full_name, + ) + return data_platform_tables + + return [ + DataPlatformTable( + name=full_table_name.split(".")[-1], + full_name=full_table_name, + data_platform_pair=self.data_platform_pair + ), + ] + + @abstractmethod + def get_full_table_name(self, 
output_variable: str) -> Optional[str]: + pass + + +class DefaultTwoStepDataAccessSources(BaseMQueryResolver, ABC): + """ + These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + """ + + def get_full_table_name(self, output_variable: str) -> Optional[str]: + variable_statement: Optional[Tree] = tree_function.get_variable_statement( + self.parse_tree, output_variable + ) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({output_variable}) statement not found in table expression", + ) + return None + source, tokens = self.get_item_selector_tokens(cast(Tree, variable_statement)) + if source is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Schema detail not found in table expression", + ) + return None + + schema_name: str = tokens["Schema"] + table_name: str = tokens["Item"] + # Look for database-name + variable_statement = tree_function.get_variable_statement(self.parse_tree, source) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-statement", + f"source variable {source} statement not found in table expression", + ) + return None + arg_list = self.get_argument_list(cast(Tree, variable_statement)) + if arg_list is None or len(arg_list) < 1: + self.reporter.report_warning( + f"{self.table.full_name}-database-arg-list", + "Expected number of argument not found in data-access function of table expression", + ) + return None + + database_name: str = cast(List[str], arg_list)[1] # 1st token is database name + return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") + + +class PostgresMQueryResolver(DefaultTwoStepDataAccessSources): + pass + + +class MSSqlMQueryResolver(DefaultTwoStepDataAccessSources): + pass + + +class OracleMQueryResolver(BaseMQueryResolver): + + def _get_db_name(self, value: str) -> Optional[str]: + error_message: str = f"The target argument ({value}) should in the format of :/[.]" + splitter_result: List[str] = value.split("/") + if len(splitter_result) != 2: + self.reporter.report_warning( + f"{self.table.full_name}-oracle-target", error_message + ) + return None + + db_name = splitter_result[1].split(".")[0] + + return db_name + + def get_full_table_name(self, output_variable: str) -> Optional[str]: + # Find step for the output variable + variable_statement: Optional[Tree] = tree_function.get_variable_statement( + self.parse_tree, output_variable + ) + + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({output_variable}) statement not found in table expression", + ) + return None + + schema_variable, tokens = self.get_item_selector_tokens( + cast(Tree, variable_statement) + ) + if schema_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "table name not found in table expression", + ) + return None + + table_name: str = tokens["Name"] + + # Find step for the schema variable + variable_statement = tree_function.get_variable_statement( + self.parse_tree, cast(str, schema_variable) + ) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-schema-variable-statement", + f"schema variable 
({schema_variable}) statement not found in table expression", + ) + return None + + source_variable, tokens = self.get_item_selector_tokens(variable_statement) + if source_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Schema not found in table expression", + ) + return None + + schema_name: str = tokens["Schema"] + + # Find step for the database access variable + variable_statement = tree_function.get_variable_statement(self.parse_tree, source_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-variable-statement", + f"schema variable ({source_variable}) statement not found in table expression", + ) + return None + arg_list = self.get_argument_list(variable_statement) + if arg_list is None or len(arg_list) < 1: + self.reporter.report_warning( + f"{self.table.full_name}-database-arg-list", + "Expected number of argument not found in data-access function of table expression", + ) + return None + # The first argument has database name. format localhost:1521/salesdb.GSLAB.COM + db_name: Optional[str] = self._get_db_name(arg_list[0]) + if db_name is None: + LOGGER.debug(f"Fail to extract db name from the target {arg_list}") + + return f"{db_name}.{schema_name}.{table_name}" + + +class SnowflakeMQueryResolver(BaseMQueryResolver): + + def get_full_table_name(self, output_variable: str) -> Optional[str]: + # Find step for the output variable + variable_statement: Optional[Tree] = tree_function.get_variable_statement( + self.parse_tree, output_variable + ) + + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({output_variable}) statement not found in table expression", + ) + return None + + schema_variable, tokens = self.get_item_selector_tokens(variable_statement) + if schema_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "table name not found in table expression", + ) + return None + + table_name: str = tokens["Name"] + + # Find step for the schema variable + variable_statement = tree_function.get_variable_statement(self.parse_tree, schema_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-schema-variable-statement", + f"schema variable ({schema_variable}) statement not found in table expression", + ) + return None + + source_variable, tokens = self.get_item_selector_tokens(variable_statement) + if source_variable is None or tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "schema name not found in table expression", + ) + return None + + schema_name: str = tokens["Name"] + + # Find step for the database access variable + variable_statement = tree_function.get_variable_statement(self.parse_tree, source_variable) + if variable_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-source-variable-statement", + f"schema variable ({source_variable}) statement not found in table expression", + ) + return None + _, tokens = self.get_item_selector_tokens(variable_statement) + if tokens is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "database name not found in table expression", + ) + return None + + db_name: str = tokens["Name"] + + return f"{db_name}.{schema_name}.{table_name}" + + +class SupportedDataPlatform(Enum): + POSTGRES_SQL = ( + DataPlatformPair( + 
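
For the two-step pattern quoted in the DefaultTwoStepDataAccessSources docstring, the pieces combine as in the worked example below. The values come from that docstring's M snippet; the variable names here are illustrative only, not the module's API.

    # Two-step pattern: one step opens the database, the next selects the table.
    #   let
    #       Source = Sql.Database("localhost", "library"),
    #       dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]
    #   in
    #       dbo_book_issue
    #
    # get_item_selector_tokens() flattens the item selector into tokens and then
    # pairs them up into a dict:
    tokens = ["Schema", "dbo", "Item", "book_issue"]
    it = iter(tokens)
    selector = dict(zip(it, it))        # {"Schema": "dbo", "Item": "book_issue"}

    # get_argument_list() yields the arguments of Sql.Database(); the second one
    # is the database name, which completes the three-part table name:
    argument_list = ["localhost", "library"]
    full_table_name = f"{argument_list[1]}.{selector['Schema']}.{selector['Item']}"
    print(full_table_name)              # library.dbo.book_issue
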
powerbi_data_platform_name="PostgreSQL", + datahub_data_platform_name="postgres" + ), + PostgresMQueryResolver + ) + ORACLE = ( + DataPlatformPair( + powerbi_data_platform_name="Oracle", + datahub_data_platform_name="oracle" + ), + OracleMQueryResolver + ) + SNOWFLAKE = ( + DataPlatformPair( + powerbi_data_platform_name="Snowflake", + datahub_data_platform_name="snowflake" + ), + SnowflakeMQueryResolver + ) + MS_SQL = ( + DataPlatformPair( + powerbi_data_platform_name="Sql", + datahub_data_platform_name="mssql" + ), + MSSqlMQueryResolver + ) + + def get_data_platform_pair(self) -> DataPlatformPair: + return self.value[0] + + def get_m_query_resolver(self) -> Type[BaseMQueryResolver]: + return self.value[1] + + +def get_resolver(parse_tree: Tree) -> Optional[SupportedDataPlatform]: + + _filter: Any = parse_tree.find_data("invoke_expression") + + letter_tree: Tree = next(_filter).children[0] + data_access_func: str = tree_function.make_function_name(letter_tree) + + LOGGER.debug( + "Looking for data-access(%s) resolver", + data_access_func, + ) + + # Take platform name from data_access_func variable + platform_name: str = data_access_func.split(".")[0] + for platform in SupportedDataPlatform: + if platform.get_data_platform_pair().powerbi_data_platform_name == platform_name: + return platform + + LOGGER.info("M-Query resolver not found for data access function %s", data_access_func) + + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py new file mode 100644 index 0000000000000..91c9550903bd8 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -0,0 +1,143 @@ +import logging +from typing import Optional, List, Union, cast, Any + +from functools import partial + +from lark import Token, Tree + +LOGGER = logging.getLogger(__name__) + + +def get_output_variable(root: Tree) -> Optional[str]: + in_expression_tree: Optional[Tree] = get_first_rule(root, "in_expression") + if in_expression_tree is None: + return None + # Get list of terminal value + # Remove any whitespaces + # Remove any spaces + return "".join( + strip_char_from_list( + remove_whitespaces_from_list(token_values(in_expression_tree)), " " + ) + ) + + +def get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: + _filter = parse_tree.find_data("variable") + # filter will return statement of the form = + # We are searching for Tree where variable-name is matching with provided variable + for tree in _filter: + values: List[str] = token_values(tree.children[0]) + actual_value: str = "".join(strip_char_from_list(values, " ")) + LOGGER.debug("Actual Value = %s", actual_value) + LOGGER.debug("Expected Value = %s", variable) + + if actual_value == variable: + return tree + + LOGGER.info("Provided variable(%s) not found in variable rule", variable) + + return None + + +def get_first_rule(tree: Tree, rule: str) -> Optional[Tree]: + """ + Lark library doesn't have advance search function. 
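
A minimal, self-contained illustration of this kind of recursive rule search follows. It uses a hand-built lark Tree and hypothetical helper names; the bundled powerbi-lexical-grammar.rule is not reproduced here.

    from typing import Optional, Union
    from lark import Token, Tree

    # Hand-built stand-in for a parsed "let Source = ..., Output = ... in Output"
    tree = Tree("let_expression", [
        Tree("variable", [Tree("identifier", [Token("NAME", "Source")])]),
        Tree("variable", [Tree("identifier", [Token("NAME", "Output")])]),
        Tree("in_expression", [Token("NAME", "Output")]),
    ])

    def first_rule(node: Union[Tree, Token], rule: str) -> Optional[Tree]:
        # Same shape as tree_function.get_first_rule(): depth-first search for
        # the first subtree whose rule name matches.
        if isinstance(node, Token):
            return None
        if node.data == rule:
            return node
        for child in node.children:
            found = first_rule(child, rule)
            if found is not None:
                return found
        return None

    in_expr = first_rule(tree, "in_expression")
    assert in_expr is not None
    # token_values() plus whitespace stripping boils the subtree down to its leaf text:
    output_variable = "".join(tok.value for tok in in_expr.scan_values(lambda t: isinstance(t, Token)))
    print(output_variable)  # Output
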
+ This function will return the first tree of provided rule + :param tree: Tree to search for the expression rule + :return: Tree + """ + + def internal(node: Union[Tree, Token]) -> Optional[Tree]: + if isinstance(node, Tree) and node.data == rule: + return node + if isinstance(node, Token): + return None + + for child in cast(Tree, node).children: + child_node: Optional[Tree] = internal(child) + if child_node is not None: + return child_node + + return None + + expression_tree: Optional[Tree] = internal(tree) + + return expression_tree + + +def token_values(tree: Tree) -> List[str]: + """ + + :param tree: Tree to traverse + :return: List of leaf token data + """ + values: List[str] = [] + + def internal(node: Union[Tree, Token]) -> None: + if isinstance(node, Token): + values.append(cast(Token, node).value) + return + + for child in node.children: + internal(child) + + internal(tree) + + return values + + +def remove_whitespaces_from_list(values: List[str]) -> List[str]: + result: List[str] = [] + for item in values: + if item.strip() not in ("", "\n", "\t"): + result.append(item) + + return result + + +def strip_char_from_list(values: List[str], char: str) -> List[str]: + result: List[str] = [] + for item in values: + result.append(item.strip(char)) + + return result + + +def make_function_name(tree: Tree) -> str: + values: List[str] = token_values(tree) + return ".".join(values) + + +def get_all_function_name(tree: Tree) -> List[str]: + """ + Returns all function name present in input tree + :param tree: Input lexical tree + :return: list of function name + """ + functions: List[str] = [] + + # List the all invoke_expression in the Tree + _filter: Any = tree.find_data("invoke_expression") + + for node in _filter: + LOGGER.debug("Tree = %s", node.pretty()) + primary_expression_node: Optional[Tree] = first_primary_expression_func(node) + if primary_expression_node is None: + continue + + identifier_node: Optional[Tree] = first_identifier_func(primary_expression_node) + if identifier_node is None: + continue + + functions.append(make_function_name(identifier_node)) + + return functions + + +first_expression_func = partial(get_first_rule, rule="expression") +first_item_selector_func = partial(get_first_rule, rule="item_selector") +first_arg_list_func = partial(get_first_rule, rule="argument_list") +first_identifier_func = partial(get_first_rule, rule="identifier") +first_primary_expression_func = partial(get_first_rule, rule="primary_expression") +first_identifier_func = partial(get_first_rule, rule="identifier") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py new file mode 100644 index 0000000000000..9f3664bfb5f41 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -0,0 +1,43 @@ +import logging + +from datahub.ingestion.source.powerbi.m_query import tree_function + +from typing import List, Tuple, Optional +from lark import Tree + +LOGGER = logging.getLogger(__name__) + + +def any_one_should_present(supported_funcs: List[str], functions: List[str]) -> Tuple[bool, Optional[str]]: + """ + Anyone functions from supported_funcs should present in functions list + :param supported_funcs: List of function m_query module supports + :param functions: List of functions retrieved from expression + :return: True or False + """ + for f in supported_funcs: + if f in functions: + return True, None + + return False, f"Function from 
supported function list {supported_funcs} not found" + + +def all_function_should_be_known(supported_funcs: List[str], functions: List[str]) -> Tuple[bool, Optional[str]]: + for f in functions: + if f not in supported_funcs: + return False, f"Function {f} is unknown" + + return True, None + + +def validate_parse_tree(supported_funcs: List[str], tree: Tree) -> Tuple[bool, str]: + """ + :param supported_funcs: List of supported functions + :param tree: tree to validate as per functions supported by m_parser module + :return: first argument is False if validation is failed and second argument would contain the error message. + in-case of valid tree the first argument is True and second argument would be None. + """ + functions: List[str] = tree_function.get_all_function_name(tree) + if len(functions) == 0: + return False, "Function call not found" + diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index f172cc37cc721..b573956b0fd7b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -21,14 +21,15 @@ ) from datahub.ingestion.api.source import Source, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.powerbi import m_parser from datahub.ingestion.source.powerbi.config import ( Constant, PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) -from datahub.ingestion.source.powerbi.m_parser import DataPlatformTable + +from datahub.ingestion.source.powerbi.m_query import resolver +from datahub.ingestion.source.powerbi.m_query import parser from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps from datahub.metadata.schema_classes import ( @@ -161,18 +162,17 @@ def __to_datahub_dataset( if self.__config.extract_lineage is True: # Check if upstreams table is available, parse them and create dataset URN for each upstream table upstreams: List[UpstreamClass] = [] - upstream_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( + upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( table, self.__reporter ) for upstream_table in upstream_tables: + if upstream_table.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping[upstream_table.platform_type]: + continue + platform: Union[ str, PlatformDetail ] = self.__config.dataset_type_mapping[upstream_table.platform_type] - platform_name: str = ( - m_parser.POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING[ - upstream_table.platform_type - ] - ) + platform_name: str = upstream_table.data_platform_pair.datahub_data_platform_name platform_instance_name: Optional[str] = None platform_env: str = DEFAULT_ENV # Determine if PlatformDetail is provided @@ -730,12 +730,21 @@ def create(cls, config_dict, ctx): config = PowerBiDashboardSourceConfig.parse_obj(config_dict) return cls(config, ctx) + def validate_dataset_type_mapping(self): + powerbi_data_platforms: List[str] = [data_platform.get_data_platform_pair().powerbi_data_platform_name for data_platform + in resolver.SupportedDataPlatform] + + for key in self.source_config.keys(): + if key not in powerbi_data_platforms: + raise ValueError(f"PowerBI DataPlatform {key} is not supported") + def get_workunits(self) -> Iterable[MetadataWorkUnit]: """ Datahub Ingestion framework invoke this method """ 
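
The new validate_dataset_type_mapping() hook boils down to the check sketched below. The supported PowerBI platform names are written out literally here (in the patch they come from resolver.SupportedDataPlatform); the helper name is illustrative only.

    # Keys of dataset_type_mapping must be PowerBI platform names the
    # m_query resolvers understand; anything else is rejected up front.
    SUPPORTED_POWERBI_PLATFORMS = {"PostgreSQL", "Oracle", "Snowflake", "Sql"}

    def check_dataset_type_mapping(dataset_type_mapping: dict) -> None:
        for key in dataset_type_mapping:
            if key not in SUPPORTED_POWERBI_PLATFORMS:
                raise ValueError(f"PowerBI DataPlatform {key} is not supported")

    check_dataset_type_mapping({"PostgreSQL": "postgres", "Sql": "mssql"})  # passes
    # check_dataset_type_mapping({"MySql": "mysql"})  # -> ValueError
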
LOGGER.info("PowerBi plugin execution is started") - + # Validate dataset type mapping + self.validate_dataset_type_mapping() # Fetch PowerBi workspace for given workspace identifier workspace = self.powerbi_client.get_workspace( self.source_config.workspace_id, self.reporter diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 917532b783976..ca9bcb5f7fad5 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -2,9 +2,12 @@ from lark import Tree -from datahub.ingestion.source.powerbi import m_parser +from datahub.ingestion.source.powerbi.m_query import ( + parser, + tree_function +) from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_parser import ( +from datahub.ingestion.source.powerbi.m_query.resolver import ( DataPlatformTable, SupportedDataPlatform, ) @@ -30,171 +33,169 @@ ] -# def test_parse_m_query1(): -# expression: str = M_QUERIES[0] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "TESTTABLE_Table" -# -# -# def test_parse_m_query2(): -# expression: str = M_QUERIES[1] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom2"' -# -# -# def test_parse_m_query3(): -# expression: str = M_QUERIES[2] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Conditional Column"' -# -# -# def test_parse_m_query4(): -# expression: str = M_QUERIES[3] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Changed Type"' -# -# -# def test_parse_m_query5(): -# expression: str = M_QUERIES[4] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Renamed Columns"' -# -# -# def test_parse_m_query6(): -# expression: str = M_QUERIES[5] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query7(): -# expression: str = M_QUERIES[6] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query8(): -# expression: str = M_QUERIES[7] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query9(): -# expression: str = M_QUERIES[8] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query10(): -# expression: str = M_QUERIES[9] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Changed Type1"' -# -# -# def test_parse_m_query11(): -# expression: str = M_QUERIES[10] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query12(): -# expression: str = M_QUERIES[11] -# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query13(): -# expression: str = M_QUERIES[12] 
-# parse_tree: Tree = m_parser._parse_expression(expression) -# assert m_parser._get_output_variable(parse_tree) == "two_source_table" -# -# -# def test_snowflake_regular_case(): -# q: str = M_QUERIES[0] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "TESTTABLE" -# assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" -# assert ( -# data_platform_tables[0].platform_type == SupportedDataPlatform.SNOWFLAKE.value -# ) -# -# -# def test_postgres_regular_case(): -# q: str = M_QUERIES[13] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "order_date" -# assert data_platform_tables[0].full_name == "mics.public.order_date" -# assert ( -# data_platform_tables[0].platform_type -# == SupportedDataPlatform.POSTGRES_SQL.value -# ) -# -# -# def test_oracle_regular_case(): -# q: str = M_QUERIES[14] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "EMPLOYEES" -# assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" -# assert data_platform_tables[0].platform_type == SupportedDataPlatform.ORACLE.value -# -# -# def test_mssql_regular_case(): -# q: str = M_QUERIES[15] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# -# data_platform_tables: List[DataPlatformTable] = m_parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "book_issue" -# assert data_platform_tables[0].full_name == "library.dbo.book_issue" -# assert data_platform_tables[0].platform_type == SupportedDataPlatform.MS_SQL.value - -def test_advance_use_case(): +def test_parse_m_query1(): + expression: str = M_QUERIES[0] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" + + +def test_parse_m_query2(): + expression: str = M_QUERIES[1] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' + + +def test_parse_m_query3(): + expression: str = M_QUERIES[2] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' + + +def test_parse_m_query4(): + expression: str = M_QUERIES[3] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' + + +def test_parse_m_query5(): + expression: 
str = M_QUERIES[4] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' + + +def test_parse_m_query6(): + expression: str = M_QUERIES[5] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query7(): + expression: str = M_QUERIES[6] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query8(): + expression: str = M_QUERIES[7] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' + +def test_parse_m_query9(): + expression: str = M_QUERIES[8] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query10(): + expression: str = M_QUERIES[9] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' + + +def test_parse_m_query11(): + expression: str = M_QUERIES[10] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query12(): + expression: str = M_QUERIES[11] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query13(): + expression: str = M_QUERIES[12] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "two_source_table" + + +def test_snowflake_regular_case(): + q: str = M_QUERIES[0] table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=M_QUERIES[1], + expression=q, name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) - m_parser.get_upstream_tables(table, PowerBiDashboardSourceReport()) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "TESTTABLE" + assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.get_data_platform_pair().powerbi_data_platform_name + ) + + +def test_postgres_regular_case(): + q: str = M_QUERIES[13] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "order_date" + assert data_platform_tables[0].full_name == "mics.public.order_date" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.POSTGRES_SQL.get_data_platform_pair().powerbi_data_platform_name + ) + + +def test_oracle_regular_case(): + q: str = M_QUERIES[14] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + 
table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "EMPLOYEES" + assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.ORACLE.get_data_platform_pair().powerbi_data_platform_name + ) + + +def test_mssql_regular_case(): + q: str = M_QUERIES[15] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "book_issue" + assert data_platform_tables[0].full_name == "library.dbo.book_issue" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.MS_SQL.get_data_platform_pair().powerbi_data_platform_name + ) From 776a78760158c7df21c419e8a09b7485db6bffe9 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 15 Dec 2022 21:22:00 +0530 Subject: [PATCH 24/53] text fixes --- .../source/powerbi/m_query/parser.py | 13 ++++++++- .../source/powerbi/m_query/resolver.py | 27 ++++++++++++++----- .../source/powerbi/m_query/validator.py | 17 +++++++++--- .../integration/powerbi/test_m_parser.py | 16 +++++++++++ 4 files changed, 62 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index aa929b475529a..341a9a215dfcb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,6 +6,7 @@ from lark import Lark, Tree from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport + from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.ingestion.source.powerbi.m_query import validator from datahub.ingestion.source.powerbi.m_query import resolver @@ -34,7 +35,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( - table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport + table: PowerBiAPI.Table, + reporter: PowerBiDashboardSourceReport, + native_query_enabled: bool = True, ) -> List[resolver.DataPlatformTable]: if table.expression is None: reporter.report_warning(table.full_name, "Expression is none") @@ -42,6 +45,14 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) + valid, message = validator.validate_parse_tree(parse_tree, native_query_enabled=native_query_enabled) + if valid is False: + LOGGER.debug("Validation failed: %s", message) + reporter.report_warning( + table.full_name, + message + ) + return [] except lark.exceptions.UnexpectedCharacters as e: LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) reporter.report_warning( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index a71db5f6b1145..bb20b191689b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -346,34 +346,46 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: return 
f"{db_name}.{schema_name}.{table_name}" +class FunctionName(Enum): + NATIVE_QUERY = "Value.NativeQuery" + POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" + ORACLE_DATA_ACCESS = "Oracle.Database" + SNOWFLAKE_DATA_ACCESS = "Snowflake.Databases" + MSSQL_DATA_ACCESS = "Sql.Database" + + class SupportedDataPlatform(Enum): POSTGRES_SQL = ( DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" ), - PostgresMQueryResolver + PostgresMQueryResolver, + FunctionName.POSTGRESQL_DATA_ACCESS, ) ORACLE = ( DataPlatformPair( powerbi_data_platform_name="Oracle", datahub_data_platform_name="oracle" ), - OracleMQueryResolver + OracleMQueryResolver, + FunctionName.ORACLE_DATA_ACCESS, ) SNOWFLAKE = ( DataPlatformPair( powerbi_data_platform_name="Snowflake", datahub_data_platform_name="snowflake" ), - SnowflakeMQueryResolver + SnowflakeMQueryResolver, + FunctionName.SNOWFLAKE_DATA_ACCESS, ) MS_SQL = ( DataPlatformPair( powerbi_data_platform_name="Sql", datahub_data_platform_name="mssql" ), - MSSqlMQueryResolver + MSSqlMQueryResolver, + FunctionName.MSSQL_DATA_ACCESS, ) def get_data_platform_pair(self) -> DataPlatformPair: @@ -382,6 +394,9 @@ def get_data_platform_pair(self) -> DataPlatformPair: def get_m_query_resolver(self) -> Type[BaseMQueryResolver]: return self.value[1] + def get_function_name(self) -> FunctionName: + return self.value[2] + def get_resolver(parse_tree: Tree) -> Optional[SupportedDataPlatform]: @@ -395,10 +410,8 @@ def get_resolver(parse_tree: Tree) -> Optional[SupportedDataPlatform]: data_access_func, ) - # Take platform name from data_access_func variable - platform_name: str = data_access_func.split(".")[0] for platform in SupportedDataPlatform: - if platform.get_data_platform_pair().powerbi_data_platform_name == platform_name: + if platform.get_function_name().value == data_access_func: return platform LOGGER.info("M-Query resolver not found for data access function %s", data_access_func) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index 9f3664bfb5f41..3941e4ed38ed5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -1,8 +1,9 @@ import logging from datahub.ingestion.source.powerbi.m_query import tree_function +from datahub.ingestion.source.powerbi.m_query import resolver -from typing import List, Tuple, Optional +from typing import List, Tuple, Optional, Set from lark import Tree LOGGER = logging.getLogger(__name__) @@ -30,10 +31,10 @@ def all_function_should_be_known(supported_funcs: List[str], functions: List[str return True, None -def validate_parse_tree(supported_funcs: List[str], tree: Tree) -> Tuple[bool, str]: +def validate_parse_tree(tree: Tree, native_query_enabled: bool = True) -> Tuple[bool, str]: """ - :param supported_funcs: List of supported functions :param tree: tree to validate as per functions supported by m_parser module + :param native_query_enabled: Whether user want to extract lineage from native query :return: first argument is False if validation is failed and second argument would contain the error message. in-case of valid tree the first argument is True and second argument would be None. 
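
The body that follows implements the rule sketched below: exactly one known data-access function must appear in the expression, and Value.NativeQuery is rejected when native-query lineage is disabled. This is a condensed, runnable sketch with illustrative names, not the module's API.

    from typing import List, Optional, Tuple

    DATA_ACCESS_FUNCTIONS = {
        "PostgreSQL.Database",
        "Oracle.Database",
        "Snowflake.Databases",
        "Sql.Database",
    }
    NATIVE_QUERY_FUNCTION = "Value.NativeQuery"

    def validate(functions: List[str], native_query_enabled: bool = True) -> Tuple[bool, Optional[str]]:
        if not functions:
            return False, "Function call not found"
        found = DATA_ACCESS_FUNCTIONS & set(functions)
        if len(found) != 1:
            return False, f"Expected exactly one data-access function, found: {sorted(found)}"
        if not native_query_enabled and NATIVE_QUERY_FUNCTION in functions:
            return False, "Lineage extraction from native query is disabled."
        return True, None

    print(validate(["Sql.Database", "Table.TransformColumnTypes"]))       # (True, None)
    print(validate(["Value.NativeQuery", "Snowflake.Databases"], False))  # (False, '... disabled.')
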
""" @@ -41,3 +42,13 @@ def validate_parse_tree(supported_funcs: List[str], tree: Tree) -> Tuple[bool, s if len(functions) == 0: return False, "Function call not found" + data_access_function_names: List[str] = [x.get_function_name().value for x in resolver.SupportedDataPlatform] + result: Set[str] = set(data_access_function_names) & set(functions) + if len(result) != 1: + return False, f"More than one data-access functions are found in expression. Functions = {result}" + + if native_query_enabled is False: + if resolver.FunctionName.NATIVE_QUERY.value in functions: + return False, f"Lineage extraction from native query is disabled." + + return True, None diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index ca9bcb5f7fad5..7a51006e1e46f 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -120,6 +120,7 @@ def test_snowflake_regular_case(): ) reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter ) @@ -199,3 +200,18 @@ def test_mssql_regular_case(): == SupportedDataPlatform.MS_SQL.get_data_platform_pair().powerbi_data_platform_name ) + +def test_native_query_disabled(): + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=M_QUERIES[1], + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter, native_query_enabled=False + ) + + assert len(data_platform_tables) == 0 From 0a4a9b0eeef9f88a68281ed91cd174810299a3c2 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Sun, 18 Dec 2022 22:13:25 +0530 Subject: [PATCH 25/53] WIP --- .../ingestion/source/powerbi/m_parser2.py | 580 ------------------ .../source/powerbi/m_query/parser.py | 13 +- .../source/powerbi/m_query/resolver.py | 252 +++++--- .../source/powerbi/m_query/tree_function.py | 6 +- .../integration/powerbi/test_m_parser.py | 353 ++++++----- 5 files changed, 367 insertions(+), 837 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py deleted file mode 100644 index 2979e181ca248..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_parser2.py +++ /dev/null @@ -1,580 +0,0 @@ -import importlib.resources as pkg_resource -import logging -from abc import ABC, abstractmethod -from dataclasses import dataclass -from enum import Enum -from functools import partial -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast - -import lark -from lark import Lark, Token, Tree - -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.proxy import PowerBiAPI - -LOGGER = logging.getLogger(__name__) - - -@dataclass -class DataPlatformTable: - name: str - full_name: str - platform_type: str - - -class SupportedDataPlatform(Enum): - POSTGRES_SQL = "PostgreSQL" - ORACLE = "Oracle" - SNOWFLAKE = "Snowflake" - MS_SQL = "Sql" - - -POWERBI_TO_DATAHUB_DATA_PLATFORM_MAPPING: Dict[str, str] = { - SupportedDataPlatform.POSTGRES_SQL.value: "postgres", - SupportedDataPlatform.ORACLE.value: "oracle", - 
SupportedDataPlatform.SNOWFLAKE.value: "snowflake", -} - - -def _get_output_variable(root: Tree) -> Optional[str]: - in_expression_tree: Optional[Tree] = _get_first_rule(root, "in_expression") - if in_expression_tree is None: - return None - # Get list of terminal value - # Remove any whitespaces - # Remove any spaces - return "".join( - _strip_char_from_list( - _remove_whitespaces_from_list(_token_values(in_expression_tree)), " " - ) - ) - - -def _get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: - _filter = parse_tree.find_data("variable") - # filter will return statement of the form = - # We are searching for Tree where variable-name is matching with provided variable - for tree in _filter: - values: List[str] = _token_values(tree.children[0]) - actual_value: str = "".join(_strip_char_from_list(values, " ")) - LOGGER.debug("Actual Value = %s", actual_value) - LOGGER.debug("Expected Value = %s", variable) - - if actual_value == variable: - return tree - - LOGGER.info("Provided variable(%s) not found in variable rule", variable) - - return None - - -def _get_first_rule(tree: Tree, rule: str) -> Optional[Tree]: - """ - Lark library doesn't have advance search function. - This function will return the first tree of provided rule - :param tree: Tree to search for the expression rule - :return: Tree - """ - - def internal(node: Union[Tree, Token]) -> Optional[Tree]: - if isinstance(node, Tree) and node.data == rule: - return node - if isinstance(node, Token): - return None - - for child in cast(Tree, node).children: - child_node: Optional[Tree] = internal(child) - if child_node is not None: - return child_node - - return None - - expression_tree: Optional[Tree] = internal(tree) - - return expression_tree - - -def _token_values(tree: Tree) -> List[str]: - """ - - :param tree: Tree to traverse - :return: List of leaf token data - """ - values: List[str] = [] - - def internal(node: Union[Tree, Token]) -> None: - if isinstance(node, Token): - values.append(cast(Token, node).value) - return - - for child in node.children: - internal(child) - - internal(tree) - - return values - - -def _remove_whitespaces_from_list(values: List[str]) -> List[str]: - result: List[str] = [] - for item in values: - if item.strip() not in ("", "\n", "\t"): - result.append(item) - - return result - - -def _strip_char_from_list(values: List[str], char: str) -> List[str]: - result: List[str] = [] - for item in values: - result.append(item.strip(char)) - - return result - - -def _make_function_name(tree: Tree) -> str: - values: List[str] = _token_values(tree) - return ".".join(values) - - -class AbstractMQueryResolver(ABC): - pass - - -class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): - table: PowerBiAPI.Table - parse_tree: Tree - reporter: PowerBiDashboardSourceReport - - def __init__( - self, - table: PowerBiAPI.Table, - parse_tree: Tree, - reporter: PowerBiDashboardSourceReport, - ): - self.table = table - self.parse_tree = parse_tree - self.reporter = reporter - self.first_expression_func = partial(_get_first_rule, rule="expression") - self.first_item_selector_func = partial(_get_first_rule, rule="item_selector") - self.first_arg_list_func = partial(_get_first_rule, rule="argument_list") - self.first_identifier_func = partial(_get_first_rule, rule="identifier") - - @abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: - pass - - -class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): - def get_item_selector_tokens( - self, 
variable_statement: Tree - ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: - expression_tree: Optional[Tree] = self.first_expression_func(variable_statement) - if expression_tree is None: - LOGGER.debug("Expression tree not found") - LOGGER.debug(variable_statement.pretty()) - return None, None - - item_selector: Optional[Tree] = self.first_item_selector_func(expression_tree) - if item_selector is None: - LOGGER.debug("Item Selector not found in tree") - LOGGER.debug(variable_statement.pretty()) - return None, None - - identifier_tree: Optional[Tree] = self.first_identifier_func(expression_tree) - if identifier_tree is None: - LOGGER.debug("Identifier not found in tree") - LOGGER.debug(variable_statement.pretty()) - return None, None - - # remove whitespaces and quotes from token - tokens: List[str] = _strip_char_from_list( - _remove_whitespaces_from_list(_token_values(cast(Tree, item_selector))), - '"', - ) - identifier: List[str] = _token_values( - cast(Tree, identifier_tree) - ) # type :ignore - # convert tokens to dict - iterator = iter(tokens) - # cast to satisfy lint - return identifier[0], dict(zip(iterator, iterator)) - - def get_argument_list(self, variable_statement: Tree) -> Optional[List[str]]: - expression_tree: Optional[Tree] = self.first_expression_func(variable_statement) - if expression_tree is None: - LOGGER.debug("First expression rule not found in input tree") - return None - - argument_list: Optional[Tree] = self.first_arg_list_func(expression_tree) - if argument_list is None: - LOGGER.debug("First argument-list rule not found in input tree") - return None - - # remove whitespaces and quotes from token - tokens: List[str] = _strip_char_from_list( - _remove_whitespaces_from_list(_token_values(argument_list)), '"' - ) - return tokens - - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: - data_platform_tables: List[DataPlatformTable] = [] - # Look for output variable - output_variable: Optional[str] = _get_output_variable(self.parse_tree) - if output_variable is None: - self.reporter.report_warning( - f"{self.table.full_name}-output-variable", - "output-variable not found in table expression", - ) - return data_platform_tables - - full_table_name: Optional[str] = self.get_full_table_name(output_variable) - if full_table_name is None: - LOGGER.debug( - "Fail to form full_table_name for PowerBI DataSet table %s", - self.table.full_name, - ) - return data_platform_tables - - return [ - DataPlatformTable( - name=full_table_name.split(".")[-1], - full_name=full_table_name, - platform_type=self.get_platform(), - ), - ] - - @abstractmethod - def get_platform(self) -> str: - pass - - @abstractmethod - def get_full_table_name(self, output_variable: str) -> Optional[str]: - pass - - -class DefaultTwoStepDataAccessSources(BaseMQueryResolver, ABC): - """ - These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern - let - Source = Sql.Database("localhost", "library"), - dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] - in - dbo_book_issue - """ - - def get_full_table_name(self, output_variable: str) -> Optional[str]: - variable_statement: Optional[Tree] = _get_variable_statement( - self.parse_tree, output_variable - ) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None - source, tokens = self.get_item_selector_tokens(cast(Tree, 
variable_statement)) - if source is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Schema detail not found in table expression", - ) - return None - - schema_name: str = tokens["Schema"] - table_name: str = tokens["Item"] - # Look for database-name - variable_statement = _get_variable_statement(self.parse_tree, source) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-statement", - f"source variable {source} statement not found in table expression", - ) - return None - arg_list = self.get_argument_list(cast(Tree, variable_statement)) - if arg_list is None or len(arg_list) < 1: - self.reporter.report_warning( - f"{self.table.full_name}-database-arg-list", - "Expected number of argument not found in data-access function of table expression", - ) - return None - - database_name: str = cast(List[str], arg_list)[1] # 1st token is database name - return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") - - -class PostgresMQueryResolver(DefaultTwoStepDataAccessSources): - def get_platform(self) -> str: - return SupportedDataPlatform.POSTGRES_SQL.value - - -class MSSqlMQueryResolver(DefaultTwoStepDataAccessSources): - def get_platform(self) -> str: - return SupportedDataPlatform.MS_SQL.value - - -class OracleMQueryResolver(BaseMQueryResolver): - def get_platform(self) -> str: - return SupportedDataPlatform.ORACLE.value - - def _get_db_name(self, value: str) -> Optional[str]: - error_message: str = f"The target argument ({value}) should in the format of :/[.]" - splitter_result: List[str] = value.split("/") - if len(splitter_result) != 2: - self.reporter.report_warning( - f"{self.table.full_name}-oracle-target", error_message - ) - return None - - db_name = splitter_result[1].split(".")[0] - - return db_name - - def get_full_table_name(self, output_variable: str) -> Optional[str]: - # Find step for the output variable - variable_statement: Optional[Tree] = _get_variable_statement( - self.parse_tree, output_variable - ) - - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None - - schema_variable, tokens = self.get_item_selector_tokens( - cast(Tree, variable_statement) - ) - if schema_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "table name not found in table expression", - ) - return None - - table_name: str = tokens["Name"] - - # Find step for the schema variable - variable_statement = _get_variable_statement( - self.parse_tree, cast(str, schema_variable) - ) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-schema-variable-statement", - f"schema variable ({schema_variable}) statement not found in table expression", - ) - return None - - source_variable, tokens = self.get_item_selector_tokens(variable_statement) - if source_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Schema not found in table expression", - ) - return None - - schema_name: str = tokens["Schema"] - - # Find step for the database access variable - variable_statement = _get_variable_statement(self.parse_tree, source_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-variable-statement", - f"schema variable 
({source_variable}) statement not found in table expression", - ) - return None - arg_list = self.get_argument_list(variable_statement) - if arg_list is None or len(arg_list) < 1: - self.reporter.report_warning( - f"{self.table.full_name}-database-arg-list", - "Expected number of argument not found in data-access function of table expression", - ) - return None - # The first argument has database name. format localhost:1521/salesdb.GSLAB.COM - db_name: Optional[str] = self._get_db_name(arg_list[0]) - if db_name is None: - LOGGER.debug(f"Fail to extract db name from the target {arg_list}") - - return f"{db_name}.{schema_name}.{table_name}" - - -class SnowflakeMQueryResolver(BaseMQueryResolver): - def get_platform(self) -> str: - return SupportedDataPlatform.SNOWFLAKE.value - - def get_full_table_name(self, output_variable: str) -> Optional[str]: - # Find step for the output variable - variable_statement: Optional[Tree] = _get_variable_statement( - self.parse_tree, output_variable - ) - - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None - - schema_variable, tokens = self.get_item_selector_tokens(variable_statement) - if schema_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "table name not found in table expression", - ) - return None - - table_name: str = tokens["Name"] - - # Find step for the schema variable - variable_statement = _get_variable_statement(self.parse_tree, schema_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-schema-variable-statement", - f"schema variable ({schema_variable}) statement not found in table expression", - ) - return None - - source_variable, tokens = self.get_item_selector_tokens(variable_statement) - if source_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "schema name not found in table expression", - ) - return None - - schema_name: str = tokens["Name"] - - # Find step for the database access variable - variable_statement = _get_variable_statement(self.parse_tree, source_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-variable-statement", - f"schema variable ({source_variable}) statement not found in table expression", - ) - return None - _, tokens = self.get_item_selector_tokens(variable_statement) - if tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "database name not found in table expression", - ) - return None - - db_name: str = tokens["Name"] - - return f"{db_name}.{schema_name}.{table_name}" - - -def _get_resolver(parse_tree: Tree) -> Optional[Type["BaseMQueryResolver"]]: - - _filter: Any = parse_tree.find_data("invoke_expression") - - letter_tree: Tree = next(_filter).children[0] - data_access_func: str = _make_function_name(letter_tree) - - LOGGER.debug( - "Looking for data-access(%s) resolver in data-access-function registry %s", - data_access_func, - DATA_ACCESS_RESOLVER, - ) - - if DATA_ACCESS_RESOLVER.get(data_access_func) is None: - LOGGER.info("Resolver not found for %s", data_access_func) - return None - - return DATA_ACCESS_RESOLVER[data_access_func] - - -# Register M-Query resolver for specific database platform -DATA_ACCESS_RESOLVER = { - 
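For the default two-step pattern shown in the DefaultTwoStepDataAccessSources docstring above, each resolver registered here ultimately reduces the expression to <database>.<schema>.<item>. The snippet below is only a rough standalone sketch of that mapping, using a regex on the example expression; the actual resolvers walk the Lark parse tree instead.

import re

# Default M-Query generated by PowerBI Desktop for a two-step data access
# (same shape as the DefaultTwoStepDataAccessSources docstring example).
m_query = (
    'let\n'
    '    Source = Sql.Database("localhost", "library"),\n'
    '    dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]\n'
    'in\n'
    '    dbo_book_issue'
)

# The database name is the second argument of the data-access function ...
db_name = re.search(r'Sql\.Database\("[^"]*",\s*"([^"]+)"\)', m_query).group(1)
# ... and the schema/table come from the item-selector key/value pairs.
schema_name, table_name = re.search(
    r'\[Schema="([^"]+)",\s*Item="([^"]+)"\]', m_query
).groups()

print(f"{db_name}.{schema_name}.{table_name}")  # library.dbo.book_issue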
f"{SupportedDataPlatform.POSTGRES_SQL.value}.Database": PostgresMQueryResolver, - f"{SupportedDataPlatform.ORACLE.value}.Database": OracleMQueryResolver, - f"{SupportedDataPlatform.SNOWFLAKE.value}.Databases": SnowflakeMQueryResolver, - f"{SupportedDataPlatform.MS_SQL.value}.Database": MSSqlMQueryResolver, -} # type :ignore - - -def _parse_expression(expression: str) -> Tree: - # Read lexical grammar as text - grammar: str = pkg_resource.read_text( - "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" - ) - - # Create lark parser for the grammar text - lark_parser = Lark(grammar, start="let_expression", regex=True) - - parse_tree: Tree = lark_parser.parse(expression) - - LOGGER.debug("Parse Tree") - if ( - LOGGER.level == logging.DEBUG - ): # Guard condition to avoid heavy pretty() function call - LOGGER.debug(parse_tree.pretty()) - - return parse_tree - - -def _validate_parse_tree(supported_funcs: List[str], tree: Tree) -> Tuple[bool, str]: - """ - :param tree: tree to validate as per functions supported by m_parser module - :return: first argument is False if validation is failed and second argument would contain the error message. - in-case of valid tree the first argument is True and second argument would be None. - """ - _filter: List[Tree] = tree.find_data("invoke_expression") - - valid: bool = False - message: Optional[str] = None - - for node in _filter: - primary_expression_node: Optional[Tree] = _get_first_rule(node, "primary_expression") - if primary_expression_node is None: - continue - identifier_node: Optional[Tree] = _get_first_rule(primary_expression_node, "identifier") - if identifier_node is None: - continue - - function_name: str = _make_function_name(identifier_node) - # This function should be in our supported function list - if function_name not in supported_funcs: - return False, f"function {function_name} is not supported" - - -def get_upstream_tables( - table: PowerBiAPI.Table, reporter: PowerBiDashboardSourceReport -) -> List[DataPlatformTable]: - if table.expression is None: - reporter.report_warning(table.full_name, "Expression is none") - return [] - - try: - parse_tree: Tree = _parse_expression(table.expression) - _validate_parse_tree([], parse_tree) - exit() - except lark.exceptions.UnexpectedCharacters as e: - LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) - reporter.report_warning( - table.full_name, f"UnSupported expression = {table.expression}" - ) - return [] - - trees: List[Tree] = list(parse_tree.find_data("invoke_expression")) - if len(trees) > 1: - reporter.report_warning( - table.full_name, f"{table.full_name} has more than one invoke expression" - ) - return [] - - resolver: Optional[Type[BaseMQueryResolver]] = _get_resolver(parse_tree) - if resolver is None: - LOGGER.debug("Table full-name = %s", table.full_name) - LOGGER.debug("Expression = %s", table.expression) - reporter.report_warning( - table.full_name, - f"{table.full_name} M-Query resolver not found for the table expression", - ) - return [] - - return resolver( - table, parse_tree, reporter - ).resolve_to_data_platform_table_list() # type: ignore diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 341a9a215dfcb..2b442f1394037 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -60,19 +60,8 @@ def get_upstream_tables( 
) return [] - resolver_enum: Optional[resolver.SupportedDataPlatform] = resolver.get_resolver(parse_tree) - if resolver_enum is None: - LOGGER.debug("Table full-name = %s", table.full_name) - LOGGER.debug("Expression = %s", table.expression) - reporter.report_warning( - table.full_name, - f"{table.full_name} M-Query resolver not found for the table expression", - ) - return [] - - return resolver_enum.get_m_query_resolver()( + return resolver.BaseMQueryResolver( table=table, parse_tree=parse_tree, - data_platform_pair=resolver_enum.get_data_platform_pair(), reporter=reporter, ).resolve_to_data_platform_table_list() # type: ignore diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index bb20b191689b0..50a5e488d32e0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -28,27 +28,27 @@ class DataPlatformTable: data_platform_pair: DataPlatformPair -class AbstractMQueryResolver(ABC): - pass +class FullTableNameCreator(ABC): + @abstractmethod + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + pass -class AbstractDataAccessMQueryResolver(AbstractMQueryResolver, ABC): +class AbstractDataAccessMQueryResolver(ABC): table: PowerBiAPI.Table parse_tree: Tree reporter: PowerBiDashboardSourceReport - data_platform_pair: DataPlatformPair def __init__( self, table: PowerBiAPI.Table, parse_tree: Tree, - data_platform_pair: DataPlatformPair, reporter: PowerBiDashboardSourceReport, ): self.table = table self.parse_tree = parse_tree self.reporter = reporter - self.data_platform_pair = data_platform_pair + self.specific_resolver = {} @abstractmethod def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: @@ -56,25 +56,21 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): + @staticmethod def get_item_selector_tokens( - self, variable_statement: Tree + expression_tree: Tree ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: - expression_tree: Optional[Tree] = tree_function.first_expression_func(variable_statement) - if expression_tree is None: - LOGGER.debug("Expression tree not found") - LOGGER.debug(variable_statement.pretty()) - return None, None item_selector: Optional[Tree] = tree_function.first_item_selector_func(expression_tree) if item_selector is None: LOGGER.debug("Item Selector not found in tree") - LOGGER.debug(variable_statement.pretty()) + LOGGER.debug(expression_tree.pretty()) return None, None identifier_tree: Optional[Tree] = tree_function.first_identifier_func(expression_tree) if identifier_tree is None: LOGGER.debug("Identifier not found in tree") - LOGGER.debug(variable_statement.pretty()) + LOGGER.debug(item_selector.pretty()) return None, None # remove whitespaces and quotes from token @@ -90,7 +86,7 @@ def get_item_selector_tokens( # cast to satisfy lint return identifier[0], dict(zip(iterator, iterator)) - def get_argument_list(self, variable_statement: Tree) -> Optional[List[str]]: + def get_argument_list(self, variable_statement: Tree) -> Optional[Tree]: expression_tree: Optional[Tree] = tree_function.first_expression_func(variable_statement) if expression_tree is None: LOGGER.debug("First expression rule not found in input tree") @@ -101,15 +97,104 @@ def get_argument_list(self, variable_statement: Tree) -> 
Optional[List[str]]: LOGGER.debug("First argument-list rule not found in input tree") return None - # remove whitespaces and quotes from token - tokens: List[str] = tree_function.strip_char_from_list( - tree_function.remove_whitespaces_from_list(tree_function.token_values(argument_list)), '"' - ) - return tokens + return argument_list + + def make_token_dict(self, identifier: str) -> Dict[str, Any]: + token_dict: Dict[str, Any] = {} + + def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_dict: Dict[str, Any]) -> None: + """ + 1) Find statement where identifier appear in the left-hand side i.e. identifier = expression + 2) Check expression is function invocation i.e. invoke_expression or item_selector + 3) if it is function invocation and this function is not the data-access function then take first argument + i.e. identifier and call the function recursively + 4) if it is item_selector then take identifier and key-value pair, + add identifier and key-value pair in current_selector and call the function recursively + 5) This recursion will continue till we reach to data-access function and during recursion we will fill + token_dict dictionary for all item_selector we find during traversal. + + :param identifier: variable to look for + :param supported_data_access_func: List of supported data-access functions + :param t_dict: dict where key is identifier and value is key-value pair which represent item selected from + identifier + :return: None + """ + v_statement: Optional[Tree] = tree_function.get_variable_statement( + self.parse_tree, identifier + ) + if v_statement is None: + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"output variable ({identifier}) statement not found in table expression", + ) + return None + + expression_tree: Optional[Tree] = tree_function.first_expression_func(v_statement) + if expression_tree is None: + LOGGER.debug("Expression tree not found") + LOGGER.debug(v_statement.pretty()) + return None + invoke_expression: Optional[Tree] = tree_function.first_invoke_expression_func(expression_tree) + if invoke_expression is not None: + letter_tree: Tree = invoke_expression.children[0] + data_access_func: str = tree_function.make_function_name(letter_tree) + if data_access_func in supported_data_access_func: + token_dict.update( + { + f"{data_access_func}": { + "arg_list": self.get_argument_list(expression_tree), + **t_dict, + } + } + ) + return + + first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(invoke_expression) + if first_arg_tree is None: + LOGGER.debug("Function invocation without argument in expression = %s", invoke_expression.pretty()) + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"Function invocation without argument", + ) + return None + type_expression: Optional[Tree] = tree_function.first_type_expression_func(first_arg_tree) + if type_expression is None: + LOGGER.debug("Type expression not found in expression = %s", first_arg_tree.pretty()) + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"Type expression not found", + ) + return None + + tokens: List[str] = tree_function.token_values(type_expression) + if len(tokens) != 1: + LOGGER.debug("type-expression has more than one identifier = %s", type_expression.pretty()) + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + f"Unsupported type expression", + ) + return None + new_identifier: str = tokens[0] + fill_token_dict(new_identifier, 
supported_data_access_func, t_dict) + else: + identifier, key_vs_value = self.get_item_selector_tokens( + tree_function.first_expression_func(expression_tree) + ) + current_selector: Dict[str, Any] = { + f"{identifier}": { + "item_selectors": [key_vs_value], + **t_dict, + } + } + fill_token_dict(identifier, supported_data_access_func, current_selector) + + fill_token_dict(identifier, SupportedResolver.get_function_names(), {}) + + return token_dict def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] - # Look for output variable + output_variable: Optional[str] = tree_function.get_output_variable(self.parse_tree) if output_variable is None: self.reporter.report_warning( @@ -118,28 +203,33 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables - full_table_name: Optional[str] = self.get_full_table_name(output_variable) - if full_table_name is None: - LOGGER.debug( - "Fail to form full_table_name for PowerBI DataSet table %s", - self.table.full_name, - ) - return data_platform_tables - - return [ - DataPlatformTable( - name=full_table_name.split(".")[-1], - full_name=full_table_name, - data_platform_pair=self.data_platform_pair - ), - ] - - @abstractmethod - def get_full_table_name(self, output_variable: str) -> Optional[str]: - pass - - -class DefaultTwoStepDataAccessSources(BaseMQueryResolver, ABC): + token_dict: Dict[str, Any] = self.make_token_dict(output_variable) + + # each key is data-access function + for data_access_func in token_dict.keys(): + supported_resolver = SupportedResolver.get_resolver(data_access_func) + if supported_resolver is None: + LOGGER.debug("Resolver not found for the data-access-function %s", data_access_func) + self.reporter.report_warning( + f"{self.table.full_name}-data-access-function", + f"Resolver not found for data-access-function = {data_access_func}" + ) + continue + + table_full_name_creator: FullTableNameCreator = supported_resolver.get_table_full_name_creator()() + for table_full_name in table_full_name_creator.get_full_table_names(token_dict): + data_platform_tables.append( + DataPlatformTable( + name=table_full_name.split(".")[-1], + full_name=table_full_name, + data_platform_pair=supported_resolver.get_data_platform_pair() + ) + ) + + return data_platform_tables + + +class DefaultTwoStepDataAccessSources(FullTableNameCreator): """ These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern let @@ -149,7 +239,7 @@ class DefaultTwoStepDataAccessSources(BaseMQueryResolver, ABC): dbo_book_issue """ - def get_full_table_name(self, output_variable: str) -> Optional[str]: + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: variable_statement: Optional[Tree] = tree_function.get_variable_statement( self.parse_tree, output_variable ) @@ -189,15 +279,15 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") -class PostgresMQueryResolver(DefaultTwoStepDataAccessSources): +class PostgresFullTableNameCreator(DefaultTwoStepDataAccessSources): pass -class MSSqlMQueryResolver(DefaultTwoStepDataAccessSources): +class MSSqlFullTableNameCreator(DefaultTwoStepDataAccessSources): pass -class OracleMQueryResolver(BaseMQueryResolver): +class OracleFullTableNameCreator(FullTableNameCreator): def _get_db_name(self, value: str) -> Optional[str]: error_message: str = f"The target argument ({value}) 
should in the format of :/[.]" @@ -212,7 +302,7 @@ def _get_db_name(self, value: str) -> Optional[str]: return db_name - def get_full_table_name(self, output_variable: str) -> Optional[str]: + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: # Find step for the output variable variable_statement: Optional[Tree] = tree_function.get_variable_statement( self.parse_tree, output_variable @@ -281,9 +371,9 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: return f"{db_name}.{schema_name}.{table_name}" -class SnowflakeMQueryResolver(BaseMQueryResolver): +class SnowflakeFullTableNameCreator(FullTableNameCreator): - def get_full_table_name(self, output_variable: str) -> Optional[str]: + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: # Find step for the output variable variable_statement: Optional[Tree] = tree_function.get_variable_statement( self.parse_tree, output_variable @@ -346,6 +436,12 @@ def get_full_table_name(self, output_variable: str) -> Optional[str]: return f"{db_name}.{schema_name}.{table_name}" +class NativeQueryFullTableNameCreator(FullTableNameCreator): + + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + pass + + class FunctionName(Enum): NATIVE_QUERY = "Value.NativeQuery" POSTGRESQL_DATA_ACCESS = "PostgreSQL.Database" @@ -354,66 +450,72 @@ class FunctionName(Enum): MSSQL_DATA_ACCESS = "Sql.Database" -class SupportedDataPlatform(Enum): +class SupportedResolver(Enum): POSTGRES_SQL = ( DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" ), - PostgresMQueryResolver, + PostgresFullTableNameCreator, FunctionName.POSTGRESQL_DATA_ACCESS, ) + ORACLE = ( DataPlatformPair( powerbi_data_platform_name="Oracle", datahub_data_platform_name="oracle" ), - OracleMQueryResolver, + OracleFullTableNameCreator, FunctionName.ORACLE_DATA_ACCESS, ) + SNOWFLAKE = ( DataPlatformPair( powerbi_data_platform_name="Snowflake", datahub_data_platform_name="snowflake" ), - SnowflakeMQueryResolver, + SnowflakeFullTableNameCreator, FunctionName.SNOWFLAKE_DATA_ACCESS, ) + MS_SQL = ( DataPlatformPair( powerbi_data_platform_name="Sql", datahub_data_platform_name="mssql" ), - MSSqlMQueryResolver, + MSSqlFullTableNameCreator, FunctionName.MSSQL_DATA_ACCESS, ) + NATIVE_QUERY = ( + None, + NativeQueryFullTableNameCreator, + FunctionName.NATIVE_QUERY, + ) + def get_data_platform_pair(self) -> DataPlatformPair: return self.value[0] - def get_m_query_resolver(self) -> Type[BaseMQueryResolver]: + def get_table_full_name_creator(self) -> Type[FullTableNameCreator]: return self.value[1] - def get_function_name(self) -> FunctionName: - return self.value[2] - - -def get_resolver(parse_tree: Tree) -> Optional[SupportedDataPlatform]: + def get_function_name(self) -> str: + return self.value[2].value - _filter: Any = parse_tree.find_data("invoke_expression") - - letter_tree: Tree = next(_filter).children[0] - data_access_func: str = tree_function.make_function_name(letter_tree) - - LOGGER.debug( - "Looking for data-access(%s) resolver", - data_access_func, - ) + @staticmethod + def get_function_names() -> List[str]: + functions: List[str] = [] + for supported_resolver in SupportedResolver: + functions.append( + supported_resolver.get_function_name() + ) - for platform in SupportedDataPlatform: - if platform.get_function_name().value == data_access_func: - return platform + return functions - LOGGER.info("M-Query resolver not found for data access function %s", data_access_func) + 
@staticmethod + def get_resolver(function_name: str) -> Optional["SupportedResolver"]: + for supported_resolver in SupportedResolver: + if function_name == supported_resolver.get_function_name(): + return supported_resolver - return None + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py index 91c9550903bd8..66922e9e11e73 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -32,7 +32,7 @@ def get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: LOGGER.debug("Actual Value = %s", actual_value) LOGGER.debug("Expected Value = %s", variable) - if actual_value == variable: + if actual_value.lower() == variable.lower(): return tree LOGGER.info("Provided variable(%s) not found in variable rule", variable) @@ -141,3 +141,7 @@ def get_all_function_name(tree: Tree) -> List[str]: first_identifier_func = partial(get_first_rule, rule="identifier") first_primary_expression_func = partial(get_first_rule, rule="primary_expression") first_identifier_func = partial(get_first_rule, rule="identifier") +first_invoke_expression_func = partial(get_first_rule, rule="invoke_expression") +first_type_expression_func = partial(get_first_rule, rule="type_expression") + + diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 7a51006e1e46f..2709a7db0e304 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -33,177 +33,192 @@ ] -def test_parse_m_query1(): - expression: str = M_QUERIES[0] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" - - -def test_parse_m_query2(): - expression: str = M_QUERIES[1] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' - - -def test_parse_m_query3(): - expression: str = M_QUERIES[2] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' - - -def test_parse_m_query4(): - expression: str = M_QUERIES[3] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' - - -def test_parse_m_query5(): - expression: str = M_QUERIES[4] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' - - -def test_parse_m_query6(): - expression: str = M_QUERIES[5] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' - - -def test_parse_m_query7(): - expression: str = M_QUERIES[6] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "Source" - - -def test_parse_m_query8(): - expression: str = M_QUERIES[7] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query9(): - expression: str = M_QUERIES[8] - parse_tree: Tree = parser._parse_expression(expression) - assert 
tree_function.get_output_variable(parse_tree) == '"Added Custom1"' - - -def test_parse_m_query10(): - expression: str = M_QUERIES[9] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' - - -def test_parse_m_query11(): - expression: str = M_QUERIES[10] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "Source" - - -def test_parse_m_query12(): - expression: str = M_QUERIES[11] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' - - -def test_parse_m_query13(): - expression: str = M_QUERIES[12] - parse_tree: Tree = parser._parse_expression(expression) - assert tree_function.get_output_variable(parse_tree) == "two_source_table" - - -def test_snowflake_regular_case(): - q: str = M_QUERIES[0] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.get_data_platform_pair().powerbi_data_platform_name - ) - - -def test_postgres_regular_case(): - q: str = M_QUERIES[13] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.get_data_platform_pair().powerbi_data_platform_name - ) - - -def test_oracle_regular_case(): - q: str = M_QUERIES[14] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.get_data_platform_pair().powerbi_data_platform_name - ) - - -def test_mssql_regular_case(): - q: str = M_QUERIES[15] - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=q, - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - 
== SupportedDataPlatform.MS_SQL.get_data_platform_pair().powerbi_data_platform_name - ) - +# def test_parse_m_query1(): +# expression: str = M_QUERIES[0] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" +# +# +# def test_parse_m_query2(): +# expression: str = M_QUERIES[1] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' +# +# +# def test_parse_m_query3(): +# expression: str = M_QUERIES[2] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' +# +# +# def test_parse_m_query4(): +# expression: str = M_QUERIES[3] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' +# +# +# def test_parse_m_query5(): +# expression: str = M_QUERIES[4] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' +# +# +# def test_parse_m_query6(): +# expression: str = M_QUERIES[5] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query7(): +# expression: str = M_QUERIES[6] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query8(): +# expression: str = M_QUERIES[7] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query9(): +# expression: str = M_QUERIES[8] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' +# +# +# def test_parse_m_query10(): +# expression: str = M_QUERIES[9] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' +# +# +# def test_parse_m_query11(): +# expression: str = M_QUERIES[10] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == "Source" +# +# +# def test_parse_m_query12(): +# expression: str = M_QUERIES[11] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' +# +# +# def test_parse_m_query13(): +# expression: str = M_QUERIES[12] +# parse_tree: Tree = parser._parse_expression(expression) +# assert tree_function.get_output_variable(parse_tree) == "two_source_table" +# +# +# def test_snowflake_regular_case(): +# q: str = M_QUERIES[0] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# +# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "TESTTABLE" +# assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" +# assert ( +# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name +# == SupportedDataPlatform.SNOWFLAKE.get_data_platform_pair().powerbi_data_platform_name +# ) +# +# +# def test_postgres_regular_case(): +# q: str = M_QUERIES[13] 
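For the default two-step Sql.Database pattern, the nested token_dict built by make_token_dict (resolver.py hunk above) has roughly the shape below. The literal is illustrative only: the real "arg_list" value is a Lark Tree of the call arguments, not a list of strings.

token_dict = {
    "Sql.Database": {                          # data-access function that was reached
        "arg_list": ["localhost", "library"],  # actually a lark Tree, shown as strings here
        "Source": {                            # variable the data-access call is assigned to
            "item_selectors": [
                {
                    "items": {"Schema": "dbo", "Item": "book_issue"},
                    "assigned_to": "dbo_book_issue",  # output variable selecting the item
                },
            ],
        },
    },
}

two_level_access_pattern (further below) then reads the database name from arg_list[1] and the Schema/Item pair from each item selector to produce library.dbo.book_issue.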
+# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "order_date" +# assert data_platform_tables[0].full_name == "mics.public.order_date" +# assert ( +# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name +# == SupportedDataPlatform.POSTGRES_SQL.get_data_platform_pair().powerbi_data_platform_name +# ) +# +# +# def test_oracle_regular_case(): +# q: str = M_QUERIES[14] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "EMPLOYEES" +# assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" +# assert ( +# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name +# == SupportedDataPlatform.ORACLE.get_data_platform_pair().powerbi_data_platform_name +# ) +# +# +# def test_mssql_regular_case(): +# q: str = M_QUERIES[15] +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=q, +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# +# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( +# table, reporter +# ) +# +# assert len(data_platform_tables) == 1 +# assert data_platform_tables[0].name == "book_issue" +# assert data_platform_tables[0].full_name == "library.dbo.book_issue" +# assert ( +# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name +# == SupportedDataPlatform.MS_SQL.get_data_platform_pair().powerbi_data_platform_name +# ) +# +# +# def test_native_query_disabled(): +# table: PowerBiAPI.Table = PowerBiAPI.Table( +# expression=M_QUERIES[1], +# name="virtual_order_table", +# full_name="OrderDataSet.virtual_order_table", +# ) +# +# reporter = PowerBiDashboardSourceReport() +# +# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( +# table, reporter, native_query_enabled=False +# ) +# +# assert len(data_platform_tables) == 0 def test_native_query_disabled(): table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=M_QUERIES[1], + expression=M_QUERIES[9], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) @@ -211,7 +226,7 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, reporter ) assert len(data_platform_tables) == 0 From eb3eda5d3fb27d0b199c50fcc4e8c8708d89e6dd Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 19 Dec 2022 13:52:32 +0530 Subject: [PATCH 26/53] native query in MS-SQL --- .../powerbi/m_query/native_sql_parser.py | 29 ++ .../source/powerbi/m_query/resolver.py | 368 ++++++++---------- .../source/powerbi/m_query/validator.py | 7 +- .../integration/powerbi/test_m_parser.py | 233 ++++++----- 4 files changed, 340 insertions(+), 297 deletions(-) create mode 100644 
metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py new file mode 100644 index 0000000000000..f0ee706c1865b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -0,0 +1,29 @@ +import sqlparse +from typing import List + + +def get_tables(native_query: str) -> List[str]: + # As per current use-case, we are extracting only single table from "from" + tables: List[str] = [] + parsed = sqlparse.parse(native_query)[0] + + tokens: List[sqlparse.sql.Token] = list(parsed.tokens) + length: int = len(tokens) + from_index: int = -1 + for index, token in enumerate(tokens): + if token.value.lower().strip() == "from" and str(token.ttype) == "Token.Keyword": + from_index = index+1 + break + + table_name = None + + while from_index < length: + if isinstance(tokens[from_index], sqlparse.sql.Identifier): + table_name = tokens[from_index].value + break + from_index = from_index + 1 + + if table_name is not None: + tables.append(table_name) + + return tables diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 50a5e488d32e0..bf78b357073f6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -10,7 +10,7 @@ from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -from datahub.ingestion.source.powerbi.m_query import tree_function +from datahub.ingestion.source.powerbi.m_query import tree_function, native_sql_parser LOGGER = logging.getLogger(__name__) @@ -28,11 +28,37 @@ class DataPlatformTable: data_platform_pair: DataPlatformPair -class FullTableNameCreator(ABC): +class SupportedDataPlatform(Enum): + POSTGRES_SQL = DataPlatformPair( + powerbi_data_platform_name="PostgreSQL", + datahub_data_platform_name="postgres" + ) + + ORACLE = DataPlatformPair( + powerbi_data_platform_name="Oracle", + datahub_data_platform_name="oracle" + ) + + SNOWFLAKE = DataPlatformPair( + powerbi_data_platform_name="Snowflake", + datahub_data_platform_name="snowflake" + ) + + MS_SQL = DataPlatformPair( + powerbi_data_platform_name="Sql", + datahub_data_platform_name="mssql" + ) + + +class AbstractTableFullNameCreator(ABC): @abstractmethod def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: pass + @abstractmethod + def get_platform_pair(self) -> DataPlatformPair: + pass + class AbstractDataAccessMQueryResolver(ABC): table: PowerBiAPI.Table @@ -177,16 +203,21 @@ def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_di new_identifier: str = tokens[0] fill_token_dict(new_identifier, supported_data_access_func, t_dict) else: - identifier, key_vs_value = self.get_item_selector_tokens( + new_identifier, key_vs_value = self.get_item_selector_tokens( tree_function.first_expression_func(expression_tree) ) current_selector: Dict[str, Any] = { - f"{identifier}": { - "item_selectors": [key_vs_value], + f"{new_identifier}": { + "item_selectors": [ + { + "items": key_vs_value, + "assigned_to": identifier + } + ], **t_dict, } } - fill_token_dict(identifier, supported_data_access_func, current_selector) + 
fill_token_dict(new_identifier, supported_data_access_func, current_selector) fill_token_dict(identifier, SupportedResolver.get_function_names(), {}) @@ -216,20 +247,20 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue - table_full_name_creator: FullTableNameCreator = supported_resolver.get_table_full_name_creator()() + table_full_name_creator: AbstractTableFullNameCreator = supported_resolver.get_table_full_name_creator()() for table_full_name in table_full_name_creator.get_full_table_names(token_dict): data_platform_tables.append( DataPlatformTable( name=table_full_name.split(".")[-1], full_name=table_full_name, - data_platform_pair=supported_resolver.get_data_platform_pair() + data_platform_pair=table_full_name_creator.get_platform_pair() ) ) return data_platform_tables -class DefaultTwoStepDataAccessSources(FullTableNameCreator): +class DefaultTwoStepDataAccessSources(AbstractTableFullNameCreator, ABC): """ These are the DataSource for which PowerBI Desktop generates default M-Query of following pattern let @@ -239,55 +270,92 @@ class DefaultTwoStepDataAccessSources(FullTableNameCreator): dbo_book_issue """ - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - variable_statement: Optional[Tree] = tree_function.get_variable_statement( - self.parse_tree, output_variable - ) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None - source, tokens = self.get_item_selector_tokens(cast(Tree, variable_statement)) - if source is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Schema detail not found in table expression", - ) - return None + def two_level_access_pattern(self, token_dict: Dict[str, Any]) -> List[str]: + full_table_names: List[str] = [] - schema_name: str = tokens["Schema"] - table_name: str = tokens["Item"] - # Look for database-name - variable_statement = tree_function.get_variable_statement(self.parse_tree, source) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-statement", - f"source variable {source} statement not found in table expression", - ) - return None - arg_list = self.get_argument_list(cast(Tree, variable_statement)) - if arg_list is None or len(arg_list) < 1: - self.reporter.report_warning( - f"{self.table.full_name}-database-arg-list", - "Expected number of argument not found in data-access function of table expression", + LOGGER.debug("Processing PostgreSQL token-dict %s", token_dict) + + for data_access_function in token_dict: + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(token_dict[data_access_function]["arg_list"]) + ), + char="\"" ) - return None + # delete arg_list as we consumed it and don't want to process it in next step + if len(arguments) != 2: + LOGGER.debug("Expected 2 arguments, but got {%s}", len(arguments)) + return full_table_names + + del token_dict[data_access_function]["arg_list"] + + db_name: str = arguments[1] + for source in token_dict[data_access_function]: + source_dict: Dict[str, Any] = token_dict[data_access_function][source] + for schema in source_dict["item_selectors"]: + schema_name: str = schema["items"]["Schema"] + table_name: str = schema["items"]["Item"] + full_table_names.append( + 
f"{db_name}.{schema_name}.{table_name}" + ) - database_name: str = cast(List[str], arg_list)[1] # 1st token is database name - return cast(Optional[str], f"{database_name}.{schema_name}.{table_name}") + LOGGER.debug("PostgreSQL full-table-names = %s", full_table_names) + return full_table_names -class PostgresFullTableNameCreator(DefaultTwoStepDataAccessSources): - pass +class PostgresTableFullNameCreator(DefaultTwoStepDataAccessSources): + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + return self.two_level_access_pattern(token_dict) -class MSSqlFullTableNameCreator(DefaultTwoStepDataAccessSources): - pass + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.POSTGRES_SQL.value -class OracleFullTableNameCreator(FullTableNameCreator): +class MSSqlTableFullNameCreator(DefaultTwoStepDataAccessSources): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.MS_SQL.value + + def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + full_table_names: List[str] = [] + data_access_dict: Dict[str, Any] = list(token_dict.values())[0] + + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_dict["arg_list"]) + ), + char="\"" + ) + + if len(arguments) == 2: + # It is regular case of MS-SQL + LOGGER.debug("Handling with regular case") + return self.two_level_access_pattern(token_dict) + + if len(arguments) >= 4 and arguments[2] != "Query": + LOGGER.debug("Unsupported case is found. Second index is not the Query") + return full_table_names + + db_name: str = arguments[1] + tables: List[str] = native_sql_parser.get_tables(arguments[3]) + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
Default schema name in MS-SQL is dbo + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + schema_and_table.insert(0, "dbo") + + full_table_names.append( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + LOGGER.debug("MS-SQL full-table-names %s", full_table_names) + + return full_table_names + + +class OracleTableFullNameCreator(AbstractTableFullNameCreator): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.ORACLE.value def _get_db_name(self, value: str) -> Optional[str]: error_message: str = f"The target argument ({value}) should in the format of :/[.]" @@ -303,143 +371,72 @@ def _get_db_name(self, value: str) -> Optional[str]: return db_name def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - # Find step for the output variable - variable_statement: Optional[Tree] = tree_function.get_variable_statement( - self.parse_tree, output_variable - ) - - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None - - schema_variable, tokens = self.get_item_selector_tokens( - cast(Tree, variable_statement) - ) - if schema_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "table name not found in table expression", - ) - return None + full_table_names: List[str] = [] - table_name: str = tokens["Name"] + LOGGER.debug("Processing Oracle token-dict %s", token_dict) - # Find step for the schema variable - variable_statement = tree_function.get_variable_statement( - self.parse_tree, cast(str, schema_variable) - ) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-schema-variable-statement", - f"schema variable ({schema_variable}) statement not found in table expression", - ) - return None + for data_access_function in token_dict: + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(token_dict[data_access_function]["arg_list"])) + # delete arg_list as we consumed it and don't want to process it in next step + del token_dict[data_access_function]["arg_list"] - source_variable, tokens = self.get_item_selector_tokens(variable_statement) - if source_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Schema not found in table expression", - ) - return None + for source in token_dict[data_access_function]: + source_dict: Dict[str, Any] = token_dict[data_access_function][source] - schema_name: str = tokens["Schema"] + db_name: Optional[str] = self._get_db_name(arguments[0]) + if db_name is None: + return full_table_names - # Find step for the database access variable - variable_statement = tree_function.get_variable_statement(self.parse_tree, source_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-variable-statement", - f"schema variable ({source_variable}) statement not found in table expression", - ) - return None - arg_list = self.get_argument_list(variable_statement) - if arg_list is None or len(arg_list) < 1: - self.reporter.report_warning( - f"{self.table.full_name}-database-arg-list", - "Expected number of argument not found in data-access function of table expression", - ) - return None - # The first argument 
has database name. format localhost:1521/salesdb.GSLAB.COM - db_name: Optional[str] = self._get_db_name(arg_list[0]) - if db_name is None: - LOGGER.debug(f"Fail to extract db name from the target {arg_list}") + for schema in source_dict["item_selectors"]: + schema_name: str = schema["items"]["Schema"] + for item_selectors in source_dict[schema["assigned_to"]]: + for item_selector in source_dict[schema["assigned_to"]][item_selectors]: + table_name: str = item_selector["items"]["Name"] + full_table_names.append( + f"{db_name}.{schema_name}.{table_name}" + ) - return f"{db_name}.{schema_name}.{table_name}" + return full_table_names -class SnowflakeFullTableNameCreator(FullTableNameCreator): +class SnowflakeTableFullNameCreator(AbstractTableFullNameCreator): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.SNOWFLAKE.value def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - # Find step for the output variable - variable_statement: Optional[Tree] = tree_function.get_variable_statement( - self.parse_tree, output_variable - ) + full_table_names: List[str] = [] - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - f"output variable ({output_variable}) statement not found in table expression", - ) - return None + LOGGER.debug("Processing Snowflake token-dict %s", token_dict) - schema_variable, tokens = self.get_item_selector_tokens(variable_statement) - if schema_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "table name not found in table expression", - ) - return None + data_access_dict: Dict[str, Any] = list(token_dict.values())[0] + del data_access_dict["arg_list"] - table_name: str = tokens["Name"] + for source in data_access_dict: + for db_its in data_access_dict[source]["item_selectors"]: + db_name: str = db_its["items"]["Name"] + for schema_its in data_access_dict[source][db_its["assigned_to"]]["item_selectors"]: + schema_name: str = schema_its["items"]["Name"] + for table_its in data_access_dict[source][db_its["assigned_to"]][schema_its["assigned_to"]]["item_selectors"]: + table_name: str = table_its["items"]["Name"] + full_table_names.append( + f"{db_name}.{schema_name}.{table_name}" + ) - # Find step for the schema variable - variable_statement = tree_function.get_variable_statement(self.parse_tree, schema_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-schema-variable-statement", - f"schema variable ({schema_variable}) statement not found in table expression", - ) - return None - - source_variable, tokens = self.get_item_selector_tokens(variable_statement) - if source_variable is None or tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "schema name not found in table expression", - ) - return None - - schema_name: str = tokens["Name"] - - # Find step for the database access variable - variable_statement = tree_function.get_variable_statement(self.parse_tree, source_variable) - if variable_statement is None: - self.reporter.report_warning( - f"{self.table.full_name}-source-variable-statement", - f"schema variable ({source_variable}) statement not found in table expression", - ) - return None - _, tokens = self.get_item_selector_tokens(variable_statement) - if tokens is None: - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "database name not found in table expression", - 
) - return None + LOGGER.debug("Snowflake full-table-name %s", full_table_names) - db_name: str = tokens["Name"] + return full_table_names - return f"{db_name}.{schema_name}.{table_name}" - -class NativeQueryFullTableNameCreator(FullTableNameCreator): +class NativeQueryTableFullNameCreator(AbstractTableFullNameCreator): + def get_platform_pair(self) -> DataPlatformPair: + return SupportedDataPlatform.POSTGRES_SQL.value def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - pass + print("===NATIVE========") + for source in token_dict: + print(tree_function.token_values(token_dict[source]["arg_list"])) + return [] class FunctionName(Enum): @@ -452,55 +449,35 @@ class FunctionName(Enum): class SupportedResolver(Enum): POSTGRES_SQL = ( - DataPlatformPair( - powerbi_data_platform_name="PostgreSQL", - datahub_data_platform_name="postgres" - ), - PostgresFullTableNameCreator, + PostgresTableFullNameCreator, FunctionName.POSTGRESQL_DATA_ACCESS, ) ORACLE = ( - DataPlatformPair( - powerbi_data_platform_name="Oracle", - datahub_data_platform_name="oracle" - ), - OracleFullTableNameCreator, + OracleTableFullNameCreator, FunctionName.ORACLE_DATA_ACCESS, ) SNOWFLAKE = ( - DataPlatformPair( - powerbi_data_platform_name="Snowflake", - datahub_data_platform_name="snowflake" - ), - SnowflakeFullTableNameCreator, + SnowflakeTableFullNameCreator, FunctionName.SNOWFLAKE_DATA_ACCESS, ) MS_SQL = ( - DataPlatformPair( - powerbi_data_platform_name="Sql", - datahub_data_platform_name="mssql" - ), - MSSqlFullTableNameCreator, + MSSqlTableFullNameCreator, FunctionName.MSSQL_DATA_ACCESS, ) NATIVE_QUERY = ( - None, - NativeQueryFullTableNameCreator, + NativeQueryTableFullNameCreator, FunctionName.NATIVE_QUERY, ) - def get_data_platform_pair(self) -> DataPlatformPair: + def get_table_full_name_creator(self) -> Type[AbstractTableFullNameCreator]: return self.value[0] - def get_table_full_name_creator(self) -> Type[FullTableNameCreator]: - return self.value[1] - def get_function_name(self) -> str: - return self.value[2].value + return self.value[1].value @staticmethod def get_function_names() -> List[str]: @@ -514,8 +491,9 @@ def get_function_names() -> List[str]: @staticmethod def get_resolver(function_name: str) -> Optional["SupportedResolver"]: + LOGGER.debug("Looking for resolver %s", function_name) for supported_resolver in SupportedResolver: if function_name == supported_resolver.get_function_name(): return supported_resolver - + LOGGER.debug("Looking not found for resolver %s", function_name) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index 3941e4ed38ed5..02edab6dac758 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -40,12 +40,7 @@ def validate_parse_tree(tree: Tree, native_query_enabled: bool = True) -> Tuple[ """ functions: List[str] = tree_function.get_all_function_name(tree) if len(functions) == 0: - return False, "Function call not found" - - data_access_function_names: List[str] = [x.get_function_name().value for x in resolver.SupportedDataPlatform] - result: Set[str] = set(data_access_function_names) & set(functions) - if len(result) != 1: - return False, f"More than one data-access functions are found in expression. 
Functions = {result}" + return False, "Function calls not found" if native_query_enabled is False: if resolver.FunctionName.NATIVE_QUERY.value in functions: diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 2709a7db0e304..d65bfd84774e5 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -9,6 +9,7 @@ from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.m_query.resolver import ( DataPlatformTable, + SupportedResolver, SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.proxy import PowerBiAPI @@ -111,77 +112,138 @@ # assert tree_function.get_output_variable(parse_tree) == "two_source_table" # # -# def test_snowflake_regular_case(): -# q: str = M_QUERIES[0] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# -# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "TESTTABLE" -# assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" -# assert ( -# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name -# == SupportedDataPlatform.SNOWFLAKE.get_data_platform_pair().powerbi_data_platform_name -# ) -# -# -# def test_postgres_regular_case(): -# q: str = M_QUERIES[13] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "order_date" -# assert data_platform_tables[0].full_name == "mics.public.order_date" -# assert ( -# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name -# == SupportedDataPlatform.POSTGRES_SQL.get_data_platform_pair().powerbi_data_platform_name -# ) -# -# -# def test_oracle_regular_case(): -# q: str = M_QUERIES[14] -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "EMPLOYEES" -# assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" -# assert ( -# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name -# == SupportedDataPlatform.ORACLE.get_data_platform_pair().powerbi_data_platform_name -# ) -# +def test_snowflake_regular_case(): + q: str = M_QUERIES[0] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "TESTTABLE" + assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" + assert ( + 
data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ) + + +def test_postgres_regular_case(): + q: str = M_QUERIES[13] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "order_date" + assert data_platform_tables[0].full_name == "mics.public.order_date" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + ) + + +def test_oracle_regular_case(): + q: str = M_QUERIES[14] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "EMPLOYEES" + assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + ) + + +def test_mssql_regular_case(): + q: str = M_QUERIES[15] + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=q, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == "book_issue" + assert data_platform_tables[0].full_name == "library.dbo.book_issue" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + ) + + +def test_mssql_with_query(): + mssql_queries: List[str] = [ + M_QUERIES[3], + M_QUERIES[4], + M_QUERIES[5], + M_QUERIES[7], + M_QUERIES[8], + M_QUERIES[11], + ] + expected_tables = [ + "COMMOPSDB.dbo.V_OIP_ENT_2022", + "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", + "COMMOPSDB.dbo.V_ARR_ADDS", + "COMMOPSDB.dbo.V_PS_CD_RETENTION", + "COMMOPSDB.dbo.V_TPV_LEADERBOARD", + "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + ] + + for index, query in enumerate(mssql_queries): + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=query, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter, native_query_enabled=False + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == expected_tables[index].split(".")[2] + assert data_platform_tables[0].full_name == expected_tables[index] + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + ) + # -# def test_mssql_regular_case(): -# q: str = M_QUERIES[15] +# def test_native_query_disabled(): # table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=q, +# expression=M_QUERIES[1], # name="virtual_order_table", # 
full_name="OrderDataSet.virtual_order_table", # ) @@ -189,21 +251,15 @@ # reporter = PowerBiDashboardSourceReport() # # data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 1 -# assert data_platform_tables[0].name == "book_issue" -# assert data_platform_tables[0].full_name == "library.dbo.book_issue" -# assert ( -# data_platform_tables[0].data_platform_pair.powerbi_data_platform_name -# == SupportedDataPlatform.MS_SQL.get_data_platform_pair().powerbi_data_platform_name +# table, reporter, native_query_enabled=False # ) # -# +# assert len(data_platform_tables) == 0 + # def test_native_query_disabled(): +# # for q in M_QUERIES: # table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=M_QUERIES[1], +# expression=M_QUERIES[13], # name="virtual_order_table", # full_name="OrderDataSet.virtual_order_table", # ) @@ -211,22 +267,7 @@ # reporter = PowerBiDashboardSourceReport() # # data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter, native_query_enabled=False +# table, reporter # ) # # assert len(data_platform_tables) == 0 - -def test_native_query_disabled(): - table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=M_QUERIES[9], - name="virtual_order_table", - full_name="OrderDataSet.virtual_order_table", - ) - - reporter = PowerBiDashboardSourceReport() - - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter - ) - - assert len(data_platform_tables) == 0 From 8c8fff40f9b7a81298f2b51948c680e1f6f01258 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 19 Dec 2022 17:34:15 +0530 Subject: [PATCH 27/53] Working native and regular cases --- .../powerbi/m_query/native_sql_parser.py | 18 ++ .../source/powerbi/m_query/resolver.py | 41 ++- .../source/powerbi/m_query/tree_function.py | 12 + .../integration/powerbi/test_m_parser.py | 239 ++++++++++-------- 4 files changed, 196 insertions(+), 114 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index f0ee706c1865b..bc2881119167f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,23 @@ +import logging + import sqlparse from typing import List +SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] + +LOGGER = logging.getLogger() + + +def remove_special_characters(native_query: str) -> str: + for char in SPECIAL_CHARACTERS: + native_query = native_query.replace(char, " ") + + return native_query + def get_tables(native_query: str) -> List[str]: + native_query = remove_special_characters(native_query) + LOGGER.debug("Processing query = %s", native_query) # As per current use-case, we are extracting only single table from "from" tables: List[str] = [] parsed = sqlparse.parse(native_query)[0] @@ -11,6 +26,7 @@ def get_tables(native_query: str) -> List[str]: length: int = len(tokens) from_index: int = -1 for index, token in enumerate(tokens): + LOGGER.debug("%s=%s", token.value, token.ttype) if token.value.lower().strip() == "from" and str(token.ttype) == "Token.Keyword": from_index = index+1 break @@ -18,6 +34,8 @@ def get_tables(native_query: str) -> List[str]: table_name = None while from_index < length: + LOGGER.debug("%s=%s", tokens[from_index].value, tokens[from_index].ttype) + 
LOGGER.debug("Type=%s", type(tokens[from_index])) if isinstance(tokens[from_index], sqlparse.sql.Identifier): table_name = tokens[from_index].value break diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index bf78b357073f6..d787a67d8d225 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -430,13 +430,44 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: class NativeQueryTableFullNameCreator(AbstractTableFullNameCreator): def get_platform_pair(self) -> DataPlatformPair: - return SupportedDataPlatform.POSTGRES_SQL.value + return SupportedDataPlatform.SNOWFLAKE.value def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - print("===NATIVE========") - for source in token_dict: - print(tree_function.token_values(token_dict[source]["arg_list"])) - return [] + full_table_names: List[str] = [] + data_access_dict: Dict[str, Any] = list(token_dict.values())[0] + t1: Tree = tree_function.first_arg_list_func(data_access_dict["arg_list"]) + flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) + + if len(flat_argument_list) != 2: + LOGGER.debug("Expecting 2 argument, actual argument count is %s", len(flat_argument_list)) + LOGGER.debug("Flat argument list = %s", flat_argument_list) + return full_table_names + + data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[0]) + ) + if data_access_tokens[0] != SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: + LOGGER.debug("Provided native-query data-platform = %s", data_access_tokens[0]) + LOGGER.debug("Only Snowflake is supported in NativeQuery") + return full_table_names + + # First argument is the query + sql_query: str = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(flat_argument_list[1]) + ), + char="\"" + + )[0] # Remove any whitespaces and double quotes character + + for table in native_sql_parser.get_tables(sql_query): + if len(table.split(".")) != 3: + LOGGER.debug("Skipping table (%s) as it is not as per full_table_name format", table) + full_table_names.append( + table + ) + + return full_table_names class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py index 66922e9e11e73..f13688c1bd84a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -135,6 +135,18 @@ def get_all_function_name(tree: Tree) -> List[str]: return functions +def flat_argument_list(tree: Tree) -> List[Tree]: + values: List[str] = [] + + for child in tree.children: + if isinstance(child, Token): + continue + if isinstance(child, Tree) and (child.data == "argument_list" or child.data == "expression"): + values.append(child) + + return values + + first_expression_func = partial(get_first_rule, rule="expression") first_item_selector_func = partial(get_first_rule, rule="item_selector") first_arg_list_func = partial(get_first_rule, rule="argument_list") diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py 
b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index d65bfd84774e5..8b42a924dab05 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -34,84 +34,84 @@ ] -# def test_parse_m_query1(): -# expression: str = M_QUERIES[0] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" -# -# -# def test_parse_m_query2(): -# expression: str = M_QUERIES[1] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' -# -# -# def test_parse_m_query3(): -# expression: str = M_QUERIES[2] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' -# -# -# def test_parse_m_query4(): -# expression: str = M_QUERIES[3] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' -# -# -# def test_parse_m_query5(): -# expression: str = M_QUERIES[4] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' -# -# -# def test_parse_m_query6(): -# expression: str = M_QUERIES[5] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query7(): -# expression: str = M_QUERIES[6] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query8(): -# expression: str = M_QUERIES[7] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query9(): -# expression: str = M_QUERIES[8] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' -# -# -# def test_parse_m_query10(): -# expression: str = M_QUERIES[9] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' -# -# -# def test_parse_m_query11(): -# expression: str = M_QUERIES[10] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == "Source" -# -# -# def test_parse_m_query12(): -# expression: str = M_QUERIES[11] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' -# -# -# def test_parse_m_query13(): -# expression: str = M_QUERIES[12] -# parse_tree: Tree = parser._parse_expression(expression) -# assert tree_function.get_output_variable(parse_tree) == "two_source_table" -# -# +def test_parse_m_query1(): + expression: str = M_QUERIES[0] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "TESTTABLE_Table" + + +def test_parse_m_query2(): + expression: str = M_QUERIES[1] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom2"' + + +def test_parse_m_query3(): + expression: str = M_QUERIES[2] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Conditional Column"' + + +def 
test_parse_m_query4(): + expression: str = M_QUERIES[3] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Changed Type"' + + +def test_parse_m_query5(): + expression: str = M_QUERIES[4] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Renamed Columns"' + + +def test_parse_m_query6(): + expression: str = M_QUERIES[5] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query7(): + expression: str = M_QUERIES[6] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query8(): + expression: str = M_QUERIES[7] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query9(): + expression: str = M_QUERIES[8] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom1"' + + +def test_parse_m_query10(): + expression: str = M_QUERIES[9] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Changed Type1"' + + +def test_parse_m_query11(): + expression: str = M_QUERIES[10] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "Source" + + +def test_parse_m_query12(): + expression: str = M_QUERIES[11] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == '"Added Custom"' + + +def test_parse_m_query13(): + expression: str = M_QUERIES[12] + parse_tree: Tree = parser._parse_expression(expression) + assert tree_function.get_output_variable(parse_tree) == "two_source_table" + + def test_snowflake_regular_case(): q: str = M_QUERIES[0] table: PowerBiAPI.Table = PowerBiAPI.Table( @@ -240,34 +240,55 @@ def test_mssql_with_query(): == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name ) -# -# def test_native_query_disabled(): -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=M_QUERIES[1], -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# -# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter, native_query_enabled=False -# ) -# -# assert len(data_platform_tables) == 0 - -# def test_native_query_disabled(): -# # for q in M_QUERIES: -# table: PowerBiAPI.Table = PowerBiAPI.Table( -# expression=M_QUERIES[13], -# name="virtual_order_table", -# full_name="OrderDataSet.virtual_order_table", -# ) -# -# reporter = PowerBiDashboardSourceReport() -# -# data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( -# table, reporter -# ) -# -# assert len(data_platform_tables) == 0 + +def test_snowflake_native_query(): + snowflake_queries: List[str] = [ + M_QUERIES[1], + M_QUERIES[2], + M_QUERIES[6], + M_QUERIES[10], + ] + + expected_tables = [ + "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", + "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + ] + + for index, query in enumerate(snowflake_queries): + table: PowerBiAPI.Table = 
PowerBiAPI.Table( + expression=query, + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 1 + assert data_platform_tables[0].name == expected_tables[index].split(".")[2] + assert data_platform_tables[0].full_name == expected_tables[index] + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ) + + +def test_native_query_disabled(): + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=M_QUERIES[1], + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter, native_query_enabled=False + ) + + assert len(data_platform_tables) == 0 + From 3719107b719ee4496e7eda16c7c49e11f1a15cc4 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 19 Dec 2022 18:26:43 +0530 Subject: [PATCH 28/53] lint fix --- .../powerbi/m_query/native_sql_parser.py | 11 +- .../source/powerbi/m_query/parser.py | 17 +- .../source/powerbi/m_query/resolver.py | 213 +++++++++++------- .../source/powerbi/m_query/tree_function.py | 11 +- .../source/powerbi/m_query/validator.py | 21 +- .../ingestion/source/powerbi/powerbi.py | 32 ++- .../integration/powerbi/test_m_parser.py | 19 +- 7 files changed, 188 insertions(+), 136 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index bc2881119167f..e64c3b77cff93 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,7 +1,7 @@ import logging +from typing import List import sqlparse -from typing import List SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] @@ -26,9 +26,12 @@ def get_tables(native_query: str) -> List[str]: length: int = len(tokens) from_index: int = -1 for index, token in enumerate(tokens): - LOGGER.debug("%s=%s", token.value, token.ttype) - if token.value.lower().strip() == "from" and str(token.ttype) == "Token.Keyword": - from_index = index+1 + LOGGER.debug("%s=%s", token.value, token.ttype) + if ( + token.value.lower().strip() == "from" + and str(token.ttype) == "Token.Keyword" + ): + from_index = index + 1 break table_name = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 2b442f1394037..1c4b674d5ef05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -1,15 +1,13 @@ import importlib.resources as pkg_resource import logging -from typing import List, Optional +from typing import List, cast import lark from lark import Lark, Tree from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport - +from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -from datahub.ingestion.source.powerbi.m_query import validator -from datahub.ingestion.source.powerbi.m_query import resolver LOGGER 
= logging.getLogger(__name__) @@ -45,13 +43,12 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) - valid, message = validator.validate_parse_tree(parse_tree, native_query_enabled=native_query_enabled) + valid, message = validator.validate_parse_tree( + parse_tree, native_query_enabled=native_query_enabled + ) if valid is False: - LOGGER.debug("Validation failed: %s", message) - reporter.report_warning( - table.full_name, - message - ) + LOGGER.debug("Validation failed: %s", cast(str, message)) + reporter.report_warning(table.full_name, cast(str, message)) return [] except lark.exceptions.UnexpectedCharacters as e: LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index d787a67d8d225..12a216f838b90 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -1,17 +1,15 @@ import logging from abc import ABC, abstractmethod -from typing import Dict, Optional, List, cast, Tuple, Type, Any - -from lark import Tree - from dataclasses import dataclass from enum import Enum +from typing import Any, Dict, List, Optional, Tuple, Type, cast + +from lark import Tree from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -from datahub.ingestion.source.powerbi.m_query import tree_function, native_sql_parser - LOGGER = logging.getLogger(__name__) @@ -30,24 +28,20 @@ class DataPlatformTable: class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( - powerbi_data_platform_name="PostgreSQL", - datahub_data_platform_name="postgres" - ) + powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" + ) ORACLE = DataPlatformPair( - powerbi_data_platform_name="Oracle", - datahub_data_platform_name="oracle" - ) + powerbi_data_platform_name="Oracle", datahub_data_platform_name="oracle" + ) SNOWFLAKE = DataPlatformPair( - powerbi_data_platform_name="Snowflake", - datahub_data_platform_name="snowflake" - ) + powerbi_data_platform_name="Snowflake", datahub_data_platform_name="snowflake" + ) MS_SQL = DataPlatformPair( - powerbi_data_platform_name="Sql", - datahub_data_platform_name="mssql" - ) + powerbi_data_platform_name="Sql", datahub_data_platform_name="mssql" + ) class AbstractTableFullNameCreator(ABC): @@ -74,7 +68,6 @@ def __init__( self.table = table self.parse_tree = parse_tree self.reporter = reporter - self.specific_resolver = {} @abstractmethod def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: @@ -84,16 +77,20 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): @staticmethod def get_item_selector_tokens( - expression_tree: Tree + expression_tree: Tree, ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: - item_selector: Optional[Tree] = tree_function.first_item_selector_func(expression_tree) + item_selector: Optional[Tree] = tree_function.first_item_selector_func( + expression_tree + ) if item_selector is None: LOGGER.debug("Item Selector not found in tree") LOGGER.debug(expression_tree.pretty()) return None, None - identifier_tree: Optional[Tree] = 
tree_function.first_identifier_func(expression_tree) + identifier_tree: Optional[Tree] = tree_function.first_identifier_func( + expression_tree + ) if identifier_tree is None: LOGGER.debug("Identifier not found in tree") LOGGER.debug(item_selector.pretty()) @@ -101,7 +98,9 @@ def get_item_selector_tokens( # remove whitespaces and quotes from token tokens: List[str] = tree_function.strip_char_from_list( - tree_function.remove_whitespaces_from_list(tree_function.token_values(cast(Tree, item_selector))), + tree_function.remove_whitespaces_from_list( + tree_function.token_values(cast(Tree, item_selector)) + ), '"', ) identifier: List[str] = tree_function.token_values( @@ -113,12 +112,16 @@ def get_item_selector_tokens( return identifier[0], dict(zip(iterator, iterator)) def get_argument_list(self, variable_statement: Tree) -> Optional[Tree]: - expression_tree: Optional[Tree] = tree_function.first_expression_func(variable_statement) + expression_tree: Optional[Tree] = tree_function.first_expression_func( + variable_statement + ) if expression_tree is None: LOGGER.debug("First expression rule not found in input tree") return None - argument_list: Optional[Tree] = tree_function.first_arg_list_func(expression_tree) + argument_list: Optional[Tree] = tree_function.first_arg_list_func( + expression_tree + ) if argument_list is None: LOGGER.debug("First argument-list rule not found in input tree") return None @@ -128,7 +131,11 @@ def get_argument_list(self, variable_statement: Tree) -> Optional[Tree]: def make_token_dict(self, identifier: str) -> Dict[str, Any]: token_dict: Dict[str, Any] = {} - def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_dict: Dict[str, Any]) -> None: + def fill_token_dict( + identifier: str, + supported_data_access_func: List[str], + t_dict: Dict[str, Any], + ) -> None: """ 1) Find statement where identifier appear in the left-hand side i.e. identifier = expression 2) Check expression is function invocation i.e. 
invoke_expression or item_selector @@ -155,12 +162,16 @@ def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_di ) return None - expression_tree: Optional[Tree] = tree_function.first_expression_func(v_statement) + expression_tree: Optional[Tree] = tree_function.first_expression_func( + v_statement + ) if expression_tree is None: LOGGER.debug("Expression tree not found") LOGGER.debug(v_statement.pretty()) return None - invoke_expression: Optional[Tree] = tree_function.first_invoke_expression_func(expression_tree) + invoke_expression: Optional[ + Tree + ] = tree_function.first_invoke_expression_func(expression_tree) if invoke_expression is not None: letter_tree: Tree = invoke_expression.children[0] data_access_func: str = tree_function.make_function_name(letter_tree) @@ -175,49 +186,61 @@ def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_di ) return - first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func(invoke_expression) + first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func( + invoke_expression + ) if first_arg_tree is None: - LOGGER.debug("Function invocation without argument in expression = %s", invoke_expression.pretty()) + LOGGER.debug( + "Function invocation without argument in expression = %s", + invoke_expression.pretty(), + ) self.reporter.report_warning( f"{self.table.full_name}-variable-statement", - f"Function invocation without argument", + "Function invocation without argument", ) return None - type_expression: Optional[Tree] = tree_function.first_type_expression_func(first_arg_tree) + type_expression: Optional[ + Tree + ] = tree_function.first_type_expression_func(first_arg_tree) if type_expression is None: - LOGGER.debug("Type expression not found in expression = %s", first_arg_tree.pretty()) + LOGGER.debug( + "Type expression not found in expression = %s", + first_arg_tree.pretty(), + ) self.reporter.report_warning( f"{self.table.full_name}-variable-statement", - f"Type expression not found", + "Type expression not found", ) return None tokens: List[str] = tree_function.token_values(type_expression) if len(tokens) != 1: - LOGGER.debug("type-expression has more than one identifier = %s", type_expression.pretty()) + LOGGER.debug( + "type-expression has more than one identifier = %s", + type_expression.pretty(), + ) self.reporter.report_warning( f"{self.table.full_name}-variable-statement", - f"Unsupported type expression", + "Unsupported type expression", ) return None new_identifier: str = tokens[0] fill_token_dict(new_identifier, supported_data_access_func, t_dict) else: - new_identifier, key_vs_value = self.get_item_selector_tokens( - tree_function.first_expression_func(expression_tree) + new_identifier, key_vs_value = self.get_item_selector_tokens( # type: ignore + cast(Tree, tree_function.first_expression_func(expression_tree)) ) current_selector: Dict[str, Any] = { f"{new_identifier}": { "item_selectors": [ - { - "items": key_vs_value, - "assigned_to": identifier - } + {"items": key_vs_value, "assigned_to": identifier} ], **t_dict, } } - fill_token_dict(new_identifier, supported_data_access_func, current_selector) + fill_token_dict( + new_identifier, supported_data_access_func, current_selector + ) fill_token_dict(identifier, SupportedResolver.get_function_names(), {}) @@ -226,7 +249,9 @@ def fill_token_dict(identifier: str, supported_data_access_func: List[str], t_di def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] 
- output_variable: Optional[str] = tree_function.get_output_variable(self.parse_tree) + output_variable: Optional[str] = tree_function.get_output_variable( + self.parse_tree + ) if output_variable is None: self.reporter.report_warning( f"{self.table.full_name}-output-variable", @@ -240,20 +265,27 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: for data_access_func in token_dict.keys(): supported_resolver = SupportedResolver.get_resolver(data_access_func) if supported_resolver is None: - LOGGER.debug("Resolver not found for the data-access-function %s", data_access_func) + LOGGER.debug( + "Resolver not found for the data-access-function %s", + data_access_func, + ) self.reporter.report_warning( f"{self.table.full_name}-data-access-function", - f"Resolver not found for data-access-function = {data_access_func}" + f"Resolver not found for data-access-function = {data_access_func}", ) continue - table_full_name_creator: AbstractTableFullNameCreator = supported_resolver.get_table_full_name_creator()() - for table_full_name in table_full_name_creator.get_full_table_names(token_dict): + table_full_name_creator: AbstractTableFullNameCreator = ( + supported_resolver.get_table_full_name_creator()() + ) + for table_full_name in table_full_name_creator.get_full_table_names( + token_dict + ): data_platform_tables.append( DataPlatformTable( name=table_full_name.split(".")[-1], full_name=table_full_name, - data_platform_pair=table_full_name_creator.get_platform_pair() + data_platform_pair=table_full_name_creator.get_platform_pair(), ) ) @@ -278,9 +310,11 @@ def two_level_access_pattern(self, token_dict: Dict[str, Any]) -> List[str]: for data_access_function in token_dict: arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(token_dict[data_access_function]["arg_list"]) - ), - char="\"" + tree_function.token_values( + token_dict[data_access_function]["arg_list"] + ) + ), + char='"', ) # delete arg_list as we consumed it and don't want to process it in next step if len(arguments) != 2: @@ -295,9 +329,7 @@ def two_level_access_pattern(self, token_dict: Dict[str, Any]) -> List[str]: for schema in source_dict["item_selectors"]: schema_name: str = schema["items"]["Schema"] table_name: str = schema["items"]["Item"] - full_table_names.append( - f"{db_name}.{schema_name}.{table_name}" - ) + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") LOGGER.debug("PostgreSQL full-table-names = %s", full_table_names) @@ -322,9 +354,9 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_dict["arg_list"]) - ), - char="\"" + tree_function.token_values(data_access_dict["arg_list"]) + ), + char='"', ) if len(arguments) == 2: @@ -361,9 +393,7 @@ def _get_db_name(self, value: str) -> Optional[str]: error_message: str = f"The target argument ({value}) should in the format of :/[.]" splitter_result: List[str] = value.split("/") if len(splitter_result) != 2: - self.reporter.report_warning( - f"{self.table.full_name}-oracle-target", error_message - ) + LOGGER.debug(error_message) return None db_name = splitter_result[1].split(".")[0] @@ -377,7 +407,8 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: for data_access_function in token_dict: arguments: List[str] = tree_function.remove_whitespaces_from_list( - 
tree_function.token_values(token_dict[data_access_function]["arg_list"])) + tree_function.token_values(token_dict[data_access_function]["arg_list"]) + ) # delete arg_list as we consumed it and don't want to process it in next step del token_dict[data_access_function]["arg_list"] @@ -391,7 +422,9 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: for schema in source_dict["item_selectors"]: schema_name: str = schema["items"]["Schema"] for item_selectors in source_dict[schema["assigned_to"]]: - for item_selector in source_dict[schema["assigned_to"]][item_selectors]: + for item_selector in source_dict[schema["assigned_to"]][ + item_selectors + ]: table_name: str = item_selector["items"]["Name"] full_table_names.append( f"{db_name}.{schema_name}.{table_name}" @@ -415,13 +448,15 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: for source in data_access_dict: for db_its in data_access_dict[source]["item_selectors"]: db_name: str = db_its["items"]["Name"] - for schema_its in data_access_dict[source][db_its["assigned_to"]]["item_selectors"]: + for schema_its in data_access_dict[source][db_its["assigned_to"]][ + "item_selectors" + ]: schema_name: str = schema_its["items"]["Name"] - for table_its in data_access_dict[source][db_its["assigned_to"]][schema_its["assigned_to"]]["item_selectors"]: + for table_its in data_access_dict[source][db_its["assigned_to"]][ + schema_its["assigned_to"] + ]["item_selectors"]: table_name: str = table_its["items"]["Name"] - full_table_names.append( - f"{db_name}.{schema_name}.{table_name}" - ) + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") LOGGER.debug("Snowflake full-table-name %s", full_table_names) @@ -435,37 +470,49 @@ def get_platform_pair(self) -> DataPlatformPair: def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: full_table_names: List[str] = [] data_access_dict: Dict[str, Any] = list(token_dict.values())[0] - t1: Tree = tree_function.first_arg_list_func(data_access_dict["arg_list"]) + t1: Tree = cast( + Tree, tree_function.first_arg_list_func(data_access_dict["arg_list"]) + ) flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) if len(flat_argument_list) != 2: - LOGGER.debug("Expecting 2 argument, actual argument count is %s", len(flat_argument_list)) + LOGGER.debug( + "Expecting 2 argument, actual argument count is %s", + len(flat_argument_list), + ) LOGGER.debug("Flat argument list = %s", flat_argument_list) return full_table_names data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( tree_function.token_values(flat_argument_list[0]) ) - if data_access_tokens[0] != SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name: - LOGGER.debug("Provided native-query data-platform = %s", data_access_tokens[0]) + if ( + data_access_tokens[0] + != SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ): + LOGGER.debug( + "Provided native-query data-platform = %s", data_access_tokens[0] + ) LOGGER.debug("Only Snowflake is supported in NativeQuery") return full_table_names # First argument is the query sql_query: str = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(flat_argument_list[1]) - ), - char="\"" - - )[0] # Remove any whitespaces and double quotes character + tree_function.token_values(flat_argument_list[1]) + ), + char='"', + )[ + 0 + ] # Remove any whitespaces and double quotes character for table in native_sql_parser.get_tables(sql_query): 
if len(table.split(".")) != 3: - LOGGER.debug("Skipping table (%s) as it is not as per full_table_name format", table) - full_table_names.append( - table - ) + LOGGER.debug( + "Skipping table (%s) as it is not as per full_table_name format", + table, + ) + full_table_names.append(table) return full_table_names @@ -514,9 +561,7 @@ def get_function_name(self) -> str: def get_function_names() -> List[str]: functions: List[str] = [] for supported_resolver in SupportedResolver: - functions.append( - supported_resolver.get_function_name() - ) + functions.append(supported_resolver.get_function_name()) return functions diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py index f13688c1bd84a..b6ab6b5261cf3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -1,7 +1,6 @@ import logging -from typing import Optional, List, Union, cast, Any - from functools import partial +from typing import Any, List, Optional, Union, cast from lark import Token, Tree @@ -136,12 +135,14 @@ def get_all_function_name(tree: Tree) -> List[str]: def flat_argument_list(tree: Tree) -> List[Tree]: - values: List[str] = [] + values: List[Tree] = [] for child in tree.children: if isinstance(child, Token): continue - if isinstance(child, Tree) and (child.data == "argument_list" or child.data == "expression"): + if isinstance(child, Tree) and ( + child.data == "argument_list" or child.data == "expression" + ): values.append(child) return values @@ -155,5 +156,3 @@ def flat_argument_list(tree: Tree) -> List[Tree]: first_identifier_func = partial(get_first_rule, rule="identifier") first_invoke_expression_func = partial(get_first_rule, rule="invoke_expression") first_type_expression_func = partial(get_first_rule, rule="type_expression") - - diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index 02edab6dac758..abe7d0e46b05a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -1,15 +1,16 @@ import logging +from typing import List, Optional, Tuple -from datahub.ingestion.source.powerbi.m_query import tree_function -from datahub.ingestion.source.powerbi.m_query import resolver - -from typing import List, Tuple, Optional, Set from lark import Tree +from datahub.ingestion.source.powerbi.m_query import resolver, tree_function + LOGGER = logging.getLogger(__name__) -def any_one_should_present(supported_funcs: List[str], functions: List[str]) -> Tuple[bool, Optional[str]]: +def any_one_should_present( + supported_funcs: List[str], functions: List[str] +) -> Tuple[bool, Optional[str]]: """ Anyone functions from supported_funcs should present in functions list :param supported_funcs: List of function m_query module supports @@ -23,7 +24,9 @@ def any_one_should_present(supported_funcs: List[str], functions: List[str]) -> return False, f"Function from supported function list {supported_funcs} not found" -def all_function_should_be_known(supported_funcs: List[str], functions: List[str]) -> Tuple[bool, Optional[str]]: +def all_function_should_be_known( + supported_funcs: List[str], functions: List[str] +) -> Tuple[bool, Optional[str]]: for f in functions: 
if f not in supported_funcs: return False, f"Function {f} is unknown" @@ -31,7 +34,9 @@ def all_function_should_be_known(supported_funcs: List[str], functions: List[str return True, None -def validate_parse_tree(tree: Tree, native_query_enabled: bool = True) -> Tuple[bool, str]: +def validate_parse_tree( + tree: Tree, native_query_enabled: bool = True +) -> Tuple[bool, Optional[str]]: """ :param tree: tree to validate as per functions supported by m_parser module :param native_query_enabled: Whether user want to extract lineage from native query @@ -44,6 +49,6 @@ def validate_parse_tree(tree: Tree, native_query_enabled: bool = True) -> Tuple[ if native_query_enabled is False: if resolver.FunctionName.NATIVE_QUERY.value in functions: - return False, f"Lineage extraction from native query is disabled." + return False, "Lineage extraction from native query is disabled." return True, None diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index b573956b0fd7b..b0fca163ed000 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -27,9 +27,7 @@ PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) - -from datahub.ingestion.source.powerbi.m_query import resolver -from datahub.ingestion.source.powerbi.m_query import parser +from datahub.ingestion.source.powerbi.m_query import parser, resolver from datahub.ingestion.source.powerbi.proxy import PowerBiAPI from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps from datahub.metadata.schema_classes import ( @@ -162,17 +160,25 @@ def __to_datahub_dataset( if self.__config.extract_lineage is True: # Check if upstreams table is available, parse them and create dataset URN for each upstream table upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter - ) + upstream_tables: List[ + resolver.DataPlatformTable + ] = parser.get_upstream_tables(table, self.__reporter) for upstream_table in upstream_tables: - if upstream_table.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping[upstream_table.platform_type]: + if ( + upstream_table.data_platform_pair.powerbi_data_platform_name + not in self.__config.dataset_type_mapping.keys() + ): + LOGGER.debug("Skipping upstream table for %s", ds_urn) continue platform: Union[ str, PlatformDetail - ] = self.__config.dataset_type_mapping[upstream_table.platform_type] - platform_name: str = upstream_table.data_platform_pair.datahub_data_platform_name + ] = self.__config.dataset_type_mapping[ + upstream_table.data_platform_pair.powerbi_data_platform_name + ] + platform_name: str = ( + upstream_table.data_platform_pair.datahub_data_platform_name + ) platform_instance_name: Optional[str] = None platform_env: str = DEFAULT_ENV # Determine if PlatformDetail is provided @@ -731,10 +737,12 @@ def create(cls, config_dict, ctx): return cls(config, ctx) def validate_dataset_type_mapping(self): - powerbi_data_platforms: List[str] = [data_platform.get_data_platform_pair().powerbi_data_platform_name for data_platform - in resolver.SupportedDataPlatform] + powerbi_data_platforms: List[str] = [ + data_platform.value.powerbi_data_platform_name + for data_platform in resolver.SupportedDataPlatform + ] - for key in self.source_config.keys(): + for key in 
self.source_config.dataset_type_mapping.keys(): if key not in powerbi_data_platforms: raise ValueError(f"PowerBI DataPlatform {key} is not supported") diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 8b42a924dab05..ac3fec0a6d303 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -2,14 +2,10 @@ from lark import Tree -from datahub.ingestion.source.powerbi.m_query import ( - parser, - tree_function -) from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.source.powerbi.m_query import parser, tree_function from datahub.ingestion.source.powerbi.m_query.resolver import ( DataPlatformTable, - SupportedResolver, SupportedDataPlatform, ) from datahub.ingestion.source.powerbi.proxy import PowerBiAPI @@ -174,8 +170,8 @@ def test_oracle_regular_case(): assert data_platform_tables[0].name == "EMPLOYEES" assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name ) @@ -236,8 +232,8 @@ def test_mssql_with_query(): assert data_platform_tables[0].name == expected_tables[index].split(".")[2] assert data_platform_tables[0].full_name == expected_tables[index] assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name ) @@ -272,8 +268,8 @@ def test_snowflake_native_query(): assert data_platform_tables[0].name == expected_tables[index].split(".")[2] assert data_platform_tables[0].full_name == expected_tables[index] assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name ) @@ -291,4 +287,3 @@ def test_native_query_disabled(): ) assert len(data_platform_tables) == 0 - From bb1dea32080f57b019d35aec58b972135411aa0e Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 19 Dec 2022 21:09:52 +0530 Subject: [PATCH 29/53] flag for switching native query --- .../src/datahub/ingestion/source/powerbi/config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 5d6c3dc0529d7..55e8c92c423e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -118,13 +118,17 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): extract_ownership: bool = pydantic.Field( default=True, description="Whether ownership should be ingested" ) + # Enable/Disable extracting report information + extract_reports: bool = pydantic.Field( + default=True, description="Whether reports should be ingested" + ) # Enable/Disable extracting lineage information of PowerBI Dataset extract_lineage: bool = 
pydantic.Field( default=True, description="Whether lineage should be ingested" ) - # Enable/Disable extracting report information - extract_reports: bool = pydantic.Field( - default=True, description="Whether reports should be ingested" + # Enable/Disable extracting lineage information from PowerBI Native query + native_query_parsing: bool = pydantic.Field( + default=True, description="Whether PowerBI native query should be parsed to extract lineage" ) @validator("dataset_type_mapping") From 788be4e7e4cfcb5af595e01c30b587f3e9e198b1 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 12:03:33 +0530 Subject: [PATCH 30/53] update test-cases --- .../source/powerbi/m_query/resolver.py | 2 +- .../golden_test_disabled_ownership.json | 114 ++++++++- .../powerbi/golden_test_ingest.json | 114 ++++++++- .../powerbi/golden_test_report.json | 230 +++++++++++++++++- .../integration/powerbi/test_m_parser.py | 2 +- .../tests/integration/powerbi/test_powerbi.py | 108 +++++++- 6 files changed, 557 insertions(+), 13 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 12a216f838b90..a04ff735b9860 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -74,7 +74,7 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: pass -class BaseMQueryResolver(AbstractDataAccessMQueryResolver, ABC): +class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): @staticmethod def get_item_selector_tokens( expression_tree: Tree, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index 2154e4d7c2b56..2aeedb1c44090 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -27,13 +27,125 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": 
\"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index 331e4fde518dd..094b612b17299 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -27,6 +27,118 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": 
\"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -117,7 +229,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index cfafce5d452a5..1f01a5206d8de 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -27,6 +27,118 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + 
"systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -117,7 +229,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": 
\"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -251,6 +363,118 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -341,7 +565,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -369,7 +593,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": 
\"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index ac3fec0a6d303..cea52c6703bb1 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -275,7 +275,7 @@ def test_snowflake_native_query(): def test_native_query_disabled(): table: PowerBiAPI.Table = PowerBiAPI.Table( - expression=M_QUERIES[1], + expression=M_QUERIES[1], # 1st index has the native query name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index acaa2fb77307c..56749dc56971b 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -106,6 +106,15 @@ def register_mock_api(request_mock): "webUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445", }, }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed": { + "method": "GET", + "status_code": 200, + "json": { + "id": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + "name": "hr_pbi_test", + "webUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed", + }, + }, "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/datasources": { "method": "GET", "status_code": 200, @@ -141,6 +150,7 @@ def register_mock_api(request_mock): "datasets": [ { "id": "05169CD2-E713-41E6-9600-1D8066D95445", + "name": "test_sf_pbi_test", "tables": [ { "name": "public issue_history", @@ -154,9 +164,95 @@ def register_mock_api(request_mock): "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", } ], - } + }, + { + "name": "SNOWFLAKE_TESTTABLE", + "source": [ + { + "expression": "let\n Source = Snowflake.Databases(\"hp123rt5.ap-southeast-2.fakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table", + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + { + "name": "snowflake native-query", + "source": [ + { + "expression": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = 
\"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"", + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + { + "name": "job-history", + "source": [ + { + "expression": 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + { + "name": "postgres_test_table", + "source": [ + { + "expression": 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + ], - } + }, + { + "id": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + "name": "hr_pbi_test", + "tables": [ + { + "name": "dbo_book_issue", + "source": [ + { + "expression": 'let\n Source = Sql.Database("localhost", "library"),\n dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]\n in dbo_book_issue', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + { + "name": "ms_sql_native_table", + "source": [ + { + "expression": 'let\n Source = Sql.Database("AUPRDWHDB", "COMMOPSDB", [Query="select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,\'-\',\'\'))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION", CommandTimeout=#duration(0, 1, 30, 0)]),\n #"Changed Type" = Table.TransformColumnTypes(Source,{{"mth_date", type date}}),\n #"Added Custom" = Table.AddColumn(#"Changed Type", "Month", each Date.Month([mth_date])),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "TPV Opening", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #"Added Custom1"', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, + + ], + }, ], }, ] @@ -221,6 +317,7 @@ def default_source_config(): "tenant_id": "0B0C960B-FCDF-4D0F-8C45-2E03BB59DDEB", "workspace_id": "64ED5CAD-7C10-4684-8180-826122881108", "extract_lineage": False, + "extract_reports": False, "dataset_type_mapping": { "PostgreSql": "postgres", "Oracle": "oracle", @@ -243,7 +340,6 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, 
mock_time, requests_m "type": "powerbi", "config": { **default_source_config(), - "extract_reports": False, }, }, "sink": { @@ -283,7 +379,6 @@ def test_override_ownership( "config": { **default_source_config(), "extract_ownership": False, - "extract_reports": False, }, }, "sink": { @@ -320,6 +415,7 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ "type": "powerbi", "config": { **default_source_config(), + "extract_reports": True, }, }, "sink": { @@ -333,10 +429,10 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ pipeline.run() pipeline.raise_from_status() - mce_out_file = "golden_test_report.json" + golden_file = "golden_test_report.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "powerbi_report_mces.json", - golden_path=f"{test_resources_dir}/{mce_out_file}", + golden_path=f"{test_resources_dir}/{golden_file}", ) From b7dc3cb3e83fd518f17f005b4107f4fd4618700b Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 13:01:47 +0530 Subject: [PATCH 31/53] lineage test --- metadata-ingestion/setup.py | 1 + .../source/powerbi/m_query/parser.py | 2 +- .../powerbi/golden_test_lineage.json | 366 ++++++++++++++++++ .../tests/integration/powerbi/test_powerbi.py | 37 ++ 4 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index de339b99a824b..849641d59ec64 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -609,6 +609,7 @@ def get_long_description(): "datahub.metadata": ["schema.avsc"], "datahub.metadata.schemas": ["*.avsc"], "datahub.ingestion.source.feast_image": ["Dockerfile", "requirements.txt"], + "datahub.ingestion.source.powerbi": ["powerbi-lexical-grammar.rule"] }, entry_points=entry_points, # Dependencies. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 1c4b674d5ef05..1731fa250e0dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -57,7 +57,7 @@ def get_upstream_tables( ) return [] - return resolver.BaseMQueryResolver( + return resolver.MQueryResolver( table=table, parse_tree=parse_tree, reporter=reporter, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json new file mode 100644 index 0000000000000..4ba7ae84d72d2 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -0,0 +1,366 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.HR.EMPLOYEES,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserInfo", + "aspect": { + "value": "{\"active\": true, \"displayName\": \"user1\", \"email\": \"User1@foo.com\", \"title\": \"user1\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": 
"urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "value": "{\"username\": \"User1@foo.com\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserInfo", + "aspect": { + "value": "{\"active\": true, \"displayName\": \"user2\", \"email\": \"User2@foo.com\", \"title\": \"user2\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "value": "{\"username\": \"User2@foo.com\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": 
"dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "value": "{\"paths\": [\"/powerbi/demo-workspace\"]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"dashboardId\": \"powerbi.linkedin.com/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "value": "{\"owners\": [{\"owner\": \"urn:li:corpuser:users.User1@foo.com\", \"type\": \"NONE\"}, {\"owner\": \"urn:li:corpuser:users.User2@foo.com\", \"type\": \"NONE\"}], \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 56749dc56971b..22a2c23c05980 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -436,3 +436,40 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ output_path=tmp_path / "powerbi_report_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_api(request_mock=requests_mock) + + 
pipeline = Pipeline.create( + { + "run_id": "powerbi-lineage-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + "extract_lineage": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_lineage_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_lineage.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "powerbi_lineage_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) \ No newline at end of file From 3656cc065b1808587cc661b1ec6bceb918f6d903 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 13:45:50 +0530 Subject: [PATCH 32/53] platform instance --- .../powerbi/golden_test_lineage.json | 32 +++++++++++++++++-- .../tests/integration/powerbi/test_powerbi.py | 15 +++++++++ 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json index 4ba7ae84d72d2..45b92dee88075 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -55,6 +55,20 @@ "runId": "powerbi-lineage-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.PBI_TEST.TEST.TESTTABLE,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", @@ -83,6 +97,20 @@ "runId": "powerbi-lineage-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -117,7 +145,7 @@ "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { - "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.HR.EMPLOYEES,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:oracle,high_performance_production_unit.salesdb.HR.EMPLOYEES,PROD)\", \"type\": \"TRANSFORMED\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -159,7 +187,7 @@ "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { - 
"value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,operational_instance.mics.public.order_date,PROD)\", \"type\": \"TRANSFORMED\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 22a2c23c05980..e6f112b8d0ef9 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -453,6 +453,21 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_ "config": { **default_source_config(), "extract_lineage": True, + "dataset_type_mapping": { + "PostgreSql": { + "platform_instance": "operational_instance" + }, + "Oracle": { + "platform_instance": "high_performance_production_unit" + }, + "Sql": { + "platform_instance": "reporting-db" + }, + "Snowflake": { + "platform_instance": "sn-2" + }, + }, + }, }, "sink": { From 1433b605f6a0f89b7f67c98250f5600e80500a11 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 14:09:55 +0530 Subject: [PATCH 33/53] integration test --- .../golden_test_disabled_ownership.json | 102 +++++++++++++- .../powerbi/golden_test_ingest.json | 102 +++++++++++++- .../powerbi/golden_test_lineage.json | 128 +++++++++++++++++- .../powerbi/golden_test_report.json | 102 +++++++++++++- .../tests/integration/powerbi/test_powerbi.py | 6 + 5 files changed, 433 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index 2aeedb1c44090..f913484fb85f9 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -139,6 +139,62 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + 
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "chart", "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", @@ -181,6 +237,48 @@ "runId": "powerbi-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", @@ -201,7 +299,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": 
\"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { @@ -237,4 +335,4 @@ "runId": "powerbi-test" } } -] +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index 094b612b17299..c89ba31b30a2f 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -139,6 +139,62 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -265,6 +321,48 @@ "runId": "powerbi-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": 
"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", @@ -285,7 +383,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { @@ -335,4 +433,4 @@ "runId": "powerbi-test" } } -] +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json index 45b92dee88075..85fce7f7d4394 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -195,6 +195,90 @@ "runId": "powerbi-lineage-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:mssql,reporting-db.library.dbo.book_issue,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:mssql,reporting-db.COMMOPSDB.dbo.V_PS_CD_RETENTION,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -321,6 +405,48 @@ "runId": "powerbi-lineage-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "chart", + "entityUrn": 
"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", @@ -341,7 +467,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index 1f01a5206d8de..43707cec35e2e 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -139,6 +139,62 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "corpuser", "entityUrn": "urn:li:corpuser:users.User1@foo.com", @@ -265,6 +321,48 @@ "runId": "powerbi-test" } }, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", @@ -285,7 +383,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, 
\"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { @@ -699,4 +797,4 @@ "runId": "powerbi-test" } } -] +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index e6f112b8d0ef9..bb09f544309d3 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -94,6 +94,12 @@ def register_mock_api(request_mock): "embedUrl": "https://localhost/tiles/embed/1", "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445", }, + { + "id": "23212598-23b5-4980-87cc-5fc0ecd84385", + "title": "yearly_sales", + "embedUrl": "https://localhost/tiles/embed/2", + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + } ] }, }, From 979b45753ffb29323b64d98a90578adc22a7f17c Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 14:22:54 +0530 Subject: [PATCH 34/53] lint fix --- .../ingestion/source/powerbi/config.py | 3 ++- .../tests/integration/powerbi/test_powerbi.py | 23 ++++++------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 55e8c92c423e5..fd9725801549d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -128,7 +128,8 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): ) # Enable/Disable extracting lineage information from PowerBI Native query native_query_parsing: bool = pydantic.Field( - default=True, description="Whether PowerBI native query should be parsed to extract lineage" + default=True, + description="Whether PowerBI native query should be parsed to extract lineage", ) @validator("dataset_type_mapping") diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index bb09f544309d3..a3e0dc99674ec 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -99,7 +99,7 @@ def register_mock_api(request_mock): "title": "yearly_sales", "embedUrl": "https://localhost/tiles/embed/2", "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed", - } + }, ] }, }, @@ -175,7 +175,7 @@ def register_mock_api(request_mock): "name": "SNOWFLAKE_TESTTABLE", "source": [ { - "expression": "let\n Source = Snowflake.Databases(\"hp123rt5.ap-southeast-2.fakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table", + "expression": 'let\n Source = Snowflake.Databases("hp123rt5.ap-southeast-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', } ], "datasourceUsages": [ @@ -188,7 +188,7 @@ def register_mock_api(request_mock): "name": "snowflake native-query", "source": [ { - "expression": 
"let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"", + "expression": 'let\n Source = Value.NativeQuery(Snowflake.Databases("bu20658.ap-southeast-2.snowflakecomputing.com","operations_analytics_warehouse_prod",[Role="OPERATIONS_ANALYTICS_MEMBER"]){[Name="OPERATIONS_ANALYTICS"]}[Data], "SELECT#(lf)concat((UPPER(REPLACE(SELLER,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,\'-\',\'\'))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", null, [EnableFolding=true]),\n #"Added Conditional Column" = Table.AddColumn(Source, "SME Units ENT", each if [DEAL_TYPE] = "SME Unit" then [UNIT] else 0),\n #"Added Conditional Column1" = Table.AddColumn(#"Added Conditional Column", "Banklink Units", each if [DEAL_TYPE] = "Banklink" then [UNIT] else 0),\n #"Removed Columns" = Table.RemoveColumns(#"Added Conditional Column1",{"Banklink Units"}),\n #"Added Custom" = Table.AddColumn(#"Removed Columns", "Banklink Units", each if [DEAL_TYPE] = "Banklink" and [SALES_TYPE] = "3 - Upsell"\nthen [UNIT]\n\nelse if [SALES_TYPE] = "Adjusted BL Migration"\nthen [UNIT]\n\nelse 0),\n #"Added Custom1" = Table.AddColumn(#"Added Custom", "SME Units in $ (*$361)", each if [DEAL_TYPE] = "SME Unit" \nand [SALES_TYPE] <> "4 - Renewal"\n then [UNIT] * 361\nelse 0),\n #"Added Custom2" = Table.AddColumn(#"Added Custom1", "Banklink in $ (*$148)", each [Banklink Units] * 148)\nin\n #"Added Custom2"', } ], "datasourceUsages": [ @@ -223,7 +223,6 @@ def register_mock_api(request_mock): } ], }, - ], }, { @@ -256,7 +255,6 @@ def register_mock_api(request_mock): } ], }, - ], }, ], @@ -460,20 +458,13 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_ **default_source_config(), "extract_lineage": True, "dataset_type_mapping": { - "PostgreSql": { - "platform_instance": "operational_instance" - }, + "PostgreSql": {"platform_instance": "operational_instance"}, "Oracle": { "platform_instance": "high_performance_production_unit" }, - "Sql": { - "platform_instance": "reporting-db" - }, - "Snowflake": { - "platform_instance": "sn-2" - }, + "Sql": 
{"platform_instance": "reporting-db"}, + "Snowflake": {"platform_instance": "sn-2"}, }, - }, }, "sink": { @@ -493,4 +484,4 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_ pytestconfig, output_path=tmp_path / "powerbi_lineage_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", - ) \ No newline at end of file + ) From 955245cbdd522c737d6eae3f90a5eeb093511c3d Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 16:28:24 +0530 Subject: [PATCH 35/53] lint fix --- .../ingestion/source/powerbi/config.py | 6 ++--- .../ingestion/source/powerbi/powerbi.py | 4 +-- .../datahub/ingestion/source/powerbi/proxy.py | 26 +++++++++++++++++++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index a652bb42afbe6..448c14700bcb5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -1,17 +1,15 @@ import logging - from dataclasses import dataclass, field as dataclass_field from typing import Dict, List, Union import pydantic from pydantic import validator +from pydantic.class_validators import root_validator import datahub.emitter.mce_builder as builder +from datahub.configuration.common import AllowDenyPattern from datahub.configuration.source_common import DEFAULT_ENV, EnvBasedSourceConfigBase from datahub.ingestion.api.source import SourceReport -from pydantic.class_validators import root_validator - -from datahub.configuration.common import AllowDenyPattern LOGGER = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index e4d457df16ac5..373591ee7e09d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -764,9 +764,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: # Fetch PowerBi workspace for given workspace identifier for workspace_id in self.get_workspace_ids(): LOGGER.info(f"Scanning workspace id: {workspace_id}") - workspace = self.powerbi_client.get_workspace( - workspace_id, self.reporter - ) + workspace = self.powerbi_client.get_workspace(workspace_id, self.reporter) for dashboard in workspace.dashboards: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py index 1b644a4fb4265..a7e027551290a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -684,6 +684,32 @@ def get_reports( return reports + def get_groups(self): + group_endpoint = PowerBiAPI.BASE_URL + # Hit PowerBi + LOGGER.info(f"Request to get groups endpoint URL={group_endpoint}") + response = requests.get( + group_endpoint, + headers={Constant.Authorization: self.get_access_token()}, + ) + response.raise_for_status() + return response.json() + + def get_workspaces(self): + groups = self.get_groups() + workspaces = [ + PowerBiAPI.Workspace( + id=workspace.get("id"), + name=workspace.get("name"), + state="", + datasets={}, + dashboards=[], + ) + for workspace in groups.get("value", []) + if workspace.get("type", None) == "Workspace" + ] + return workspaces + # flake8: noqa: C901 def get_workspace( self, 
workspace_id: str, reporter: PowerBiDashboardSourceReport From b53de60710b649c5e9f63d5e373035f6f602a08c Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 17:28:12 +0530 Subject: [PATCH 36/53] fix golden files --- .../golden_test_disabled_ownership.json | 2 +- .../powerbi/golden_test_ingest.json | 4 +- .../powerbi/golden_test_lineage.json | 2 +- .../powerbi/golden_test_report.json | 2 +- .../golden_test_scan_all_workspaces.json | 331 +++++++++++++----- .../tests/integration/powerbi/test_powerbi.py | 42 ++- 6 files changed, 291 insertions(+), 92 deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index f913484fb85f9..528477ca3d945 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -299,7 +299,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index c89ba31b30a2f..4646baa3ad141 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -383,7 +383,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": 
\"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { @@ -433,4 +433,4 @@ "runId": "powerbi-test" } } -] \ No newline at end of file +] diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json index 85fce7f7d4394..d59d38b7d17a9 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -467,7 +467,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index 43707cec35e2e..9092d5bc6ea7f 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -383,7 +383,7 @@ "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", 
\"workspaceId\": \"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json index afa2b182168d1..255a907e39b8f 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json @@ -1,28 +1,49 @@ [ { - "auditHeader": null, "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", - "entityKeyAspect": null, + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { - "value": "{\"customProperties\": {}, \"description\": \"issue_history\", \"tags\": []}", + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)", - "entityKeyAspect": null, + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -31,36 +52,166 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": 
"application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + 
"aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "chart", "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:postgres,library_db.public.issue_history,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "chart", "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -69,17 +220,12 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "chart", "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "chartKey", "aspect": { @@ -88,17 +234,54 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": 
\"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "browsePaths", "aspect": { @@ -107,36 +290,26 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dashboardInfo", "aspect": { - "value": "{\"customProperties\": {\"chartCount\": \"1\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", "contentType": "application/json" }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": 
"powerbi-test" } }, { - "auditHeader": null, "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -145,17 +318,12 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" } }, { - "auditHeader": null, "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", - "entityKeyAspect": null, "changeType": "UPSERT", "aspectName": "dashboardKey", "aspect": { @@ -164,63 +332,60 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "powerbi-test", - "registryName": null, - "registryVersion": null, - "properties": null + "runId": "powerbi-test" } }, { - "aspect": { - "contentType": "application/json", - "value": "{\"paths\": [\"/powerbi/second-demo-workspace\"]}" - }, - "aspectName": "browsePaths", - "changeType": "UPSERT", "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "value": "{\"paths\": [\"/powerbi/second-demo-workspace\"]}", + "contentType": "application/json" + }, "systemMetadata": { "lastObserved": 1643871600000, "runId": "powerbi-test" } }, { - "aspect": { - "contentType": "application/json", - "value": "{\"customProperties\": {\"chartCount\": \"0\", \"workspaceName\": \"second-demo-workspace\", \"workspaceId\": \"7D668CAD-8FFC-4505-9215-655BCA5BEBAE\"}, \"title\": \"test_dashboard2\", \"description\": \"test_dashboard2\", \"charts\": [], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}" - }, - "aspectName": "dashboardInfo", - "changeType": "UPSERT", "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "value": "{\"customProperties\": {\"chartCount\": \"0\", \"workspaceName\": \"second-demo-workspace\", \"workspaceId\": \"64ED5CAD-7C22-4684-8180-826122881108\"}, \"title\": \"test_dashboard2\", \"description\": \"test_dashboard2\", \"charts\": [], \"datasets\": [], \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "contentType": "application/json" + }, "systemMetadata": { "lastObserved": 1643871600000, "runId": "powerbi-test" } }, { - "aspect": { - "contentType": "application/json", - "value": "{\"removed\": false}" - }, - "aspectName": "status", - "changeType": "UPSERT", "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, "systemMetadata": { "lastObserved": 1643871600000, "runId": "powerbi-test" } }, { - "aspect": { - "contentType": "application/json", - "value": "{\"dashboardTool\": \"powerbi\", \"dashboardId\": \"powerbi.linkedin.com/dashboards/7D668CAD-8FFC-4505-9215-655BCA5BEBAE\"}" - }, - "aspectName": 
"dashboardKey", - "changeType": "UPSERT", "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-8FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"dashboardId\": \"powerbi.linkedin.com/dashboards/7D668CAD-8FFC-4505-9215-655BCA5BEBAE\"}", + "contentType": "application/json" + }, "systemMetadata": { "lastObserved": 1643871600000, "runId": "powerbi-test" diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 4105fdceb37ee..7815b369022c0 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -134,6 +134,31 @@ def register_mock_api(request_mock): ] }, }, + "https://api.powerbi.com/v1.0/myorg/admin/dashboards/7D668CAD-8FFC-4505-9215-655BCA5BEBAE/users": { + "method": "GET", + "status_code": 200, + "json": { + "value": [ + { + "identifier": "User3@foo.com", + "displayName": "user3", + "emailAddress": "User3@foo.com", + "datasetUserAccessRight": "ReadWrite", + "graphId": "C9EE53F2-88EA-4711-A173-AF0515A3CD46", + "principalType": "User", + }, + { + "identifier": "User4@foo.com", + "displayName": "user4", + "emailAddress": "User4@foo.com", + "datasetUserAccessRight": "ReadWrite", + "graphId": "C9EE53F2-88EA-4711-A173-AF0515A5REWS", + "principalType": "User", + }, + ] + }, + + }, "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE/tiles": { "method": "GET", "status_code": 200, @@ -168,6 +193,15 @@ def register_mock_api(request_mock): "webUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445", }, }, + "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C22-4684-8180-826122881108/datasets/05169CD2-E713-41E6-96AA-1D8066D95445": { + "method": "GET", + "status_code": 200, + "json": { + "id": "05169CD2-E713-41E6-96AA-1D8066D95445", + "name": "library-dataset", + "webUrl": "http://localhost/groups/64ED5CAD-7C22-4684-8180-826122881108/datasets/05169CD2-E713-41E6-96AA-1D8066D95445", + }, + }, "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed": { "method": "GET", "status_code": 200, @@ -466,12 +500,12 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m pipeline.run() pipeline.raise_from_status() - mce_out_file = "golden_test_ingest.json" + golden_file = "golden_test_ingest.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "powerbi_mces.json", - golden_path=f"{test_resources_dir}/{mce_out_file}", + golden_path=f"{test_resources_dir}/{golden_file}", ) @@ -555,12 +589,12 @@ def test_scan_all_workspaces( pipeline.run() pipeline.raise_from_status() - mce_out_file = "golden_test_scan_all_workspaces.json" + golden_file = "golden_test_scan_all_workspaces.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "powerbi_mces_scan_all_workspaces.json", - golden_path=f"{test_resources_dir}/{mce_out_file}", + golden_path=f"{test_resources_dir}/{golden_file}", ) From 3ca31a0ae2d6c6ce1e37183f77210c64417e65ab Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 18:22:42 +0530 Subject: [PATCH 37/53] fix test --- .../tests/integration/powerbi/test_powerbi.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 
deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 7815b369022c0..fcd68e472675a 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,3 +1,4 @@ +from typing import Dict, Any from unittest import mock from freezegun import freeze_time @@ -7,8 +8,6 @@ FROZEN_TIME = "2022-02-03 07:00:00" -call_number = 1 - def mock_msal_cca(*args, **kwargs): class MsalClient: @@ -20,12 +19,16 @@ def acquire_token_for_client(self, *args, **kwargs): return MsalClient() -def scan_init_response(_request, _context): - global call_number - if call_number == 1: - call_number += 1 - return {"id": "4674efd1-603c-4129-8d82-03cf2be05aff"} - return {"id": "a674efd1-603c-4129-8d82-03cf2be05aff"} +def scan_init_response(request, context): + # Request mock is passing POST input in the form of workspaces= + workspace_id = request.text.split("=")[1] + + w_id_vs_response: Dict[str, Any] = { + "64ED5CAD-7C10-4684-8180-826122881108": {"id": "4674efd1-603c-4129-8d82-03cf2be05aff"}, + "64ED5CAD-7C22-4684-8180-826122881108": {"id": "a674efd1-603c-4129-8d82-03cf2be05aff"}, + } + + return w_id_vs_response[workspace_id] def register_mock_api(request_mock): @@ -473,8 +476,6 @@ def default_source_config(): @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - global call_number - call_number = 1 test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -514,8 +515,6 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m def test_override_ownership( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - global call_number - call_number = 1 test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -556,8 +555,6 @@ def test_override_ownership( def test_scan_all_workspaces( mock_msal, pytestconfig, tmp_path, mock_time, requests_mock ): - global call_number - call_number = 1 test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -601,8 +598,6 @@ def test_scan_all_workspaces( @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): - global call_number - call_number = 1 test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" @@ -641,6 +636,7 @@ def test_extract_reports(mock_msal, pytestconfig, tmp_path, mock_time, requests_ @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" register_mock_api(request_mock=requests_mock) @@ -678,6 +674,6 @@ def test_extract_lineage(mock_msal, pytestconfig, tmp_path, mock_time, requests_ mce_helpers.check_golden_file( pytestconfig, - output_path=tmp_path / "powerbi_lineage_mces.json", + output_path=f"{tmp_path}/powerbi_lineage_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) From 68363ff8ff73a84dc115499ad5908728de33b8d3 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Tue, 20 Dec 2022 18:39:02 +0530 Subject: [PATCH 38/53] lint fix --- .../tests/integration/powerbi/test_powerbi.py | 11 +++++++---- 1 file 
changed, 7 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index fcd68e472675a..ce934ffc0a688 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,4 +1,4 @@ -from typing import Dict, Any +from typing import Any, Dict from unittest import mock from freezegun import freeze_time @@ -24,8 +24,12 @@ def scan_init_response(request, context): workspace_id = request.text.split("=")[1] w_id_vs_response: Dict[str, Any] = { - "64ED5CAD-7C10-4684-8180-826122881108": {"id": "4674efd1-603c-4129-8d82-03cf2be05aff"}, - "64ED5CAD-7C22-4684-8180-826122881108": {"id": "a674efd1-603c-4129-8d82-03cf2be05aff"}, + "64ED5CAD-7C10-4684-8180-826122881108": { + "id": "4674efd1-603c-4129-8d82-03cf2be05aff" + }, + "64ED5CAD-7C22-4684-8180-826122881108": { + "id": "a674efd1-603c-4129-8d82-03cf2be05aff" + }, } return w_id_vs_response[workspace_id] @@ -160,7 +164,6 @@ def register_mock_api(request_mock): }, ] }, - }, "https://api.powerbi.com/v1.0/myorg/groups/64ED5CAD-7C10-4684-8180-826122881108/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE/tiles": { "method": "GET", From 3abe48fb4107ff74a21ceaf2076962ff7c097742 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 21 Dec 2022 11:39:01 +0530 Subject: [PATCH 39/53] lint fix --- .../docs/sources/powerbi/powerbi_pre.md | 48 +- .../ingestion/source/powerbi/config.py | 13 +- .../ingestion/source/powerbi/powerbi.py | 23 +- .../golden_test_lower_case_urn_ingest.json | 436 ++++++++++++++++++ .../tests/integration/powerbi/test_powerbi.py | 42 ++ 5 files changed, 555 insertions(+), 7 deletions(-) create mode 100644 metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md index c87435a077968..24f7b92cf8998 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md @@ -7,10 +7,10 @@ See the - Enhance admin APIs responses with detailed metadata ## Concept mapping -| Power BI | Datahub | +| Power BI | Datahub | |-----------------------|---------------------| | `Dashboard` | `Dashboard` | -| `Dataset, Datasource` | `Dataset` | +| `Dataset's Table` | `Dataset` | | `Tile` | `Chart` | | `Report.webUrl` | `Chart.externalUrl` | | `Workspace` | `N/A` | @@ -18,3 +18,47 @@ See the | `Page` | `Chart` | If Tile is created from report then Chart.externalUrl is set to Report.webUrl. + +## Lineage +You can control table lineage ingestion using `extract_lineage` configuration parameter, by default it is set to `true`. + +PowerBI Source extracts the lineage information by parsing PowerBI M-Query expression. + +PowerBI Source supports M-Query expression for below listed PowerBI Data Sources + +1. Snowflake +2. Oracle +3. PostgreSQL +4. MS-SQL + +Native SQL query parsing is only supported for `Snowflake` data-source and only first table from `FROM` clause will be ingested as upstream table. Advance SQL construct like JOIN and SUB-QUERIES in `FROM` clause are not supported. + +For example refer below native SQL query. The table `OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_UNIT_TARGET` will be ingested as upstream table. 
+ +```shell +let + Source = Value.NativeQuery( + Snowflake.Databases( + "sdfsd788.ws-east-2.fakecomputing.com", + "operations_analytics_prod", + [Role = "OPERATIONS_ANALYTICS_MEMBER"] + ){[Name = "OPERATIONS_ANALYTICS"]}[Data], + "select #(lf)UPPER(REPLACE(AGENT_NAME,\'-\',\'\')) AS Agent,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,\'-\',\'\'))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_UNIT_TARGETS#(lf)#(lf)where YEAR_TARGET >= 2020#(lf)and TEAM_TYPE = \'foo\'#(lf)and TARGET_TEAM = \'bar\'", + null, + [EnableFolding = true] + ), + #"Added Conditional Column" = Table.AddColumn( + Source, + "Has PS Software Quota?", + each + if [TIER] = "Expansion (Medium)" then + "Yes" + else if [TIER] = "Acquisition" then + "Yes" + else + "No" + ) +in + #"Added Conditional Column" +``` + diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 448c14700bcb5..1d820c726544f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -90,7 +90,7 @@ def report_charts_dropped(self, view: str) -> None: class PlatformDetail: platform_instance: str = pydantic.Field( default=None, - description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source", + description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source of particular platform", ) env: str = pydantic.Field( default=DEFAULT_ENV, @@ -143,6 +143,17 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): description="Whether PowerBI native query should be parsed to extract lineage", ) + # convert PowerBI data-set URN to lower-case + convert_urns_to_lowercase: bool = pydantic.Field( + default=False, + description="Whether to convert the PowerBI assets urns to lowercase", + ) + # convert lineage dataset's urns to lowercase + convert_lineage_urns_to_lowercase: bool = pydantic.Field( + default=True, + description="Whether to convert the urns of ingested lineage dataset to lowercase", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 373591ee7e09d..61119a8658d58 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -57,7 +57,7 @@ class Mapper: """ - Transfrom PowerBi concepts Dashboard, Dataset and Tile to DataHub concepts Dashboard, Dataset and Chart + Transform PowerBi concepts Dashboard, Dataset and Tile to DataHub concepts Dashboard, Dataset and Chart """ class EquableMetadataWorkUnit(MetadataWorkUnit): @@ -80,6 +80,21 @@ def __init__( self.__config = config self.__reporter = reporter + @staticmethod + def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + def lineage_urn_to_lowercase(self, value): + return Mapper.urn_to_lowercase( + value, self.__config.convert_lineage_urns_to_lowercase + ) + + def 
assets_urn_to_lowercase(self, value): + return Mapper.urn_to_lowercase(value, self.__config.convert_urns_to_lowercase) + def new_mcp( self, entity_type, @@ -131,7 +146,7 @@ def __to_datahub_dataset( # Create a URN for dataset ds_urn = builder.make_dataset_urn( platform=self.__config.platform_name, - name=f"{table.full_name}", + name=self.assets_urn_to_lowercase(table.full_name), env=self.__config.env, ) @@ -192,7 +207,7 @@ def __to_datahub_dataset( platform=platform_name, platform_instance=platform_instance_name, env=platform_env, - name=upstream_table.full_name, + name=self.lineage_urn_to_lowercase(upstream_table.full_name), ) upstream_table_class = UpstreamClass( upstream_urn, @@ -219,7 +234,7 @@ def __to_datahub_chart( Map PowerBi tile to datahub chart """ LOGGER.info("Converting tile {}(id={}) to chart".format(tile.title, tile.id)) - # Create an URN for chart + # Create a URN for chart chart_urn = builder.make_chart_urn( self.__config.platform_name, tile.get_urn_part() ) diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json new file mode 100644 index 0000000000000..2eabb5dcc45f1 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json @@ -0,0 +1,436 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"public issue_history\", \"description\": \"public issue_history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_testtable,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"SNOWFLAKE_TESTTABLE\", \"description\": \"SNOWFLAKE_TESTTABLE\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_testtable,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query\", \"description\": \"snowflake native-query\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + 
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"job-history\", \"description\": \"job-history\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"postgres_test_table\", \"description\": \"postgres_test_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"dbo_book_issue\", \"description\": \"dbo_book_issue\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"ms_sql_native_table\", \"description\": \"ms_sql_native_table\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + 
"lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserInfo", + "aspect": { + "value": "{\"active\": true, \"displayName\": \"user1\", \"email\": \"User1@foo.com\", \"title\": \"user1\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "value": "{\"username\": \"User1@foo.com\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserInfo", + "aspect": { + "value": "{\"active\": true, \"displayName\": \"user2\", \"email\": \"User2@foo.com\", \"title\": \"user2\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "value": "{\"username\": \"User2@foo.com\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_testtable,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": 
"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "value": "{\"customProperties\": {\"datasetId\": \"ba0130a1-5b03-40de-9535-b34e778ea6ed\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"yearly_sales\", \"description\": \"yearly_sales\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"chartId\": \"powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "value": "{\"paths\": [\"/powerbi/demo-workspace\"]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "value": "{\"customProperties\": {\"chartCount\": \"2\", \"workspaceName\": \"demo-workspace\", \"workspaceId\": \"64ED5CAD-7C10-4684-8180-826122881108\"}, \"title\": \"test_dashboard\", \"description\": \"test_dashboard\", \"charts\": [\"urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)\", \"urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)\"], \"datasets\": [], 
\"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"dashboardUrl\": \"https://localhost/dashboards/web/1\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "value": "{\"dashboardTool\": \"powerbi\", \"dashboardId\": \"powerbi.linkedin.com/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE\"}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "value": "{\"owners\": [{\"owner\": \"urn:li:corpuser:users.User1@foo.com\", \"type\": \"NONE\"}, {\"owner\": \"urn:li:corpuser:users.User2@foo.com\", \"type\": \"NONE\"}], \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index ce934ffc0a688..d6ae1b033b10c 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -467,6 +467,7 @@ def default_source_config(): "workspace_id": "64ED5CAD-7C10-4684-8180-826122881108", "extract_lineage": False, "extract_reports": False, + "convert_lineage_urns_to_lowercase": False, "workspace_id_pattern": {"allow": ["64ED5CAD-7C10-4684-8180-826122881108"]}, "dataset_type_mapping": { "PostgreSql": "postgres", @@ -513,6 +514,47 @@ def test_powerbi_ingest(mock_msal, pytestconfig, tmp_path, mock_time, requests_m ) +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_powerbi_ingest_urn_lower_case( + mock_msal, pytestconfig, tmp_path, mock_time, requests_mock +): + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + "convert_urns_to_lowercase": True, + "convert_lineage_urns_to_lowercase": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_lower_case_urn_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_lower_case_urn_ingest.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/powerbi_lower_case_urn_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + @freeze_time(FROZEN_TIME) 
@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) def test_override_ownership( From 843cf0d4dabb97050fad1f00cd4e5f7d9daab036 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 21 Dec 2022 11:44:26 +0530 Subject: [PATCH 40/53] spell fix --- .../src/datahub/ingestion/source/powerbi/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 1d820c726544f..556c80ba578b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -143,7 +143,7 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): description="Whether PowerBI native query should be parsed to extract lineage", ) - # convert PowerBI data-set URN to lower-case + # convert PowerBI dataset URN to lower-case convert_urns_to_lowercase: bool = pydantic.Field( default=False, description="Whether to convert the PowerBI assets urns to lowercase", From dfe51a0aede1afea0d4ee58e5ace52eaae15aaad Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 21:26:09 +0530 Subject: [PATCH 41/53] 1. Lint fix 2. Multiple data-source support 3. Table.Combine --- .../source/powerbi/m_query/data_classes.py | 43 ++ .../source/powerbi/m_query/parser.py | 1 + .../source/powerbi/m_query/resolver.py | 458 +++++++++++------- .../source/powerbi/m_query/tree_function.py | 1 + .../powerbi/powerbi-lexical-grammar.rule | 2 +- .../integration/powerbi/test_m_parser.py | 56 ++- 6 files changed, 379 insertions(+), 182 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py new file mode 100644 index 0000000000000..6f845a32b7007 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/data_classes.py @@ -0,0 +1,43 @@ +from abc import ABC +from dataclasses import dataclass +from typing import Any, Dict, Optional + +from lark import Tree + + +class AbstractIdentifierAccessor(ABC): # To pass lint + pass + + +# @dataclass +# class ItemSelector: +# items: Dict[str, Any] +# next: Optional[AbstractIdentifierAccessor] + + +@dataclass +class IdentifierAccessor(AbstractIdentifierAccessor): + """ + statement + public_order_date = Source{[Schema="public",Item="order_date"]}[Data] + will be converted to IdentifierAccessor instance + where: + + "Source" is identifier + + "[Schema="public",Item="order_date"]" is "items" in ItemSelector. Data of items varies as per DataSource + + "public_order_date" is in "next" of ItemSelector. The "next" will be None if this identifier is leaf i.e. 
table + + """ + + identifier: str + items: Dict[str, Any] + next: Optional[AbstractIdentifierAccessor] + + +@dataclass +class DataAccessFunctionDetail: + arg_list: Tree + data_access_function_name: str + identifier_accessor: Optional[IdentifierAccessor] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 1731fa250e0dd..35af1fb89f3b2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -43,6 +43,7 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) + print(parse_tree.pretty()) valid, message = validator.validate_parse_tree( parse_tree, native_query_enabled=native_query_enabled ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index a04ff735b9860..592e937e1257c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -2,12 +2,16 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Tuple, Type, cast +from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast from lark import Tree from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function +from datahub.ingestion.source.powerbi.m_query.data_classes import ( + DataAccessFunctionDetail, + IdentifierAccessor, +) from datahub.ingestion.source.powerbi.proxy import PowerBiAPI LOGGER = logging.getLogger(__name__) @@ -46,7 +50,9 @@ class SupportedDataPlatform(Enum): class AbstractTableFullNameCreator(ABC): @abstractmethod - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: pass @abstractmethod @@ -58,6 +64,7 @@ class AbstractDataAccessMQueryResolver(ABC): table: PowerBiAPI.Table parse_tree: Tree reporter: PowerBiDashboardSourceReport + data_access_functions: List[str] def __init__( self, @@ -68,6 +75,7 @@ def __init__( self.table = table self.parse_tree = parse_tree self.reporter = reporter + self.data_access_functions = SupportedResolver.get_function_names() @abstractmethod def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: @@ -106,21 +114,16 @@ def get_item_selector_tokens( identifier: List[str] = tree_function.token_values( cast(Tree, identifier_tree) ) # type :ignore + # convert tokens to dict iterator = iter(tokens) - # cast to satisfy lint - return identifier[0], dict(zip(iterator, iterator)) - def get_argument_list(self, variable_statement: Tree) -> Optional[Tree]: - expression_tree: Optional[Tree] = tree_function.first_expression_func( - variable_statement - ) - if expression_tree is None: - LOGGER.debug("First expression rule not found in input tree") - return None + return "".join(identifier), dict(zip(iterator, iterator)) + @staticmethod + def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: argument_list: Optional[Tree] = tree_function.first_arg_list_func( - expression_tree + invoke_expression ) if argument_list is None: LOGGER.debug("First argument-list rule not found in input 
tree") @@ -128,13 +131,136 @@ def get_argument_list(self, variable_statement: Tree) -> Optional[Tree]: return argument_list - def make_token_dict(self, identifier: str) -> Dict[str, Any]: - token_dict: Dict[str, Any] = {} + def _process_invoke_expression( + self, invoke_expression: Tree + ) -> Union[DataAccessFunctionDetail, List[str], None]: + + letter_tree: Tree = invoke_expression.children[0] + data_access_func: str = tree_function.make_function_name(letter_tree) + # The invoke function is either DataAccess function like PostgreSQL.Database() or + # some other function like Table.AddColumn or Table.Combine and so on + if data_access_func in self.data_access_functions: + arg_list: Optional[Tree] = MQueryResolver.get_argument_list( + invoke_expression + ) + if arg_list is None: + self.reporter.report_warning( + f"{self.table.full_name}-arg-list", + f"Argument list not found for data-access-function {data_access_func}", + ) + return None + + return DataAccessFunctionDetail( + arg_list=arg_list, + data_access_function_name=data_access_func, + identifier_accessor=None, + ) + + # function is not data-access function, lets process function argument + first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func( + invoke_expression + ) + + if first_arg_tree is None: + LOGGER.debug( + "Function invocation without argument in expression = %s", + invoke_expression.pretty(), + ) + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Function invocation without argument", + ) + return None + + first_argument: Tree = tree_function.flat_argument_list(first_arg_tree)[ + 0 + ] # take first argument only + expression: Optional[Tree] = tree_function.first_list_expression_func( + first_argument + ) + + LOGGER.debug("Extracting token from tree %s", first_argument.pretty()) + if expression is None: + expression = tree_function.first_type_expression_func(first_argument) + if expression is None: + LOGGER.debug( + "Either list_expression or type_expression is not found = %s", + invoke_expression.pretty(), + ) + self.reporter.report_warning( + f"{self.table.full_name}-variable-statement", + "Function argument expression is not supported", + ) + return None + + tokens: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(expression) + ) + + LOGGER.debug("Tokens in invoke expression are %s", tokens) + return tokens + + def _process_item_selector_expression( + self, rh_tree: Tree + ) -> Tuple[Optional[str], Optional[Dict[str, str]]]: + new_identifier, key_vs_value = self.get_item_selector_tokens( # type: ignore + cast(Tree, tree_function.first_expression_func(rh_tree)) + ) + + return new_identifier, key_vs_value + + @staticmethod + def _create_or_update_identifier_accessor( + identifier_accessor: Optional[IdentifierAccessor], + new_identifier: str, + key_vs_value: Dict[str, Any], + ) -> IdentifierAccessor: + + # def create_item_selector(items: Dict[str, Any], _next: IdentifierAccessor): + # return ItemSelector( + # items=items, + # next=_next, + # ) + # + # def update_identifier_accessor(node: IdentifierAccessor, identifier: str, items: Dict[str, Any]) -> bool: + # flag: bool = False + # if node.identifier == identifier: + # node.item_selectors.append( + # create_item_selector( + # items=items + # ) + # ) + # return True + # + # for item_selector in node.item_selectors: + # if item_selector.next is None: + # continue + # flag = update_identifier_accessor(item_selector.next, identifier, items) + # if flag is True: + # break + # + # return flag + + # 
It is first identifier_accessor + if identifier_accessor is None: + return IdentifierAccessor( + identifier=new_identifier, items=key_vs_value, next=None + ) + + new_identifier_accessor: IdentifierAccessor = IdentifierAccessor( + identifier=new_identifier, items=key_vs_value, next=identifier_accessor + ) + + return new_identifier_accessor - def fill_token_dict( - identifier: str, - supported_data_access_func: List[str], - t_dict: Dict[str, Any], + def create_data_access_functional_detail( + self, identifier: str + ) -> List[DataAccessFunctionDetail]: + table_links: List[DataAccessFunctionDetail] = [] + + def internal( + current_identifier: str, + identifier_accessor: Optional[IdentifierAccessor], ) -> None: """ 1) Find statement where identifier appear in the left-hand side i.e. identifier = expression @@ -146,105 +272,74 @@ def fill_token_dict( 5) This recursion will continue till we reach to data-access function and during recursion we will fill token_dict dictionary for all item_selector we find during traversal. - :param identifier: variable to look for - :param supported_data_access_func: List of supported data-access functions - :param t_dict: dict where key is identifier and value is key-value pair which represent item selected from - identifier + :param current_identifier: variable to look for + :param identifier_accessor: :return: None """ + # Grammar of variable_statement is = + # Examples: Source = PostgreSql.Database() + # public_order_date = Source{[Schema="public",Item="order_date"]}[Data] v_statement: Optional[Tree] = tree_function.get_variable_statement( - self.parse_tree, identifier + self.parse_tree, current_identifier ) if v_statement is None: self.reporter.report_warning( f"{self.table.full_name}-variable-statement", - f"output variable ({identifier}) statement not found in table expression", + f"output variable ({current_identifier}) statement not found in table expression", ) return None - expression_tree: Optional[Tree] = tree_function.first_expression_func( - v_statement - ) - if expression_tree is None: + # Any expression after "=" sign of variable-statement + rh_tree: Optional[Tree] = tree_function.first_expression_func(v_statement) + if rh_tree is None: LOGGER.debug("Expression tree not found") LOGGER.debug(v_statement.pretty()) return None + invoke_expression: Optional[ Tree - ] = tree_function.first_invoke_expression_func(expression_tree) - if invoke_expression is not None: - letter_tree: Tree = invoke_expression.children[0] - data_access_func: str = tree_function.make_function_name(letter_tree) - if data_access_func in supported_data_access_func: - token_dict.update( - { - f"{data_access_func}": { - "arg_list": self.get_argument_list(expression_tree), - **t_dict, - } - } - ) - return + ] = tree_function.first_invoke_expression_func(rh_tree) - first_arg_tree: Optional[Tree] = tree_function.first_arg_list_func( - invoke_expression - ) - if first_arg_tree is None: - LOGGER.debug( - "Function invocation without argument in expression = %s", - invoke_expression.pretty(), - ) - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Function invocation without argument", - ) - return None - type_expression: Optional[ - Tree - ] = tree_function.first_type_expression_func(first_arg_tree) - if type_expression is None: - LOGGER.debug( - "Type expression not found in expression = %s", - first_arg_tree.pretty(), - ) - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Type expression not found", + if invoke_expression 
is not None: + result: Union[ + DataAccessFunctionDetail, List[str], None + ] = self._process_invoke_expression(invoke_expression) + if result is None: + return None # No need to process some un-expected grammar found while processing invoke_expression + if isinstance(result, DataAccessFunctionDetail): + cast( + DataAccessFunctionDetail, result + ).identifier_accessor = identifier_accessor + table_links.append(result) # Link of a table is completed + identifier_accessor = ( + None # reset the identifier_accessor for other table ) return None + # Process first argument of the function. + # The first argument can be a single table argument or list of table. + # For example Table.Combine({t1,t2},....), here first argument is list of table. + # Table.AddColumn(t1,....), here first argument is single table. + for token in cast(List[str], result): + internal(token, identifier_accessor) - tokens: List[str] = tree_function.token_values(type_expression) - if len(tokens) != 1: - LOGGER.debug( - "type-expression has more than one identifier = %s", - type_expression.pretty(), - ) - self.reporter.report_warning( - f"{self.table.full_name}-variable-statement", - "Unsupported type expression", - ) - return None - new_identifier: str = tokens[0] - fill_token_dict(new_identifier, supported_data_access_func, t_dict) else: - new_identifier, key_vs_value = self.get_item_selector_tokens( # type: ignore - cast(Tree, tree_function.first_expression_func(expression_tree)) + new_identifier, key_vs_value = self._process_item_selector_expression( + rh_tree ) - current_selector: Dict[str, Any] = { - f"{new_identifier}": { - "item_selectors": [ - {"items": key_vs_value, "assigned_to": identifier} - ], - **t_dict, - } - } - fill_token_dict( - new_identifier, supported_data_access_func, current_selector + if new_identifier is None or key_vs_value is None: + LOGGER.debug("Required information not found in rh_tree") + return None + new_identifier_accessor: IdentifierAccessor = ( + self._create_or_update_identifier_accessor( + identifier_accessor, new_identifier, key_vs_value + ) ) - fill_token_dict(identifier, SupportedResolver.get_function_names(), {}) + return internal(new_identifier, new_identifier_accessor) - return token_dict + internal(identifier, None) + + return table_links def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] @@ -252,6 +347,7 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) + if output_variable is None: self.reporter.report_warning( f"{self.table.full_name}-output-variable", @@ -259,27 +355,32 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables - token_dict: Dict[str, Any] = self.make_token_dict(output_variable) + table_links: List[ + DataAccessFunctionDetail + ] = self.create_data_access_functional_detail(output_variable) - # each key is data-access function - for data_access_func in token_dict.keys(): - supported_resolver = SupportedResolver.get_resolver(data_access_func) + # Each item is data-access function + for f_detail in table_links: + supported_resolver = SupportedResolver.get_resolver( + f_detail.data_access_function_name + ) if supported_resolver is None: LOGGER.debug( "Resolver not found for the data-access-function %s", - data_access_func, + f_detail.data_access_function_name, ) self.reporter.report_warning( 
f"{self.table.full_name}-data-access-function", - f"Resolver not found for data-access-function = {data_access_func}", + f"Resolver not found for data-access-function = {f_detail.data_access_function_name}", ) continue table_full_name_creator: AbstractTableFullNameCreator = ( supported_resolver.get_table_full_name_creator()() ) + for table_full_name in table_full_name_creator.get_full_table_names( - token_dict + f_detail ): data_platform_tables.append( DataPlatformTable( @@ -302,34 +403,34 @@ class DefaultTwoStepDataAccessSources(AbstractTableFullNameCreator, ABC): dbo_book_issue """ - def two_level_access_pattern(self, token_dict: Dict[str, Any]) -> List[str]: + def two_level_access_pattern( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: full_table_names: List[str] = [] - LOGGER.debug("Processing PostgreSQL token-dict %s", token_dict) - - for data_access_function in token_dict: - arguments: List[str] = tree_function.strip_char_from_list( - values=tree_function.remove_whitespaces_from_list( - tree_function.token_values( - token_dict[data_access_function]["arg_list"] - ) - ), - char='"', - ) - # delete arg_list as we consumed it and don't want to process it in next step - if len(arguments) != 2: - LOGGER.debug("Expected 2 arguments, but got {%s}", len(arguments)) - return full_table_names + LOGGER.debug( + "Processing PostgreSQL data-access function detail %s", + data_access_func_detail, + ) + arguments: List[str] = tree_function.strip_char_from_list( + values=tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ), + char='"', + ) - del token_dict[data_access_function]["arg_list"] + if len(arguments) != 2: + LOGGER.debug("Expected 2 arguments, but got {%s}", len(arguments)) + return full_table_names - db_name: str = arguments[1] - for source in token_dict[data_access_function]: - source_dict: Dict[str, Any] = token_dict[data_access_function][source] - for schema in source_dict["item_selectors"]: - schema_name: str = schema["items"]["Schema"] - table_name: str = schema["items"]["Item"] - full_table_names.append(f"{db_name}.{schema_name}.{table_name}") + db_name: str = arguments[1] + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Item"] + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") LOGGER.debug("PostgreSQL full-table-names = %s", full_table_names) @@ -337,8 +438,10 @@ def two_level_access_pattern(self, token_dict: Dict[str, Any]) -> List[str]: class PostgresTableFullNameCreator(DefaultTwoStepDataAccessSources): - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - return self.two_level_access_pattern(token_dict) + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: + return self.two_level_access_pattern(data_access_func_detail) def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.POSTGRES_SQL.value @@ -348,13 +451,13 @@ class MSSqlTableFullNameCreator(DefaultTwoStepDataAccessSources): def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: full_table_names: List[str] = [] - data_access_dict: Dict[str, Any] = 
list(token_dict.values())[0] - arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( - tree_function.token_values(data_access_dict["arg_list"]) + tree_function.token_values(data_access_func_detail.arg_list) ), char='"', ) @@ -362,7 +465,7 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: if len(arguments) == 2: # It is regular case of MS-SQL LOGGER.debug("Handling with regular case") - return self.two_level_access_pattern(token_dict) + return self.two_level_access_pattern(data_access_func_detail) if len(arguments) >= 4 and arguments[2] != "Query": LOGGER.debug("Unsupported case is found. Second index is not the Query") @@ -380,6 +483,7 @@ def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: full_table_names.append( f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" ) + LOGGER.debug("MS-SQL full-table-names %s", full_table_names) return full_table_names @@ -400,35 +504,31 @@ def _get_db_name(self, value: str) -> Optional[str]: return db_name - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: full_table_names: List[str] = [] - LOGGER.debug("Processing Oracle token-dict %s", token_dict) + LOGGER.debug( + "Processing Oracle data-access function detail %s", data_access_func_detail + ) - for data_access_function in token_dict: - arguments: List[str] = tree_function.remove_whitespaces_from_list( - tree_function.token_values(token_dict[data_access_function]["arg_list"]) - ) - # delete arg_list as we consumed it and don't want to process it in next step - del token_dict[data_access_function]["arg_list"] - - for source in token_dict[data_access_function]: - source_dict: Dict[str, Any] = token_dict[data_access_function][source] - - db_name: Optional[str] = self._get_db_name(arguments[0]) - if db_name is None: - return full_table_names - - for schema in source_dict["item_selectors"]: - schema_name: str = schema["items"]["Schema"] - for item_selectors in source_dict[schema["assigned_to"]]: - for item_selector in source_dict[schema["assigned_to"]][ - item_selectors - ]: - table_name: str = item_selector["items"]["Name"] - full_table_names.append( - f"{db_name}.{schema_name}.{table_name}" - ) + arguments: List[str] = tree_function.remove_whitespaces_from_list( + tree_function.token_values(data_access_func_detail.arg_list) + ) + + db_name: Optional[str] = self._get_db_name(arguments[0]) + if db_name is None: + return full_table_names + + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor + ).items["Schema"] + table_name: str = cast( + IdentifierAccessor, + cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, + ).items["Name"] + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") return full_table_names @@ -437,41 +537,39 @@ class SnowflakeTableFullNameCreator(AbstractTableFullNameCreator): def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.SNOWFLAKE.value - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: - full_table_names: List[str] = [] + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: - LOGGER.debug("Processing Snowflake token-dict %s", token_dict) + LOGGER.debug("Processing Snowflake function detail %s", data_access_func_detail) + # First is database name + db_name: str = 
data_access_func_detail.identifier_accessor.items["Name"] # type: ignore + # Second is schema name + schema_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next # type: ignore + ).items["Name"] + # Third is table name + table_name: str = cast( + IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore + ).items["Name"] - data_access_dict: Dict[str, Any] = list(token_dict.values())[0] - del data_access_dict["arg_list"] + full_table_name: str = f"{db_name}.{schema_name}.{table_name}" - for source in data_access_dict: - for db_its in data_access_dict[source]["item_selectors"]: - db_name: str = db_its["items"]["Name"] - for schema_its in data_access_dict[source][db_its["assigned_to"]][ - "item_selectors" - ]: - schema_name: str = schema_its["items"]["Name"] - for table_its in data_access_dict[source][db_its["assigned_to"]][ - schema_its["assigned_to"] - ]["item_selectors"]: - table_name: str = table_its["items"]["Name"] - full_table_names.append(f"{db_name}.{schema_name}.{table_name}") + LOGGER.debug("Snowflake full-table-name %s", full_table_name) - LOGGER.debug("Snowflake full-table-name %s", full_table_names) - - return full_table_names + return [full_table_name] class NativeQueryTableFullNameCreator(AbstractTableFullNameCreator): def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.SNOWFLAKE.value - def get_full_table_names(self, token_dict: Dict[str, Any]) -> List[str]: + def get_full_table_names( + self, data_access_func_detail: DataAccessFunctionDetail + ) -> List[str]: full_table_names: List[str] = [] - data_access_dict: Dict[str, Any] = list(token_dict.values())[0] t1: Tree = cast( - Tree, tree_function.first_arg_list_func(data_access_dict["arg_list"]) + Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) ) flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py index b6ab6b5261cf3..aac946d9b7987 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -156,3 +156,4 @@ def flat_argument_list(tree: Tree) -> List[Tree]: first_identifier_func = partial(get_first_rule, rule="identifier") first_invoke_expression_func = partial(get_first_rule, rule="invoke_expression") first_type_expression_func = partial(get_first_rule, rule="type_expression") +first_list_expression_func = partial(get_first_rule, rule="list_expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule index 2f84d2cf6365f..fe48cad5d08db 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi-lexical-grammar.rule @@ -428,7 +428,7 @@ each_expression_body: function_body let_expression: "let" NEWLINE WS_INLINE? variable_list WS_INLINE? NEWLINE? in_expression -in_expression: "in" NEWLINE? WS_INLINE NEWLINE? expression +in_expression: "in" NEWLINE? WS_INLINE? NEWLINE? expression variable_list: variable | variable NEWLINE? WS_INLINE? "," NEWLINE? WS_INLINE? 
variable_list diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index cea52c6703bb1..8074f3ff9a532 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -27,6 +27,7 @@ 'let\n Source = PostgreSQL.Database("localhost" , "mics" ),\n public_order_date = Source{[Schema="public",Item="order_date"]}[Data] \n in \n public_order_date', 'let\n Source = Oracle.Database("localhost:1521/salesdb.GSLAB.COM", [HierarchicalNavigation=true]), HR = Source{[Schema="HR"]}[Data], EMPLOYEES1 = HR{[Name="EMPLOYEES"]}[Data] \n in EMPLOYEES1', 'let\n Source = Sql.Database("localhost", "library"),\n dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data]\n in dbo_book_issue', + 'let\n Source = Snowflake.Databases("xaa48144.snowflakecomputing.com","GSL_TEST_WH",[Role="ACCOUNTADMIN"]),\n GSL_TEST_DB_Database = Source{[Name="GSL_TEST_DB",Kind="Database"]}[Data],\n PUBLIC_Schema = GSL_TEST_DB_Database{[Name="PUBLIC",Kind="Schema"]}[Data],\n SALES_FORECAST_Table = PUBLIC_Schema{[Name="SALES_FORECAST",Kind="Table"]}[Data],\n SALES_ANALYST_Table = PUBLIC_Schema{[Name="SALES_ANALYST",Kind="Table"]}[Data],\n RESULT = Table.Combine({SALES_FORECAST_Table, SALES_ANALYST_Table})\n\nin\n RESULT', ] @@ -285,5 +286,58 @@ def test_native_query_disabled(): data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, native_query_enabled=False ) - assert len(data_platform_tables) == 0 + + +def test_multi_source_table(): + + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=M_QUERIES[12], # 1st index has the native query + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter, native_query_enabled=False + ) + + assert len(data_platform_tables) == 2 + assert data_platform_tables[0].full_name == "mics.public.order_date" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + ) + + assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" + assert ( + data_platform_tables[1].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ) + + +def test_table_combine(): + table: PowerBiAPI.Table = PowerBiAPI.Table( + expression=M_QUERIES[16], # 1st index has the native query + name="virtual_order_table", + full_name="OrderDataSet.virtual_order_table", + ) + + reporter = PowerBiDashboardSourceReport() + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, reporter + ) + + assert len(data_platform_tables) == 2 + assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" + assert ( + data_platform_tables[0].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ) + + assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" + assert ( + data_platform_tables[1].data_platform_pair.powerbi_data_platform_name + == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + ) From 1e2dc90db689fdbb5077690a8f6575638f44dd40 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 21:45:29 +0530 Subject: [PATCH 42/53] 
remove un-wanted code --- .../source/powerbi/m_query/resolver.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 592e937e1257c..da5740539fd05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -216,31 +216,6 @@ def _create_or_update_identifier_accessor( key_vs_value: Dict[str, Any], ) -> IdentifierAccessor: - # def create_item_selector(items: Dict[str, Any], _next: IdentifierAccessor): - # return ItemSelector( - # items=items, - # next=_next, - # ) - # - # def update_identifier_accessor(node: IdentifierAccessor, identifier: str, items: Dict[str, Any]) -> bool: - # flag: bool = False - # if node.identifier == identifier: - # node.item_selectors.append( - # create_item_selector( - # items=items - # ) - # ) - # return True - # - # for item_selector in node.item_selectors: - # if item_selector.next is None: - # continue - # flag = update_identifier_accessor(item_selector.next, identifier, items) - # if flag is True: - # break - # - # return flag - # It is first identifier_accessor if identifier_accessor is None: return IdentifierAccessor( From 6cb46caca1cf841c04b3d718eb7046bfab948c89 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 21:50:59 +0530 Subject: [PATCH 43/53] Add new line --- .../src/datahub/ingestion/source/powerbi/m_query/resolver.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index da5740539fd05..0dfac5767426b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -399,12 +399,15 @@ def two_level_access_pattern( return full_table_names db_name: str = arguments[1] + schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Schema"] + table_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") LOGGER.debug("PostgreSQL full-table-names = %s", full_table_names) @@ -499,10 +502,12 @@ def get_full_table_names( schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Schema"] + table_name: str = cast( IdentifierAccessor, cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + full_table_names.append(f"{db_name}.{schema_name}.{table_name}") return full_table_names From 63b9b07a076e56e50a3c209575bdb69fbff7ad04 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 22:13:41 +0530 Subject: [PATCH 44/53] review comments --- metadata-ingestion/docs/sources/powerbi/powerbi_pre.md | 2 +- .../src/datahub/ingestion/source/powerbi/proxy.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md index 24f7b92cf8998..c71c46700903f 100644 --- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md +++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md @@ -29,7 +29,7 @@ PowerBI Source supports M-Query expression for below 
listed PowerBI Data Sources 1. Snowflake 2. Oracle 3. PostgreSQL -4. MS-SQL +4. Microsoft SQL Server Native SQL query parsing is only supported for `Snowflake` data-source and only first table from `FROM` clause will be ingested as upstream table. Advance SQL construct like JOIN and SUB-QUERIES in `FROM` clause are not supported. diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py index a7e027551290a..e243b263c5da6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -269,7 +269,7 @@ def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: return users - def __get_report( + def _get_report( self, workspace_id: str, report_id: str ) -> Optional["PowerBiAPI.Report"]: """ @@ -527,7 +527,7 @@ def new_dataset_or_report(tile_instance: Any) -> dict: else None ), "report": ( - self.__get_report( + self._get_report( workspace_id=workspace.id, report_id=tile_instance.get("reportId"), ) From 6b7470c1a2706f1cf929fa9598b1e87b0c182062 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 22:14:35 +0530 Subject: [PATCH 45/53] Review comments --- .../src/datahub/ingestion/source/powerbi/proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py index e243b263c5da6..dc7c0dbfac0e1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -222,7 +222,7 @@ def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: users: List[PowerBiAPI.User] = [] if self.__config.extract_ownership is False: LOGGER.info( - "ExtractOwnership capabilities is disabled from configuration and hence returning empty users list" + "Extract ownership capabilities is disabled from configuration and hence returning empty users list" ) return users From b378151ad0e562ae17222d0b49675b17951fa9a4 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Mon, 26 Dec 2022 22:29:12 +0530 Subject: [PATCH 46/53] rename methods --- .../ingestion/source/powerbi/powerbi.py | 140 ++++++++++-------- 1 file changed, 75 insertions(+), 65 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 61119a8658d58..e3714ecd4eb7f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -114,7 +114,7 @@ def new_mcp( aspect=aspect, ) - def __to_work_unit( + def _to_work_unit( self, mcp: MetadataChangeProposalWrapper ) -> EquableMetadataWorkUnit: return Mapper.EquableMetadataWorkUnit( @@ -126,7 +126,67 @@ def __to_work_unit( mcp=mcp, ) - def __to_datahub_dataset( + def extract_lineage( + self, table: PowerBiAPI.Table, ds_urn: str + ) -> List[MetadataChangeProposalWrapper]: + mcps: List[MetadataChangeProposalWrapper] = [] + + upstreams: List[UpstreamClass] = [] + upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table, self.__reporter + ) + + for upstream_table in upstream_tables: + if ( + upstream_table.data_platform_pair.powerbi_data_platform_name + not in self.__config.dataset_type_mapping.keys() + ): + LOGGER.debug("Skipping upstream 
table for %s", ds_urn) + continue + + platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[ + upstream_table.data_platform_pair.powerbi_data_platform_name + ] + + platform_name: str = ( + upstream_table.data_platform_pair.datahub_data_platform_name + ) + + platform_instance_name: Optional[str] = None + platform_env: str = DEFAULT_ENV + # Determine if PlatformDetail is provided + if isinstance(platform, PlatformDetail): + platform_instance_name = cast( + PlatformDetail, platform + ).platform_instance + platform_env = cast(PlatformDetail, platform).env + + upstream_urn = builder.make_dataset_urn_with_platform_instance( + platform=platform_name, + platform_instance=platform_instance_name, + env=platform_env, + name=self.lineage_urn_to_lowercase(upstream_table.full_name), + ) + + upstream_table_class = UpstreamClass( + upstream_urn, + DatasetLineageTypeClass.TRANSFORMED, + ) + upstreams.append(upstream_table_class) + + if len(upstreams) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + mcp = MetadataChangeProposalWrapper( + entityType="dataset", + changeType=ChangeTypeClass.UPSERT, + entityUrn=ds_urn, + aspect=upstream_lineage, + ) + mcps.append(mcp) + + return mcps + + def to_datahub_dataset( self, dataset: Optional[PowerBiAPI.PowerBIDataset] ) -> List[MetadataChangeProposalWrapper]: """ @@ -173,61 +233,11 @@ def __to_datahub_dataset( dataset_mcps.extend([info_mcp, status_mcp]) if self.__config.extract_lineage is True: - # Check if upstreams table is available, parse them and create dataset URN for each upstream table - upstreams: List[UpstreamClass] = [] - upstream_tables: List[ - resolver.DataPlatformTable - ] = parser.get_upstream_tables(table, self.__reporter) - for upstream_table in upstream_tables: - if ( - upstream_table.data_platform_pair.powerbi_data_platform_name - not in self.__config.dataset_type_mapping.keys() - ): - LOGGER.debug("Skipping upstream table for %s", ds_urn) - continue - - platform: Union[ - str, PlatformDetail - ] = self.__config.dataset_type_mapping[ - upstream_table.data_platform_pair.powerbi_data_platform_name - ] - platform_name: str = ( - upstream_table.data_platform_pair.datahub_data_platform_name - ) - platform_instance_name: Optional[str] = None - platform_env: str = DEFAULT_ENV - # Determine if PlatformDetail is provided - if isinstance(platform, PlatformDetail): - platform_instance_name = cast( - PlatformDetail, platform - ).platform_instance - platform_env = cast(PlatformDetail, platform).env - - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=platform_name, - platform_instance=platform_instance_name, - env=platform_env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table_class) - - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) - mcp = MetadataChangeProposalWrapper( - entityType="dataset", - changeType=ChangeTypeClass.UPSERT, - entityUrn=ds_urn, - aspect=upstream_lineage, - ) - dataset_mcps.extend([mcp]) + dataset_mcps.extend(self.extract_lineage(table, ds_urn)) return dataset_mcps - def __to_datahub_chart( + def to_datahub_chart_mcp( self, tile: PowerBiAPI.Tile, ds_mcps: List[MetadataChangeProposalWrapper] ) -> List[MetadataChangeProposalWrapper]: """ @@ -306,7 +316,7 @@ def to_urn_set(self, mcps: List[MetadataChangeProposalWrapper]) -> List[str]: ] ) - def __to_datahub_dashboard( + def 
to_datahub_dashboard_mcp(
         self,
         dashboard: PowerBiAPI.Dashboard,
         chart_mcps: List[MetadataChangeProposalWrapper],
@@ -485,9 +495,9 @@ def to_datahub_chart(
             if tile is None:
                 continue
             # First convert the dataset to MCP, because dataset mcp is used in input attribute of chart mcp
-            dataset_mcps = self.__to_datahub_dataset(tile.dataset)
+            dataset_mcps = self.to_datahub_dataset(tile.dataset)
             # Now convert tile to chart MCP
-            chart_mcp = self.__to_datahub_chart(tile, dataset_mcps)
+            chart_mcp = self.to_datahub_chart_mcp(tile, dataset_mcps)
 
             ds_mcps.extend(dataset_mcps)
             chart_mcps.extend(chart_mcp)
@@ -514,7 +524,7 @@ def to_datahub_work_units(
         # Lets convert dashboard to datahub dashboard
         dashboard_mcps: List[
             MetadataChangeProposalWrapper
-        ] = self.__to_datahub_dashboard(dashboard, chart_mcps, user_mcps)
+        ] = self.to_datahub_dashboard_mcp(dashboard, chart_mcps, user_mcps)
 
         # Now add MCPs in sequence
         mcps.extend(ds_mcps)
@@ -523,11 +533,11 @@ def to_datahub_work_units(
         mcps.extend(dashboard_mcps)
 
         # Convert MCP to work_units
-        work_units = map(self.__to_work_unit, mcps)
+        work_units = map(self._to_work_unit, mcps)
         # Return set of work_unit
         return deduplicate_list([wu for wu in work_units if wu is not None])
 
-    def __pages_to_chart(
+    def pages_to_chart(
         self, pages: List[PowerBiAPI.Page], ds_mcps: List[MetadataChangeProposalWrapper]
     ) -> List[MetadataChangeProposalWrapper]:
 
@@ -588,7 +598,7 @@ def to_chart_mcps(
 
         return chart_mcps
 
-    def __report_to_dashboard(
+    def report_to_dashboard(
         self,
         workspace_name: str,
         report: PowerBiAPI.Report,
@@ -701,11 +711,11 @@ def report_to_datahub_work_units(
         # Convert user to CorpUser
         user_mcps = self.to_datahub_users(report.users)
         # Convert pages to charts. A report has single dataset and same dataset used in pages to create visualization
-        ds_mcps = self.__to_datahub_dataset(report.dataset)
-        chart_mcps = self.__pages_to_chart(report.pages, ds_mcps)
+        ds_mcps = self.to_datahub_dataset(report.dataset)
+        chart_mcps = self.pages_to_chart(report.pages, ds_mcps)
 
         # Let's convert report to datahub dashboard
-        report_mcps = self.__report_to_dashboard(
+        report_mcps = self.report_to_dashboard(
             workspace.name, report, chart_mcps, user_mcps
         )
 
@@ -716,7 +726,7 @@ def report_to_datahub_work_units(
         mcps.extend(report_mcps)
 
         # Convert MCP to work_units
-        work_units = map(self.__to_work_unit, mcps)
+        work_units = map(self._to_work_unit, mcps)
 
         return work_units
 

From 0bec288bfd62be1eac160e22cfbb0b0654992dd1 Mon Sep 17 00:00:00 2001
From: MohdSiddique Bagwan
Date: Tue, 27 Dec 2022 23:15:30 +0530
Subject: [PATCH 47/53] updated doc

---
 .../docs/sources/powerbi/powerbi_pre.md       | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
index c71c46700903f..d6655dadc2642 100644
--- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
+++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
@@ -20,6 +20,11 @@ See the
 If Tile is created from report then Chart.externalUrl is set to Report.webUrl.
 
 ## Lineage
+
+This source extracts table lineage for tables present in Power BI Datasets. Let's consider a PowerBI Dataset `SALES_REPORT` with a PostgreSQL database configured as the data-source of the `SALES_REPORT` dataset.
+
+If the `SALES_REPORT` PowerBI Dataset has a table `SALES_ANALYSIS` which is backed by the `SALES_ANALYSIS_VIEW` view of that PostgreSQL database, then `SALES_ANALYSIS_VIEW` will appear as the upstream dataset of the `SALES_ANALYSIS` table.
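+
+For illustration only, the M-Query expression backing such a table usually has the shape sketched below; the host, database, schema, and view names here are placeholders rather than values taken from a real deployment:
+
+```shell
+let
+Source = PostgreSQL.Database("localhost", "sales_db"),
+SALES_ANALYSIS = Source{[Schema="public",Item="SALES_ANALYSIS_VIEW"]}[Data]
+in
+SALES_ANALYSIS
+```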
+
 
 You can control table lineage ingestion using `extract_lineage` configuration parameter, by default it is set to `true`.
 
 PowerBI Source extracts the lineage information by parsing PowerBI M-Query expression.
@@ -62,3 +67,31 @@
 in
   #"Added Conditional Column"
 ```
+## M-Query Patterns Supported For Lineage Extraction
+Let's consider an M-Query that combines two PostgreSQL tables. Such an M-Query can be written using either of the patterns below.
+
+**Pattern-1**
+
+```shell
+let
+Source = PostgreSQL.Database("localhost", "book_store"),
+book_date = Source{[Schema="public",Item="book"]}[Data],
+issue_history = Source{[Schema="public",Item="issue_history"]}[Data],
+combine_result = Table.Combine({book_date, issue_history})
+in
+combine_result
+```
+
+**Pattern-2**
+
+```shell
+let
+Source = PostgreSQL.Database("localhost", "book_store"),
+combine_result = Table.Combine({Source{[Schema="public",Item="book"]}[Data], Source{[Schema="public",Item="issue_history"]}[Data]})
+in
+combine_result
+```
+
+`Pattern-2` is *not* supported for upstream table lineage extraction because it passes nested item-selectors i.e. {Source{[Schema="public",Item="book"]}[Data], Source{[Schema="public",Item="issue_history"]}[Data]} as the argument to the M-Query table function i.e. Table.Combine
+
+`Pattern-1` is supported because it first assigns each table from the schema to a variable and then uses those variables in the M-Query table function i.e. Table.Combine

From 7ce75dc399ba6e88d9f2143b2a3edface8bcdd93 Mon Sep 17 00:00:00 2001
From: MohdSiddique Bagwan
Date: Wed, 28 Dec 2022 12:13:01 +0530
Subject: [PATCH 48/53] support join in native query

---
 .../powerbi/m_query/native_sql_parser.py      | 17 +++---
 .../source/powerbi/m_query/parser.py          |  1 -
 .../source/powerbi/m_query/resolver.py        |  2 +
 .../ingestion/source/powerbi/powerbi.py       |  6 +-
 .../powerbi/golden_test_lineage.json          | 58 ++++++++++++++++++-
 .../tests/integration/powerbi/test_powerbi.py | 13 +++++
 6 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
index e64c3b77cff93..cfb3b4769722e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py
@@ -18,10 +18,8 @@ def remove_special_characters(native_query: str) -> str:
 def get_tables(native_query: str) -> List[str]:
     native_query = remove_special_characters(native_query)
     LOGGER.debug("Processing query = %s", native_query)
-    # As per current use-case, we are extracting only single table from "from"
     tables: List[str] = []
     parsed = sqlparse.parse(native_query)[0]
-
     tokens: List[sqlparse.sql.Token] = list(parsed.tokens)
     length: int = len(tokens)
     from_index: int = -1
@@ -34,17 +32,16 @@ def get_tables(native_query: str) -> List[str]:
             from_index = index + 1
             break
 
-    table_name = None
-
-    while from_index < length:
+    # Collect all identifiers after the FROM clause until we reach the end or encounter a WHERE clause
+    while (
+        from_index < length
+        and isinstance(tokens[from_index], sqlparse.sql.Where) is not True
+    ):
         LOGGER.debug("%s=%s", tokens[from_index].value, tokens[from_index].ttype)
         LOGGER.debug("Type=%s", type(tokens[from_index]))
         if isinstance(tokens[from_index], sqlparse.sql.Identifier):
-            table_name = tokens[from_index].value
-            break
+            # Split on the as keyword, take the table name from the 0th position, and 
strip any spaces + tables.append(tokens[from_index].value.split("as")[0].strip()) from_index = from_index + 1 - if table_name is not None: - tables.append(table_name) - return tables diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 35af1fb89f3b2..1731fa250e0dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -43,7 +43,6 @@ def get_upstream_tables( try: parse_tree: Tree = _parse_expression(table.expression) - print(parse_tree.pretty()) valid, message = validator.validate_parse_tree( parse_tree, native_query_enabled=native_query_enabled ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 0dfac5767426b..77370a4f07727 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -590,6 +590,8 @@ def get_full_table_names( "Skipping table (%s) as it is not as per full_table_name format", table, ) + continue + full_table_names.append(table) return full_table_names diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index e3714ecd4eb7f..a6f8a5c211cf7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -141,7 +141,11 @@ def extract_lineage( upstream_table.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): - LOGGER.debug("Skipping upstream table for %s", ds_urn) + LOGGER.debug( + "Skipping upstream table for %s. 
The platform (%s) is not part of dataset_type_mapping", + ds_urn, + upstream_table.data_platform_pair.powerbi_data_platform_name, + ) continue platform: Union[str, PlatformDetail] = self.__config.dataset_type_mapping[ diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json index d59d38b7d17a9..14a81cedf6db1 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -111,6 +111,62 @@ "runId": "powerbi-lineage-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.GSL_TEST_DB.PUBLIC.SALES_ANALYST,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.GSL_TEST_DB.PUBLIC.SALES_FORECAST,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "value": "{\"upstreams\": [{\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.GSL_TEST_DB.PUBLIC.SALES_ANALYST,PROD)\", \"type\": \"TRANSFORMED\"}, {\"auditStamp\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"dataset\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,sn-2.GSL_TEST_DB.PUBLIC.SALES_FORECAST,PROD)\", \"type\": \"TRANSFORMED\"}]}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-lineage-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -369,7 +425,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": 
\"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index d6ae1b033b10c..4afc112f1b2d1 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -316,6 +316,19 @@ def register_mock_api(request_mock): } ], }, + { + "name": "snowflake native-query-with-join", + "source": [ + { + "expression": 'let\n Source = Value.NativeQuery(Snowflake.Databases("xaa48144.snowflakecomputing.com","GSL_TEST_WH",[Role="ACCOUNTADMIN"]){[Name="GSL_TEST_DB"]}[Data], "select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, \'mo\')", null, [EnableFolding=true])\nin\n Source', + } + ], + "datasourceUsages": [ + { + "datasourceInstanceId": "DCE90B40-84D6-467A-9A5C-648E830E72D3", + } + ], + }, { "name": "job-history", "source": [ From 383697e900abab7820181eac9870aa879b524168 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 28 Dec 2022 12:29:30 +0530 Subject: [PATCH 49/53] integration test fix for native query --- .../golden_test_disabled_ownership.json | 30 ++++++++- .../powerbi/golden_test_ingest.json | 32 +++++++++- .../golden_test_lower_case_urn_ingest.json | 30 ++++++++- .../powerbi/golden_test_report.json | 62 ++++++++++++++++++- .../golden_test_scan_all_workspaces.json | 
30 ++++++++- 5 files changed, 176 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index 528477ca3d945..4590fef410601 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -83,6 +83,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -201,7 +229,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": 
\"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index 4646baa3ad141..c0568fd7385f4 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -83,6 +83,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -285,7 +313,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": 
\"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -433,4 +461,4 @@ "runId": "powerbi-test" } } -] +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json index 2eabb5dcc45f1..fdb243a0e727d 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json @@ -83,6 +83,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -285,7 +313,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_testtable,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", 
\"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_testtable,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index 9092d5bc6ea7f..9e0a4f348d00d 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -83,6 +83,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -285,7 +313,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": 
\"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -517,6 +545,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -663,7 +719,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"0\"}, \"title\": \"ReportSection\", \"description\": \"Regional Sales Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, 
{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { @@ -691,7 +747,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"order\": \"1\"}, \"title\": \"ReportSection1\", \"description\": \"Geographic Analysis\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json index 255a907e39b8f..14e47301af7a0 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json @@ -83,6 +83,34 @@ "runId": "powerbi-test" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "value": "{\"customProperties\": {}, \"name\": \"snowflake native-query-with-join\", \"description\": \"snowflake native-query-with-join\", \"tags\": []}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + 
"value": "{\"removed\": false}", + "contentType": "application/json" + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", @@ -201,7 +229,7 @@ "changeType": "UPSERT", "aspectName": "chartInfo", "aspect": { - "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", + "value": "{\"customProperties\": {\"datasetId\": \"05169CD2-E713-41E6-9600-1D8066D95445\", \"reportId\": \"\", \"datasetWebUrl\": \"http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details\", \"createdFrom\": \"Dataset\"}, \"title\": \"test_tile\", \"description\": \"test_tile\", \"lastModified\": {\"created\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}, \"lastModified\": {\"time\": 0, \"actor\": \"urn:li:corpuser:unknown\"}}, \"inputs\": [{\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)\"}, {\"string\": \"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)\"}]}", "contentType": "application/json" }, "systemMetadata": { From 1efcb98a88c04903cc487418c9285812cc00d000 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 28 Dec 2022 12:29:54 +0530 Subject: [PATCH 50/53] native sql query unit test --- .../powerbi/test_native_sql_parser.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py diff --git a/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py new file mode 100644 index 0000000000000..53e184515c1d8 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/test_native_sql_parser.py @@ -0,0 +1,21 @@ +from typing import List + +from datahub.ingestion.source.powerbi.m_query import native_sql_parser + + +def test_join(): + query: str = "select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on 
A.name = B.name where startswith(A.name, 'mo')" + tables: List[str] = native_sql_parser.get_tables(query) + + assert len(tables) == 2 + assert tables[0] == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" + assert tables[1] == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" + + +def test_simple_from(): + query: str = "SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" + + tables: List[str] = native_sql_parser.get_tables(query) + + assert len(tables) == 1 + assert tables[0] == "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4" From fd911105ebfacf4d867b11b9f800a1511109a8e8 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 28 Dec 2022 12:57:31 +0530 Subject: [PATCH 51/53] review comment --- .../source/powerbi/m_query/parser.py | 2 +- .../datahub/ingestion/source/powerbi/proxy.py | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 1731fa250e0dd..7ea28d5b579fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -38,7 +38,7 @@ def get_upstream_tables( native_query_enabled: bool = True, ) -> List[resolver.DataPlatformTable]: if table.expression is None: - reporter.report_warning(table.full_name, "Expression is none") + LOGGER.debug(table.full_name, "Expression is none") return [] try: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py index dc7c0dbfac0e1..f6998f57fc0b1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -752,26 +752,28 @@ def create_scan_job(): LOGGER.info("Scan id({})".format(id)) return id - def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: - """ - Poll the PowerBi service for workspace scan to complete - """ - minimum_sleep = 3 + def calculate_max_trial(minimum_sleep: int, timeout: int) -> int: if timeout < minimum_sleep: LOGGER.info( f"Setting timeout to minimum_sleep time {minimum_sleep} seconds" ) timeout = minimum_sleep - max_trial = timeout // minimum_sleep + return timeout // minimum_sleep + + def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: + """ + Poll the PowerBi service for workspace scan to complete + """ + minimum_sleep = 3 + max_trial: int = calculate_max_trial(minimum_sleep, timeout) LOGGER.info(f"Max trial {max_trial}") + scan_get_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_GET] scan_get_endpoint = scan_get_endpoint.format( POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id ) - LOGGER.info(f"Hitting URL={scan_get_endpoint}") - trail = 1 while True: LOGGER.info(f"Trial = {trail}") @@ -781,9 +783,7 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: ) if res.status_code != 200: message = f"API({scan_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - LOGGER.warning(message) - raise ConnectionError(message) if res.json()["status"].upper() == "Succeeded".upper(): @@ -792,6 +792,7 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: if trail == max_trial: break + LOGGER.info(f"Sleeping for 
{minimum_sleep} seconds") sleep(minimum_sleep) trail += 1 From 97b8b7f8ad57941e212e3c3b4109b32a28ea1f51 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Wed, 28 Dec 2022 13:36:30 +0530 Subject: [PATCH 52/53] updated config --- .../src/datahub/ingestion/source/powerbi/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index de9345fff18b4..4892fa37e2fe9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -1,6 +1,6 @@ import logging from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Union, Optional +from typing import Dict, List, Optional, Union import pydantic from pydantic import validator @@ -103,8 +103,8 @@ class PowerBiAPIConfig(EnvBasedSourceConfigBase): tenant_id: str = pydantic.Field(description="PowerBI tenant identifier") # PowerBi workspace identifier workspace_id: Optional[str] = pydantic.Field( - description="[deprecated] Use workspace_id_pattern instead", - default=None) + description="[deprecated] Use workspace_id_pattern instead", default=None + ) # PowerBi workspace identifier workspace_id_pattern: AllowDenyPattern = pydantic.Field( default=AllowDenyPattern.allow_all(), From 9f480369f149b11bd2575dd4bc79391528aa3a19 Mon Sep 17 00:00:00 2001 From: MohdSiddique Bagwan Date: Thu, 29 Dec 2022 13:25:29 +0530 Subject: [PATCH 53/53] review comments --- .../ingestion/source/powerbi/config.py | 8 +- .../powerbi/m_query/native_sql_parser.py | 10 +- .../source/powerbi/m_query/parser.py | 32 ++-- .../source/powerbi/m_query/resolver.py | 66 ++++---- .../source/powerbi/m_query/tree_function.py | 10 +- .../source/powerbi/m_query/validator.py | 28 +--- .../ingestion/source/powerbi/powerbi.py | 36 ++--- .../datahub/ingestion/source/powerbi/proxy.py | 150 +++++++++--------- 8 files changed, 163 insertions(+), 177 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 4892fa37e2fe9..c3115aae60395 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -11,7 +11,7 @@ from datahub.configuration.source_common import DEFAULT_ENV, EnvBasedSourceConfigBase from datahub.ingestion.api.source import SourceReport -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class Constant: @@ -88,7 +88,7 @@ def report_charts_dropped(self, view: str) -> None: @dataclass class PlatformDetail: - platform_instance: str = pydantic.Field( + platform_instance: Optional[str] = pydantic.Field( default=None, description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source of particular platform", ) @@ -174,14 +174,14 @@ def workspace_id_backward_compatibility(cls, values: Dict) -> Dict: workspace_id_pattern = values.get("workspace_id_pattern") if workspace_id_pattern == AllowDenyPattern.allow_all() and workspace_id: - LOGGER.warning( + logger.warning( "workspace_id_pattern is not set but workspace_id is set, setting workspace_id as workspace_id_pattern. workspace_id will be deprecated, please use workspace_id_pattern instead." 
) values["workspace_id_pattern"] = AllowDenyPattern( allow=[f"^{workspace_id}$"] ) elif workspace_id_pattern != AllowDenyPattern.allow_all() and workspace_id: - LOGGER.warning( + logger.warning( "workspace_id will be ignored in favour of workspace_id_pattern. workspace_id will be deprecated, please use workspace_id_pattern only." ) values.pop("workspace_id") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index cfb3b4769722e..5e78048629403 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -5,7 +5,7 @@ SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] -LOGGER = logging.getLogger() +logger = logging.getLogger() def remove_special_characters(native_query: str) -> str: @@ -17,14 +17,14 @@ def remove_special_characters(native_query: str) -> str: def get_tables(native_query: str) -> List[str]: native_query = remove_special_characters(native_query) - LOGGER.debug("Processing query = %s", native_query) + logger.debug("Processing query = %s", native_query) tables: List[str] = [] parsed = sqlparse.parse(native_query)[0] tokens: List[sqlparse.sql.Token] = list(parsed.tokens) length: int = len(tokens) from_index: int = -1 for index, token in enumerate(tokens): - LOGGER.debug("%s=%s", token.value, token.ttype) + logger.debug("%s=%s", token.value, token.ttype) if ( token.value.lower().strip() == "from" and str(token.ttype) == "Token.Keyword" @@ -37,8 +37,8 @@ def get_tables(native_query: str) -> List[str]: from_index < length and isinstance(tokens[from_index], sqlparse.sql.Where) is not True ): - LOGGER.debug("%s=%s", tokens[from_index].value, tokens[from_index].ttype) - LOGGER.debug("Type=%s", type(tokens[from_index])) + logger.debug("%s=%s", tokens[from_index].value, tokens[from_index].ttype) + logger.debug("Type=%s", type(tokens[from_index])) if isinstance(tokens[from_index], sqlparse.sql.Identifier): # Split on as keyword and collect the table name from 0th position. 
strip any spaces tables.append(tokens[from_index].value.split("as")[0].strip()) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 7ea28d5b579fa..7f607b8e82005 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -1,6 +1,6 @@ import importlib.resources as pkg_resource import logging -from typing import List, cast +from typing import List, Optional, cast import lark from lark import Lark, Tree @@ -9,25 +9,35 @@ from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) +lark_parser: Optional[Lark] = None + + +def get_lark_parser(): + global lark_parser + if lark_parser is not None: + return lark_parser -def _parse_expression(expression: str) -> Tree: # Read lexical grammar as text grammar: str = pkg_resource.read_text( "datahub.ingestion.source.powerbi", "powerbi-lexical-grammar.rule" ) - # Create lark parser for the grammar text - lark_parser = Lark(grammar, start="let_expression", regex=True) + return Lark(grammar, start="let_expression", regex=True) + + +def _parse_expression(expression: str) -> Tree: + lark_parser: Lark = get_lark_parser() parse_tree: Tree = lark_parser.parse(expression) - LOGGER.debug("Parse Tree") + logger.debug("Parsing expression = %s", expression) + if ( - LOGGER.level == logging.DEBUG + logger.level == logging.DEBUG ): # Guard condition to avoid heavy pretty() function call - LOGGER.debug(parse_tree.pretty()) + logger.debug(parse_tree.pretty()) return parse_tree @@ -38,7 +48,7 @@ def get_upstream_tables( native_query_enabled: bool = True, ) -> List[resolver.DataPlatformTable]: if table.expression is None: - LOGGER.debug(table.full_name, "Expression is none") + logger.debug(table.full_name, "Expression is none") return [] try: @@ -47,11 +57,11 @@ def get_upstream_tables( parse_tree, native_query_enabled=native_query_enabled ) if valid is False: - LOGGER.debug("Validation failed: %s", cast(str, message)) + logger.debug("Validation failed: %s", cast(str, message)) reporter.report_warning(table.full_name, cast(str, message)) return [] except lark.exceptions.UnexpectedCharacters as e: - LOGGER.debug(f"Fail to parse expression {table.expression}", exc_info=e) + logger.debug(f"Fail to parse expression {table.expression}", exc_info=e) reporter.report_warning( table.full_name, f"UnSupported expression = {table.expression}" ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 77370a4f07727..b3fe9d31026be 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -14,7 +14,7 @@ ) from datahub.ingestion.source.powerbi.proxy import PowerBiAPI -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) @dataclass @@ -92,16 +92,16 @@ def get_item_selector_tokens( expression_tree ) if item_selector is None: - LOGGER.debug("Item Selector not found in tree") - LOGGER.debug(expression_tree.pretty()) + logger.debug("Item Selector not found in tree") + logger.debug(expression_tree.pretty()) return None, None identifier_tree: Optional[Tree] = 
tree_function.first_identifier_func( expression_tree ) if identifier_tree is None: - LOGGER.debug("Identifier not found in tree") - LOGGER.debug(item_selector.pretty()) + logger.debug("Identifier not found in tree") + logger.debug(item_selector.pretty()) return None, None # remove whitespaces and quotes from token @@ -126,7 +126,7 @@ def get_argument_list(invoke_expression: Tree) -> Optional[Tree]: invoke_expression ) if argument_list is None: - LOGGER.debug("First argument-list rule not found in input tree") + logger.debug("First argument-list rule not found in input tree") return None return argument_list @@ -162,7 +162,7 @@ def _process_invoke_expression( ) if first_arg_tree is None: - LOGGER.debug( + logger.debug( "Function invocation without argument in expression = %s", invoke_expression.pretty(), ) @@ -179,11 +179,11 @@ def _process_invoke_expression( first_argument ) - LOGGER.debug("Extracting token from tree %s", first_argument.pretty()) + logger.debug("Extracting token from tree %s", first_argument.pretty()) if expression is None: expression = tree_function.first_type_expression_func(first_argument) if expression is None: - LOGGER.debug( + logger.debug( "Either list_expression or type_expression is not found = %s", invoke_expression.pretty(), ) @@ -197,7 +197,7 @@ def _process_invoke_expression( tree_function.token_values(expression) ) - LOGGER.debug("Tokens in invoke expression are %s", tokens) + logger.debug("Tokens in invoke expression are %s", tokens) return tokens def _process_item_selector_expression( @@ -267,8 +267,8 @@ def internal( # Any expression after "=" sign of variable-statement rh_tree: Optional[Tree] = tree_function.first_expression_func(v_statement) if rh_tree is None: - LOGGER.debug("Expression tree not found") - LOGGER.debug(v_statement.pretty()) + logger.debug("Expression tree not found") + logger.debug(v_statement.pretty()) return None invoke_expression: Optional[ @@ -302,7 +302,7 @@ def internal( rh_tree ) if new_identifier is None or key_vs_value is None: - LOGGER.debug("Required information not found in rh_tree") + logger.debug("Required information not found in rh_tree") return None new_identifier_accessor: IdentifierAccessor = ( self._create_or_update_identifier_accessor( @@ -340,7 +340,7 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: f_detail.data_access_function_name ) if supported_resolver is None: - LOGGER.debug( + logger.debug( "Resolver not found for the data-access-function %s", f_detail.data_access_function_name, ) @@ -383,7 +383,7 @@ def two_level_access_pattern( ) -> List[str]: full_table_names: List[str] = [] - LOGGER.debug( + logger.debug( "Processing PostgreSQL data-access function detail %s", data_access_func_detail, ) @@ -395,7 +395,7 @@ def two_level_access_pattern( ) if len(arguments) != 2: - LOGGER.debug("Expected 2 arguments, but got {%s}", len(arguments)) + logger.debug("Expected 2 arguments, but got {%s}", len(arguments)) return full_table_names db_name: str = arguments[1] @@ -410,7 +410,11 @@ def two_level_access_pattern( full_table_names.append(f"{db_name}.{schema_name}.{table_name}") - LOGGER.debug("PostgreSQL full-table-names = %s", full_table_names) + logger.debug( + "Platform(%s) full-table-names = %s", + self.get_platform_pair().datahub_data_platform_name, + full_table_names, + ) return full_table_names @@ -442,11 +446,11 @@ def get_full_table_names( if len(arguments) == 2: # It is regular case of MS-SQL - LOGGER.debug("Handling with regular case") + logger.debug("Handling with regular case") 
            return self.two_level_access_pattern(data_access_func_detail)
 
         if len(arguments) >= 4 and arguments[2] != "Query":
-            LOGGER.debug("Unsupported case is found. Second index is not the Query")
+            logger.debug("Unsupported case is found. Second index is not the Query")
             return full_table_names
 
         db_name: str = arguments[1]
@@ -462,7 +466,7 @@ def get_full_table_names(
                 f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}"
             )
 
-        LOGGER.debug("MS-SQL full-table-names %s", full_table_names)
+        logger.debug("MS-SQL full-table-names %s", full_table_names)
 
         return full_table_names
 
@@ -475,7 +479,7 @@ def _get_db_name(self, value: str) -> Optional[str]:
         error_message: str = f"The target argument ({value}) should in the format of <host-name>:<port>/<db-name>[.<domain-name>]"
         splitter_result: List[str] = value.split("/")
         if len(splitter_result) != 2:
-            LOGGER.debug(error_message)
+            logger.debug(error_message)
             return None
 
         db_name = splitter_result[1].split(".")[0]
@@ -487,7 +491,7 @@ def get_full_table_names(
     ) -> List[str]:
         full_table_names: List[str] = []
 
-        LOGGER.debug(
+        logger.debug(
             "Processing Oracle data-access function detail %s", data_access_func_detail
         )
 
@@ -521,7 +525,7 @@ def get_full_table_names(
         self, data_access_func_detail: DataAccessFunctionDetail
     ) -> List[str]:
 
-        LOGGER.debug("Processing Snowflake function detail %s", data_access_func_detail)
+        logger.debug("Processing Snowflake function detail %s", data_access_func_detail)
         # First is database name
         db_name: str = data_access_func_detail.identifier_accessor.items["Name"]  # type: ignore
         # Second is schema name
@@ -535,7 +539,7 @@ def get_full_table_names(
 
         full_table_name: str = f"{db_name}.{schema_name}.{table_name}"
 
-        LOGGER.debug("Snowflake full-table-name %s", full_table_name)
+        logger.debug("Snowflake full-table-name %s", full_table_name)
 
         return [full_table_name]
 
@@ -554,11 +558,11 @@ def get_full_table_names(
         flat_argument_list: List[Tree] = tree_function.flat_argument_list(t1)
 
         if len(flat_argument_list) != 2:
-            LOGGER.debug(
+            logger.debug(
                 "Expecting 2 argument, actual argument count is %s",
                 len(flat_argument_list),
             )
-            LOGGER.debug("Flat argument list = %s", flat_argument_list)
+            logger.debug("Flat argument list = %s", flat_argument_list)
             return full_table_names
 
         data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list(
@@ -568,10 +572,10 @@ def get_full_table_names(
             data_access_tokens[0]
             != SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name
         ):
-            LOGGER.debug(
+            logger.debug(
                 "Provided native-query data-platform = %s", data_access_tokens[0]
             )
-            LOGGER.debug("Only Snowflake is supported in NativeQuery")
+            logger.debug("Only Snowflake is supported in NativeQuery")
             return full_table_names
 
         # First argument is the query
@@ -586,7 +590,7 @@ def get_full_table_names(
 
         for table in native_sql_parser.get_tables(sql_query):
             if len(table.split(".")) != 3:
-                LOGGER.debug(
+                logger.debug(
                     "Skipping table (%s) as it is not as per full_table_name format",
                     table,
                 )
@@ -647,9 +651,9 @@ def get_function_names() -> List[str]:
 
     @staticmethod
     def get_resolver(function_name: str) -> Optional["SupportedResolver"]:
-        LOGGER.debug("Looking for resolver %s", function_name)
+        logger.debug("Looking for resolver %s", function_name)
         for supported_resolver in SupportedResolver:
             if function_name == supported_resolver.get_function_name():
                 return supported_resolver
-        LOGGER.debug("Looking not found for resolver %s", function_name)
+        logger.debug("Resolver not found for %s", function_name)
         return None
diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py index aac946d9b7987..c8a2807084611 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/tree_function.py @@ -4,7 +4,7 @@ from lark import Token, Tree -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) def get_output_variable(root: Tree) -> Optional[str]: @@ -28,13 +28,13 @@ def get_variable_statement(parse_tree: Tree, variable: str) -> Optional[Tree]: for tree in _filter: values: List[str] = token_values(tree.children[0]) actual_value: str = "".join(strip_char_from_list(values, " ")) - LOGGER.debug("Actual Value = %s", actual_value) - LOGGER.debug("Expected Value = %s", variable) + logger.debug("Actual Value = %s", actual_value) + logger.debug("Expected Value = %s", variable) if actual_value.lower() == variable.lower(): return tree - LOGGER.info("Provided variable(%s) not found in variable rule", variable) + logger.info("Provided variable(%s) not found in variable rule", variable) return None @@ -120,7 +120,7 @@ def get_all_function_name(tree: Tree) -> List[str]: _filter: Any = tree.find_data("invoke_expression") for node in _filter: - LOGGER.debug("Tree = %s", node.pretty()) + logger.debug("Tree = %s", node.pretty()) primary_expression_node: Optional[Tree] = first_primary_expression_func(node) if primary_expression_node is None: continue diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py index abe7d0e46b05a..5bb8f811fa61b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/validator.py @@ -5,33 +5,7 @@ from datahub.ingestion.source.powerbi.m_query import resolver, tree_function -LOGGER = logging.getLogger(__name__) - - -def any_one_should_present( - supported_funcs: List[str], functions: List[str] -) -> Tuple[bool, Optional[str]]: - """ - Anyone functions from supported_funcs should present in functions list - :param supported_funcs: List of function m_query module supports - :param functions: List of functions retrieved from expression - :return: True or False - """ - for f in supported_funcs: - if f in functions: - return True, None - - return False, f"Function from supported function list {supported_funcs} not found" - - -def all_function_should_be_known( - supported_funcs: List[str], functions: List[str] -) -> Tuple[bool, Optional[str]]: - for f in functions: - if f not in supported_funcs: - return False, f"Function {f} is unknown" - - return True, None +logger = logging.getLogger(__name__) def validate_parse_tree( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index a6f8a5c211cf7..8218a7ea718eb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -52,7 +52,7 @@ from datahub.utilities.dedup_list import deduplicate_list # Logger instance -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class Mapper: @@ -141,7 +141,7 @@ def extract_lineage( upstream_table.data_platform_pair.powerbi_data_platform_name not in 
self.__config.dataset_type_mapping.keys() ): - LOGGER.debug( + logger.debug( "Skipping upstream table for %s. The platform (%s) is not part of dataset_type_mapping", ds_urn, upstream_table.data_platform_pair.powerbi_data_platform_name, @@ -202,8 +202,8 @@ def to_datahub_dataset( if dataset is None: return dataset_mcps - LOGGER.info( - f"Converting dataset={dataset.name}(id={dataset.id}) to datahub dataset" + logger.debug( + f"Mapping dataset={dataset.name}(id={dataset.id}) to datahub dataset" ) for table in dataset.tables: @@ -214,7 +214,7 @@ def to_datahub_dataset( env=self.__config.env, ) - LOGGER.info(f"{Constant.Dataset_URN}={ds_urn}") + logger.debug(f"{Constant.Dataset_URN}={ds_urn}") # Create datasetProperties mcp ds_properties = DatasetPropertiesClass( name=table.name, description=table.name @@ -247,13 +247,13 @@ def to_datahub_chart_mcp( """ Map PowerBi tile to datahub chart """ - LOGGER.info("Converting tile {}(id={}) to chart".format(tile.title, tile.id)) + logger.info("Converting tile {}(id={}) to chart".format(tile.title, tile.id)) # Create a URN for chart chart_urn = builder.make_chart_urn( self.__config.platform_name, tile.get_urn_part() ) - LOGGER.info("{}={}".format(Constant.CHART_URN, chart_urn)) + logger.info("{}={}".format(Constant.CHART_URN, chart_urn)) ds_input: List[str] = self.to_urn_set(ds_mcps) @@ -431,9 +431,7 @@ def to_datahub_user( Map PowerBi user to datahub user """ - LOGGER.info( - f"Converting user {user.displayName}(id={user.id}) to datahub's user" - ) + logger.debug(f"Mapping user {user.displayName}(id={user.id}) to datahub's user") # Create an URN for user user_urn = builder.make_user_urn(user.get_urn_part()) @@ -493,7 +491,7 @@ def to_datahub_chart( if not tiles: return [], [] - LOGGER.info(f"Converting tiles(count={len(tiles)}) to charts") + logger.info(f"Converting tiles(count={len(tiles)}) to charts") for tile in tiles: if tile is None: @@ -515,7 +513,7 @@ def to_datahub_work_units( ) -> List[EquableMetadataWorkUnit]: mcps = [] - LOGGER.info( + logger.info( f"Converting dashboard={dashboard.displayName} to datahub dashboard" ) @@ -551,18 +549,18 @@ def pages_to_chart( if not pages: return [] - LOGGER.debug(f"Converting pages(count={len(pages)}) to charts") + logger.debug(f"Converting pages(count={len(pages)}) to charts") def to_chart_mcps( page: PowerBiAPI.Page, ds_mcps: List[MetadataChangeProposalWrapper] ) -> List[MetadataChangeProposalWrapper]: - LOGGER.debug("Converting page {} to chart".format(page.displayName)) + logger.debug("Converting page {} to chart".format(page.displayName)) # Create a URN for chart chart_urn = builder.make_chart_urn( self.__config.platform_name, page.get_urn_part() ) - LOGGER.debug("{}={}".format(Constant.CHART_URN, chart_urn)) + logger.debug("{}={}".format(Constant.CHART_URN, chart_urn)) ds_input: List[str] = self.to_urn_set(ds_mcps) @@ -710,7 +708,7 @@ def report_to_datahub_work_units( ) -> Iterable[MetadataWorkUnit]: mcps: List[MetadataChangeProposalWrapper] = [] - LOGGER.debug(f"Converting dashboard={report.name} to datahub dashboard") + logger.debug(f"Converting dashboard={report.name} to datahub dashboard") # Convert user to CorpUser user_mcps = self.to_datahub_users(report.users) @@ -787,12 +785,12 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: """ Datahub Ingestion framework invoke this method """ - LOGGER.info("PowerBi plugin execution is started") + logger.info("PowerBi plugin execution is started") # Validate dataset type mapping self.validate_dataset_type_mapping() # Fetch PowerBi workspace for 
given workspace identifier for workspace_id in self.get_workspace_ids(): - LOGGER.info(f"Scanning workspace id: {workspace_id}") + logger.info(f"Scanning workspace id: {workspace_id}") workspace = self.powerbi_client.get_workspace(workspace_id, self.reporter) for dashboard in workspace.dashboards: @@ -806,7 +804,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]: except Exception as e: message = f"Error ({e}) occurred while loading dashboard {dashboard.displayName}(id={dashboard.id}) tiles." - LOGGER.exception(message, e) + logger.exception(message, e) self.reporter.report_warning(dashboard.id, message) # Convert PowerBi Dashboard and child entities to Datahub work unit to ingest into Datahub workunits = self.mapper.to_datahub_work_units(dashboard) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py index f6998f57fc0b1..f1cec25b48b73 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/proxy.py @@ -16,7 +16,7 @@ ) # Logger instance -LOGGER = logging.getLogger(__name__) +logger = logging.getLogger(__name__) class PowerBiAPI: @@ -208,9 +208,9 @@ def __init__(self, config: PowerBiAPIConfig) -> None: ) # Test connection by generating a access token - LOGGER.info("Trying to connect to {}".format(self.__get_authority_url())) + logger.info("Trying to connect to {}".format(self.__get_authority_url())) self.get_access_token() - LOGGER.info("Able to connect to {}".format(self.__get_authority_url())) + logger.info("Able to connect to {}".format(self.__get_authority_url())) def __get_authority_url(self): return "{}{}".format(PowerBiAPI.AUTHORITY, self.__config.tenant_id) @@ -221,7 +221,7 @@ def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: """ users: List[PowerBiAPI.User] = [] if self.__config.extract_ownership is False: - LOGGER.info( + logger.info( "Extract ownership capabilities is disabled from configuration and hence returning empty users list" ) return users @@ -234,7 +234,7 @@ def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: ENTITY_ID=_id, ) # Hit PowerBi - LOGGER.info(f"Request to URL={user_list_endpoint}") + logger.info(f"Request to URL={user_list_endpoint}") response = requests.get( user_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -242,15 +242,15 @@ def __get_users(self, workspace_id: str, entity: str, _id: str) -> List[User]: # Check if we got response from PowerBi if response.status_code != 200: - LOGGER.warning( + logger.warning( "Failed to fetch user list from power-bi. http_status=%s. 
message=%s", response.status_code, response.text, ) - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.ENTITY}={entity}") - LOGGER.info(f"{Constant.ID}={_id}") + logger.info(f"{Constant.WorkspaceId}={workspace_id}") + logger.info(f"{Constant.ENTITY}={entity}") + logger.info(f"{Constant.ID}={_id}") raise ConnectionError("Failed to fetch the user list from the power-bi") users_dict: List[Any] = response.json()[Constant.VALUE] @@ -276,9 +276,9 @@ def _get_report( Fetch the report from PowerBi for the given report identifier """ if workspace_id is None or report_id is None: - LOGGER.info("Input values are None") - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.ReportId}={report_id}") + logger.info("Input values are None") + logger.info(f"{Constant.WorkspaceId}={workspace_id}") + logger.info(f"{Constant.ReportId}={report_id}") return None report_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_GET] @@ -289,7 +289,7 @@ def _get_report( REPORT_ID=report_id, ) # Hit PowerBi - LOGGER.info(f"Request to report URL={report_get_endpoint}") + logger.info(f"Request to report URL={report_get_endpoint}") response = requests.get( report_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -298,9 +298,9 @@ def _get_report( # Check if we got response from PowerBi if response.status_code != 200: message: str = "Failed to fetch report from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.warning(f"{Constant.ReportId}={report_id}") + logger.warning(message) + logger.warning(f"{Constant.WorkspaceId}={workspace_id}") + logger.warning(f"{Constant.ReportId}={report_id}") raise ConnectionError(message) response_dict = response.json() @@ -320,28 +320,28 @@ def _get_report( def get_access_token(self): if self.__access_token != "": - LOGGER.info("Returning the cached access token") + logger.debug("Returning the cached access token") return self.__access_token - LOGGER.info("Generating PowerBi access token") + logger.info("Generating PowerBi access token") auth_response = self.__msal_client.acquire_token_for_client( scopes=[PowerBiAPI.SCOPE] ) if not auth_response.get("access_token"): - LOGGER.warning( + logger.warning( "Failed to generate the PowerBi access token. Please check input configuration" ) raise ConfigurationError( "Powerbi authorization failed . Please check your input configuration." 
) - LOGGER.info("Generated PowerBi access token") + logger.info("Generated PowerBi access token") self.__access_token = "Bearer {}".format(auth_response.get("access_token")) - LOGGER.debug(f"{Constant.PBIAccessToken}={self.__access_token}") + logger.debug(f"{Constant.PBIAccessToken}={self.__access_token}") return self.__access_token @@ -366,7 +366,7 @@ def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: POWERBI_BASE_URL=PowerBiAPI.BASE_URL, WORKSPACE_ID=workspace.id ) # Hit PowerBi - LOGGER.info(f"Request to URL={dashboard_list_endpoint}") + logger.info(f"Request to URL={dashboard_list_endpoint}") response = requests.get( dashboard_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -374,8 +374,8 @@ def get_dashboards(self, workspace: Workspace) -> List[Dashboard]: # Check if we got response from PowerBi if response.status_code != 200: - LOGGER.warning("Failed to fetch dashboard list from power-bi for") - LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") + logger.warning("Failed to fetch dashboard list from power-bi for") + logger.warning(f"{Constant.WorkspaceId}={workspace.id}") raise ConnectionError( "Failed to fetch the dashboard list from the power-bi" ) @@ -406,9 +406,9 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: Fetch the dataset from PowerBi for the given dataset identifier """ if workspace_id is None or dataset_id is None: - LOGGER.info("Input values are None") - LOGGER.info(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.info(f"{Constant.DatasetId}={dataset_id}") + logger.info("Input values are None") + logger.info(f"{Constant.WorkspaceId}={workspace_id}") + logger.info(f"{Constant.DatasetId}={dataset_id}") return None dataset_get_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.DATASET_GET] @@ -419,7 +419,7 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: DATASET_ID=dataset_id, ) # Hit PowerBi - LOGGER.info(f"Request to dataset URL={dataset_get_endpoint}") + logger.info(f"Request to dataset URL={dataset_get_endpoint}") response = requests.get( dataset_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -428,13 +428,13 @@ def get_dataset(self, workspace_id: str, dataset_id: str) -> Any: # Check if we got response from PowerBi if response.status_code != 200: message: str = "Failed to fetch dataset from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") - LOGGER.warning(f"{Constant.DatasetId}={dataset_id}") + logger.warning(message) + logger.warning(f"{Constant.WorkspaceId}={workspace_id}") + logger.warning(f"{Constant.DatasetId}={dataset_id}") raise ConnectionError(message) response_dict = response.json() - LOGGER.debug("datasets = {}".format(response_dict)) + logger.debug("datasets = {}".format(response_dict)) # PowerBi Always return the webURL, in-case if it is None then setting complete webURL to None instead of # None/details return PowerBiAPI.PowerBIDataset( @@ -462,7 +462,7 @@ def get_data_sources( DATASET_ID=dataset.id, ) # Hit PowerBi - LOGGER.info(f"Request to datasource URL={datasource_get_endpoint}") + logger.info(f"Request to datasource URL={datasource_get_endpoint}") response = requests.get( datasource_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -471,11 +471,11 @@ def get_data_sources( # Check if we got response from PowerBi if response.status_code != 200: message: str = "Failed to fetch datasource from power-bi for" - LOGGER.warning(message) - 
LOGGER.warning("{}={}".format(Constant.WorkspaceId, dataset.workspace_id)) - LOGGER.warning("{}={}".format(Constant.DatasetId, dataset.id)) - LOGGER.warning("{}={}".format(Constant.HTTP_RESPONSE_TEXT, response.text)) - LOGGER.warning( + logger.warning(message) + logger.warning("{}={}".format(Constant.WorkspaceId, dataset.workspace_id)) + logger.warning("{}={}".format(Constant.DatasetId, dataset.id)) + logger.warning("{}={}".format(Constant.HTTP_RESPONSE_TEXT, response.text)) + logger.warning( "{}={}".format(Constant.HTTP_RESPONSE_STATUS_CODE, response.status_code) ) @@ -484,14 +484,14 @@ def get_data_sources( res = response.json() value = res["value"] if len(value) == 0: - LOGGER.info( + logger.info( f"datasource is not found for dataset {dataset.name}({dataset.id})" ) return None data_sources: Dict[str, "PowerBiAPI.DataSource"] = {} - LOGGER.debug("data-sources = {}".format(value)) + logger.debug("data-sources = {}".format(value)) for datasource_dict in value: # Create datasource instance with basic detail available datasource = PowerBiAPI.DataSource( @@ -545,7 +545,7 @@ def new_dataset_or_report(tile_instance: Any) -> dict: else: report_fields["createdFrom"] = PowerBiAPI.Tile.CreatedFrom.VISUALIZATION - LOGGER.info( + logger.info( "Tile %s(%s) is created from %s", tile_instance.get("title"), tile_instance.get("id"), @@ -562,7 +562,7 @@ def new_dataset_or_report(tile_instance: Any) -> dict: DASHBOARD_ID=dashboard.id, ) # Hit PowerBi - LOGGER.info("Request to URL={}".format(tile_list_endpoint)) + logger.info("Request to URL={}".format(tile_list_endpoint)) response = requests.get( tile_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -570,14 +570,14 @@ def new_dataset_or_report(tile_instance: Any) -> dict: # Check if we got response from PowerBi if response.status_code != 200: - LOGGER.warning("Failed to fetch tiles list from power-bi for") - LOGGER.warning("{}={}".format(Constant.WorkspaceId, workspace.id)) - LOGGER.warning("{}={}".format(Constant.DashboardId, dashboard.id)) + logger.warning("Failed to fetch tiles list from power-bi for") + logger.warning("{}={}".format(Constant.WorkspaceId, workspace.id)) + logger.warning("{}={}".format(Constant.DashboardId, dashboard.id)) raise ConnectionError("Failed to fetch the tile list from the power-bi") # Iterate through response and create a list of PowerBiAPI.Dashboard tile_dict: List[Any] = response.json()[Constant.VALUE] - LOGGER.debug("Tile Dict = {}".format(tile_dict)) + logger.debug("Tile Dict = {}".format(tile_dict)) tiles: List[PowerBiAPI.Tile] = [ PowerBiAPI.Tile( id=instance.get("id"), @@ -598,7 +598,7 @@ def get_pages_by_report( Fetch the report from PowerBi for the given report identifier """ if workspace_id is None or report_id is None: - LOGGER.info("workspace_id or report_id is None") + logger.info("workspace_id or report_id is None") return [] pages_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.PAGE_BY_REPORT] @@ -609,7 +609,7 @@ def get_pages_by_report( REPORT_ID=report_id, ) # Hit PowerBi - LOGGER.info(f"Request to pages URL={pages_endpoint}") + logger.info(f"Request to pages URL={pages_endpoint}") response = requests.get( pages_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -618,8 +618,8 @@ def get_pages_by_report( # Check if we got response from PowerBi if response.status_code != 200: message: str = "Failed to fetch reports from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace_id}") + logger.warning(message) + 
logger.warning(f"{Constant.WorkspaceId}={workspace_id}") raise ConnectionError(message) response_dict = response.json() @@ -640,7 +640,7 @@ def get_reports( Fetch the report from PowerBi for the given report identifier """ if workspace is None: - LOGGER.info("workspace is None") + logger.info("workspace is None") return [] report_list_endpoint: str = PowerBiAPI.API_ENDPOINTS[Constant.REPORT_LIST] @@ -650,7 +650,7 @@ def get_reports( WORKSPACE_ID=workspace.id, ) # Hit PowerBi - LOGGER.info(f"Request to report URL={report_list_endpoint}") + logger.info(f"Request to report URL={report_list_endpoint}") response = requests.get( report_list_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -659,8 +659,8 @@ def get_reports( # Check if we got response from PowerBi if response.status_code != 200: message: str = "Failed to fetch reports from power-bi for" - LOGGER.warning(message) - LOGGER.warning(f"{Constant.WorkspaceId}={workspace.id}") + logger.warning(message) + logger.warning(f"{Constant.WorkspaceId}={workspace.id}") raise ConnectionError(message) response_dict = response.json() @@ -687,7 +687,7 @@ def get_reports( def get_groups(self): group_endpoint = PowerBiAPI.BASE_URL # Hit PowerBi - LOGGER.info(f"Request to get groups endpoint URL={group_endpoint}") + logger.info(f"Request to get groups endpoint URL={group_endpoint}") response = requests.get( group_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -744,17 +744,17 @@ def create_scan_job(): if res.status_code not in (200, 202): message = f"API({scan_create_endpoint}) return error code {res.status_code} for workspace id({workspace_id})" - LOGGER.warning(message) + logger.warning(message) raise ConnectionError(message) # Return Id of Scan created for the given workspace id = res.json()["id"] - LOGGER.info("Scan id({})".format(id)) + logger.info("Scan id({})".format(id)) return id def calculate_max_trial(minimum_sleep: int, timeout: int) -> int: if timeout < minimum_sleep: - LOGGER.info( + logger.info( f"Setting timeout to minimum_sleep time {minimum_sleep} seconds" ) timeout = minimum_sleep @@ -767,33 +767,33 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: """ minimum_sleep = 3 max_trial: int = calculate_max_trial(minimum_sleep, timeout) - LOGGER.info(f"Max trial {max_trial}") + logger.info(f"Max trial {max_trial}") scan_get_endpoint = PowerBiAPI.API_ENDPOINTS[Constant.SCAN_GET] scan_get_endpoint = scan_get_endpoint.format( POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id ) - LOGGER.info(f"Hitting URL={scan_get_endpoint}") + logger.debug(f"Hitting URL={scan_get_endpoint}") trail = 1 while True: - LOGGER.info(f"Trial = {trail}") + logger.info(f"Trial = {trail}") res = requests.get( scan_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, ) if res.status_code != 200: message = f"API({scan_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - LOGGER.warning(message) + logger.warning(message) raise ConnectionError(message) if res.json()["status"].upper() == "Succeeded".upper(): - LOGGER.info(f"Scan result is available for scan id({scan_id})") + logger.info(f"Scan result is available for scan id({scan_id})") return True if trail == max_trial: break - LOGGER.info(f"Sleeping for {minimum_sleep} seconds") + logger.info(f"Sleeping for {minimum_sleep} seconds") sleep(minimum_sleep) trail += 1 @@ -801,8 +801,8 @@ def wait_for_scan_to_complete(scan_id: str, timeout: int) -> Any: return False def get_scan_result(scan_id: str) -> 
dict: - LOGGER.info("Fetching scan result") - LOGGER.info(f"{Constant.SCAN_ID}={scan_id}") + logger.info("Fetching scan result") + logger.info(f"{Constant.SCAN_ID}={scan_id}") scan_result_get_endpoint = PowerBiAPI.API_ENDPOINTS[ Constant.SCAN_RESULT_GET ] @@ -810,7 +810,7 @@ def get_scan_result(scan_id: str) -> dict: POWERBI_ADMIN_BASE_URL=PowerBiAPI.ADMIN_BASE_URL, SCAN_ID=scan_id ) - LOGGER.info(f"Hitting URL={scan_result_get_endpoint}") + logger.debug(f"Hitting URL={scan_result_get_endpoint}") res = requests.get( scan_result_get_endpoint, headers={Constant.Authorization: self.get_access_token()}, @@ -818,7 +818,7 @@ def get_scan_result(scan_id: str) -> dict: if res.status_code != 200: message = f"API({scan_result_get_endpoint}) return error code {res.status_code} for scan id({scan_id})" - LOGGER.warning(message) + logger.warning(message) raise ConnectionError(message) @@ -832,11 +832,11 @@ def json_to_dataset_map(scan_result: dict) -> dict: dataset_map: dict = {} if datasets is None or len(datasets) == 0: - LOGGER.warning( + logger.warning( f'Workspace {scan_result["name"]}({scan_result["id"]}) does not have datasets' ) - LOGGER.info("Returning empty datasets") + logger.info("Returning empty datasets") return dataset_map for dataset_dict in datasets: @@ -877,11 +877,11 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: return None - LOGGER.info("Creating scan job for workspace") - LOGGER.info("{}={}".format(Constant.WorkspaceId, workspace_id)) - LOGGER.info("Hitting URL={}".format(scan_create_endpoint)) + logger.info("Creating scan job for workspace") + logger.info("{}={}".format(Constant.WorkspaceId, workspace_id)) + logger.debug("Hitting URL={}".format(scan_create_endpoint)) scan_id = create_scan_job() - LOGGER.info("Waiting for scan to complete") + logger.info("Waiting for scan to complete") if ( wait_for_scan_to_complete( scan_id=scan_id, timeout=self.__config.scan_timeout @@ -895,7 +895,7 @@ def init_dashboard_tiles(workspace: PowerBiAPI.Workspace) -> None: # Scan is complete lets take the result scan_result = get_scan_result(scan_id=scan_id) - LOGGER.debug(f"scan result = %s", json.dumps(scan_result, indent=1)) + logger.debug(f"scan result = %s", json.dumps(scan_result, indent=1)) workspace = PowerBiAPI.Workspace( id=scan_result["id"], name=scan_result["name"],