From dd218dde8a57865a0af59f0e8e06ad3f30c5fd92 Mon Sep 17 00:00:00 2001 From: Jarek Potiuk Date: Wed, 23 Aug 2023 17:36:24 +0200 Subject: [PATCH] Improve provider verification pre-commit (#33640) There were a numer of problems with the provider verification pre-commit scripts: * It missed verification of "notifications" * It did not check if the classes or modules specified in provider yaml raised deprecation warnings * The messages produced by the script when some discrepancies were found were pretty cryptic and it was difficult to guess what kind of action should be taken to fix the problem This PR fixes all those problems: * verification of notification is performed * when importing all the classes and modules, check for the AirflowProviderDeprecationWarnings is done and treated as error * The messages produced provide clear actionable instructions on what to do and explain what are the discrepancies of expected vs. current list in a clear way --- .../pre_commit_check_provider_yaml_files.py | 2 + .../run_provider_yaml_files_check.py | 138 +++++++++++++++--- 2 files changed, 116 insertions(+), 24 deletions(-) diff --git a/scripts/ci/pre_commit/pre_commit_check_provider_yaml_files.py b/scripts/ci/pre_commit/pre_commit_check_provider_yaml_files.py index 995ff18ba0407..400b958065ff7 100755 --- a/scripts/ci/pre_commit/pre_commit_check_provider_yaml_files.py +++ b/scripts/ci/pre_commit/pre_commit_check_provider_yaml_files.py @@ -51,6 +51,8 @@ *get_extra_docker_flags(MOUNT_SELECTED), "-e", "SKIP_ENVIRONMENT_INITIALIZATION=true", + "-e", + "PYTHONWARNINGS=default", "--pull", "never", airflow_image, diff --git a/scripts/in_container/run_provider_yaml_files_check.py b/scripts/in_container/run_provider_yaml_files_check.py index 9e873f90c37e6..61cf5842cc8b0 100755 --- a/scripts/in_container/run_provider_yaml_files_check.py +++ b/scripts/in_container/run_provider_yaml_files_check.py @@ -26,6 +26,7 @@ import platform import sys import textwrap +import warnings from collections import Counter from enum import Enum from typing import Any, Iterable @@ -37,18 +38,22 @@ from tabulate import tabulate from airflow.cli.commands.info_command import Architecture +from airflow.exceptions import AirflowProviderDeprecationWarning from airflow.providers_manager import ProvidersManager # Those are deprecated modules that contain removed Hooks/Sensors/Operators that we left in the code # so that users can get a very specific error message when they try to use them. -EXCLUDED_MODULES = [ +DEPRECATED_MODULES = [ "airflow.providers.apache.hdfs.sensors.hdfs", "airflow.providers.apache.hdfs.hooks.hdfs", "airflow.providers.cncf.kubernetes.triggers.kubernetes_pod", "airflow.providers.cncf.kubernetes.operators.kubernetes_pod", ] +KNOWN_DEPRECATED_CLASSES = [ + "airflow.providers.google.cloud.links.dataproc.DataprocLink", +] try: from yaml import CSafeLoader as SafeLoader @@ -71,6 +76,13 @@ errors: list[str] = [] console = Console(width=400, color_system="standard") +# you need to enable warnings for all deprecations - needed by importlib library to show deprecations +if os.environ.get("PYTHONWARNINGS") != "default": + console.print( + "[red]Error: PYTHONWARNINGS not set[/]\n" + "You must set `PYTHONWARNINGS=default` environment variable to run this script" + ) + sys.exit(1) suspended_providers: set[str] = set() suspended_logos: set[str] = set() @@ -136,7 +148,14 @@ def check_integration_duplicates(yaml_files: dict[str, dict]): sys.exit(3) -def assert_sets_equal(set1, set2, allow_extra_in_set2=False): +def assert_sets_equal( + set1: set[str], + set_name_1: str, + set2: set[str], + set_name_2: str, + allow_extra_in_set2=False, + extra_message: str = "", +): try: difference1 = set1.difference(set2) except TypeError as e: @@ -153,6 +172,8 @@ def assert_sets_equal(set1, set2, allow_extra_in_set2=False): if difference1 or (difference2 and not allow_extra_in_set2): lines = [] + lines.append(f" Left set:{set_name_1}") + lines.append(f" Right set:{set_name_2}") if difference1: lines.append(" -- Items in the left set but not the right:") for item in sorted(difference1): @@ -163,6 +184,8 @@ def assert_sets_equal(set1, set2, allow_extra_in_set2=False): lines.append(f" {item!r}") standard_msg = "\n".join(lines) + if extra_message: + standard_msg += f"\n{extra_message}" raise AssertionError(standard_msg) @@ -174,12 +197,37 @@ class ObjectType(Enum): def check_if_object_exist(object_name: str, resource_type: str, yaml_file_path: str, object_type: ObjectType): try: if object_type == ObjectType.CLASS: - module_name, object_name = object_name.rsplit(".", maxsplit=1) - the_class = getattr(importlib.import_module(module_name), object_name) + module_name, class_name = object_name.rsplit(".", maxsplit=1) + with warnings.catch_warnings(record=True) as w: + the_class = getattr(importlib.import_module(module_name), class_name) + for warn in w: + if warn.category == AirflowProviderDeprecationWarning: + if object_name in KNOWN_DEPRECATED_CLASSES: + console.print( + f"[yellow]The {object_name} class is deprecated and we know about it. " + f"It should be removed in the future." + ) + continue + errors.append( + f"The `{class_name}` class in {resource_type} list in {yaml_file_path} " + f"is deprecated with this message: '{warn.message}'.\n" + f"[yellow]How to fix it[/]: Please remove it from provider.yaml and replace with " + f"the new class." + ) if the_class and inspect.isclass(the_class): return elif object_type == ObjectType.MODULE: - module = importlib.import_module(object_name) + with warnings.catch_warnings(record=True) as w: + module = importlib.import_module(object_name) + for warn in w: + if warn.category == AirflowProviderDeprecationWarning: + errors.append( + f"The `{object_name}` module in {resource_type} list in {yaml_file_path} " + f"is deprecated with this message: '{warn.message}'.\n" + f"[yellow]How to fix it[/]: Please remove it from provider.yaml and replace it " + f"with the new module. If you see warnings in classes - fix the classes so that " + f"they are not raising Deprecation Warnings when module is imported." + ) if inspect.ismodule(module): return else: @@ -231,23 +279,32 @@ def parse_module_data(provider_data, resource_type, yaml_file_path): return expected_modules, provider_package, resource_data -def check_correctness_of_list_of_sensors_operators_hook_modules(yaml_files: dict[str, dict]): - print("Checking completeness of list of {sensors, hooks, operators, triggers}") - print(" -- {sensors, hooks, operators, triggers} - Expected modules (left) : Current modules (right)") +def check_correctness_of_list_of_sensors_operators_hook_trigger_modules(yaml_files: dict[str, dict]): + print(" -- Checking completeness of list of {sensors, hooks, operators, triggers}") for (yaml_file_path, provider_data), resource_type in itertools.product( yaml_files.items(), ["sensors", "operators", "hooks", "triggers"] ): expected_modules, provider_package, resource_data = parse_module_data( provider_data, resource_type, yaml_file_path ) - expected_modules = {module for module in expected_modules if module not in EXCLUDED_MODULES} + expected_modules = {module for module in expected_modules if module not in DEPRECATED_MODULES} current_modules = {str(i) for r in resource_data for i in r.get("python-modules", [])} check_if_objects_exist_and_belong_to_package( current_modules, provider_package, yaml_file_path, resource_type, ObjectType.MODULE ) try: - assert_sets_equal(set(expected_modules), set(current_modules)) + package_name = os.fspath(ROOT_DIR.joinpath(yaml_file_path).parent.relative_to(ROOT_DIR)).replace( + "/", "." + ) + assert_sets_equal( + set(expected_modules), + f"Found list of {resource_type} modules in provider package: {package_name}", + set(current_modules), + f"Currently configured list of {resource_type} modules in {yaml_file_path}", + extra_message="[yellow]If there are deprecated modules in the list, please add them to " + f"DEPRECATED_MODULES in {pathlib.Path(__file__).relative_to(ROOT_DIR)}[/]", + ) except AssertionError as ex: nested_error = textwrap.indent(str(ex), " ") errors.append( @@ -276,19 +333,27 @@ def check_completeness_of_list_of_transfers(yaml_files: dict[str, dict]): print("Checking completeness of list of transfers") resource_type = "transfers" - print(" -- Expected transfers modules(Left): Current transfers Modules(Right)") + print(" -- Checking transfers modules") for yaml_file_path, provider_data in yaml_files.items(): expected_modules, provider_package, resource_data = parse_module_data( provider_data, resource_type, yaml_file_path ) - expected_modules = {module for module in expected_modules if module not in EXCLUDED_MODULES} + expected_modules = {module for module in expected_modules if module not in DEPRECATED_MODULES} current_modules = {r.get("python-module") for r in resource_data} check_if_objects_exist_and_belong_to_package( current_modules, provider_package, yaml_file_path, resource_type, ObjectType.MODULE ) try: - assert_sets_equal(set(expected_modules), set(current_modules)) + package_name = os.fspath(ROOT_DIR.joinpath(yaml_file_path).parent.relative_to(ROOT_DIR)).replace( + "/", "." + ) + assert_sets_equal( + set(expected_modules), + f"Found list of transfer modules in provider package: {package_name}", + set(current_modules), + f"Currently configured list of transfer modules in {yaml_file_path}", + ) except AssertionError as ex: nested_error = textwrap.indent(str(ex), " ") errors.append( @@ -337,6 +402,18 @@ def check_extra_link_classes(yaml_files: dict[str, dict]): ) +def check_notification_classes(yaml_files: dict[str, dict]): + print("Checking notifications belong to package, exist and are classes") + resource_type = "notifications" + for yaml_file_path, provider_data in yaml_files.items(): + provider_package = pathlib.Path(yaml_file_path).parent.as_posix().replace("/", ".") + notifications = provider_data.get(resource_type) + if notifications: + check_if_objects_exist_and_belong_to_package( + notifications, provider_package, yaml_file_path, resource_type, ObjectType.CLASS + ) + + def check_duplicates_in_list_of_transfers(yaml_files: dict[str, dict]): print("Checking for duplicates in list of transfers") errors = [] @@ -435,11 +512,20 @@ def check_doc_files(yaml_files: dict[str, dict]): } try: - print(" -- Checking document urls: expected (left), current (right)") - assert_sets_equal(set(expected_doc_urls), set(current_doc_urls)) - - print(" -- Checking logo urls: expected (left), current (right)") - assert_sets_equal(set(expected_logo_urls), set(current_logo_urls)) + print(" -- Checking document urls") + assert_sets_equal( + set(expected_doc_urls), + "Document urls found in airflow/docs", + set(current_doc_urls), + "Document urls configured in provider.yaml files", + ) + print(" -- Checking logo urls") + assert_sets_equal( + set(expected_logo_urls), + "Logo urls found in airflow/docs/integration-logos", + set(current_logo_urls), + "Logo urls configured in provider.yaml files", + ) except AssertionError as ex: print(ex) sys.exit(1) @@ -465,12 +551,15 @@ def check_providers_are_mentioned_in_issue_template(yaml_files: dict[str, dict]) issue_template = yaml.safe_load(issue_file) all_mentioned_providers = [match.value for match in jsonpath_expr.find(issue_template)] try: - print( - f" -- Checking providers: present in code (left), " - f"mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH} (right)" - ) + print(f" -- Checking providers are mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH}") # in case of suspended providers, we still want to have them in the issue template - assert_sets_equal(set(short_provider_names), set(all_mentioned_providers), allow_extra_in_set2=True) + assert_sets_equal( + set(short_provider_names), + "Provider names found in provider.yaml files", + set(all_mentioned_providers), + f"Provider names mentioned in {PROVIDER_ISSUE_TEMPLATE_PATH}", + allow_extra_in_set2=True, + ) except AssertionError as ex: print(ex) sys.exit(1) @@ -512,7 +601,8 @@ def check_providers_have_all_documentation_files(yaml_files: dict[str, dict]): check_hook_connection_classes(all_parsed_yaml_files) check_plugin_classes(all_parsed_yaml_files) check_extra_link_classes(all_parsed_yaml_files) - check_correctness_of_list_of_sensors_operators_hook_modules(all_parsed_yaml_files) + check_correctness_of_list_of_sensors_operators_hook_trigger_modules(all_parsed_yaml_files) + check_notification_classes(all_parsed_yaml_files) check_unique_provider_name(all_parsed_yaml_files) check_providers_have_all_documentation_files(all_parsed_yaml_files)