From c9064e9d471610fa621e885290fa15a88136d9f9 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Feb 2023 09:04:50 -0500 Subject: [PATCH 01/16] Remove unnecessary deleted_manifest in partial parsing --- core/dbt/parser/partial.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/core/dbt/parser/partial.py b/core/dbt/parser/partial.py index d6afe223278..f6e4dbb8123 100644 --- a/core/dbt/parser/partial.py +++ b/core/dbt/parser/partial.py @@ -73,7 +73,6 @@ def __init__(self, saved_manifest: Manifest, new_files: MutableMapping[str, AnyS self.project_parser_files: Dict = {} self.saved_files = self.saved_manifest.files self.project_parser_files = {} - self.deleted_manifest = Manifest() self.macro_child_map: Dict[str, List[str]] = {} ( self.env_vars_changed_source_files, @@ -268,7 +267,7 @@ def delete_from_saved(self, file_id): # macros/tests if saved_source_file.parse_file_type in mssat_files: self.remove_mssat_file(saved_source_file) - self.deleted_manifest.files[file_id] = self.saved_manifest.files.pop(file_id) + self.saved_manifest.files.pop(file_id) # macros if saved_source_file.parse_file_type in mg_files: @@ -311,7 +310,6 @@ def update_mssat_in_saved(self, new_source_file, old_source_file): # replace source_file in saved and add to parsing list file_id = new_source_file.file_id - self.deleted_manifest.files[file_id] = old_source_file self.saved_files[file_id] = deepcopy(new_source_file) self.add_to_pp_files(new_source_file) for unique_id in unique_ids: @@ -321,7 +319,6 @@ def remove_node_in_saved(self, source_file, unique_id): if unique_id in self.saved_manifest.nodes: # delete node in saved node = self.saved_manifest.nodes.pop(unique_id) - self.deleted_manifest.nodes[unique_id] = node elif ( source_file.file_id in self.disabled_by_file_id and unique_id in self.saved_manifest.disabled @@ -456,7 +453,7 @@ def delete_macro_file(self, source_file, follow_references=False): file_id = source_file.file_id # It's not clear when this file_id would not exist in saved_files if file_id in self.saved_files: - self.deleted_manifest.files[file_id] = self.saved_files.pop(file_id) + self.saved_files.pop(file_id) def check_for_special_deleted_macros(self, source_file): for unique_id in source_file.macros: @@ -487,7 +484,6 @@ def handle_macro_file_links(self, source_file, follow_references=False): continue base_macro = self.saved_manifest.macros.pop(unique_id) - self.deleted_manifest.macros[unique_id] = base_macro # Recursively check children of this macro # The macro_child_map might not exist if a macro is removed by @@ -565,16 +561,14 @@ def delete_doc_node(self, source_file): # remove the nodes in the 'docs' dictionary docs = source_file.docs.copy() for unique_id in docs: - self.deleted_manifest.docs[unique_id] = self.saved_manifest.docs.pop(unique_id) + self.saved_manifest.docs.pop(unique_id) source_file.docs.remove(unique_id) # The unique_id of objects that contain a doc call are stored in the # doc source_file.nodes self.schedule_nodes_for_parsing(source_file.nodes) source_file.nodes = [] # Remove the file object - self.deleted_manifest.files[source_file.file_id] = self.saved_manifest.files.pop( - source_file.file_id - ) + self.saved_manifest.files.pop(source_file.file_id) # Schema files ----------------------- # Changed schema files @@ -608,7 +602,7 @@ def delete_schema_file(self, file_id): saved_yaml_dict = saved_schema_file.dict_from_yaml new_yaml_dict = {} self.handle_schema_file_changes(saved_schema_file, saved_yaml_dict, new_yaml_dict) - self.deleted_manifest.files[file_id] = self.saved_manifest.files.pop(file_id) + self.saved_manifest.files.pop(file_id) # For each key in a schema file dictionary, process the changed, deleted, and added # elemnts for the key lists @@ -853,8 +847,7 @@ def remove_tests(self, schema_file, dict_key, name): tests = schema_file.get_tests(dict_key, name) for test_unique_id in tests: if test_unique_id in self.saved_manifest.nodes: - node = self.saved_manifest.nodes.pop(test_unique_id) - self.deleted_manifest.nodes[test_unique_id] = node + self.saved_manifest.nodes.pop(test_unique_id) schema_file.remove_tests(dict_key, name) def delete_schema_source(self, schema_file, source_dict): @@ -869,7 +862,6 @@ def delete_schema_source(self, schema_file, source_dict): source = self.saved_manifest.sources[unique_id] if source.source_name == source_name: source = self.saved_manifest.sources.pop(unique_id) - self.deleted_manifest.sources[unique_id] = source schema_file.sources.remove(unique_id) self.schedule_referencing_nodes_for_parsing(unique_id) @@ -881,7 +873,6 @@ def delete_schema_macro_patch(self, schema_file, macro): del schema_file.macro_patches[macro["name"]] if macro_unique_id and macro_unique_id in self.saved_manifest.macros: macro = self.saved_manifest.macros.pop(macro_unique_id) - self.deleted_manifest.macros[macro_unique_id] = macro macro_file_id = macro.file_id if macro_file_id in self.new_files: self.saved_files[macro_file_id] = deepcopy(self.new_files[macro_file_id]) @@ -896,9 +887,7 @@ def delete_schema_exposure(self, schema_file, exposure_dict): if unique_id in self.saved_manifest.exposures: exposure = self.saved_manifest.exposures[unique_id] if exposure.name == exposure_name: - self.deleted_manifest.exposures[unique_id] = self.saved_manifest.exposures.pop( - unique_id - ) + self.saved_manifest.exposures.pop(unique_id) schema_file.exposures.remove(unique_id) elif unique_id in self.saved_manifest.disabled: self.delete_disabled(unique_id, schema_file.file_id) @@ -914,9 +903,7 @@ def delete_schema_metric(self, schema_file, metric_dict): # Need to find everything that referenced this metric and schedule for parsing if unique_id in self.saved_manifest.child_map: self.schedule_nodes_for_parsing(self.saved_manifest.child_map[unique_id]) - self.deleted_manifest.metrics[unique_id] = self.saved_manifest.metrics.pop( - unique_id - ) + self.saved_manifest.metrics.pop(unique_id) schema_file.metrics.remove(unique_id) elif unique_id in self.saved_manifest.disabled: self.delete_disabled(unique_id, schema_file.file_id) From 521c8583f26b69d6bafb7d24f08fa8d522827d10 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Feb 2023 14:46:57 -0500 Subject: [PATCH 02/16] Switch existing file reading to use ReadFilesFromFileSystem --- core/dbt/parser/manifest.py | 23 ++-- core/dbt/parser/read_files.py | 204 +++++++++++++++++----------------- 2 files changed, 120 insertions(+), 107 deletions(-) diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py index 903852f6ed7..fbdd354a5f2 100644 --- a/core/dbt/parser/manifest.py +++ b/core/dbt/parser/manifest.py @@ -44,7 +44,7 @@ from dbt.context.configured import generate_macro_context from dbt.context.providers import ParseProvider from dbt.contracts.files import FileHash, ParseFileType, SchemaSourceFile -from dbt.parser.read_files import read_files, load_source_file +from dbt.parser.read_files import ReadFilesFromFileSystem, load_source_file, FileDiff from dbt.parser.partial import PartialParsing, special_override_macros from dbt.contracts.graph.manifest import ( Manifest, @@ -148,6 +148,7 @@ def __init__( root_project: RuntimeConfig, all_projects: Mapping[str, Project], macro_hook: Optional[Callable[[Manifest], Any]] = None, + file_diff: Optional[FileDiff] = None, ) -> None: self.root_project: RuntimeConfig = root_project self.all_projects: Mapping[str, Project] = all_projects @@ -185,6 +186,7 @@ def get_full_manifest( cls, config: RuntimeConfig, *, + file_diff: Optional[FileDiff] = None, reset: bool = False, ) -> Manifest: @@ -220,17 +222,24 @@ def get_full_manifest( # This is where the main action happens def load(self): - # Read files creates a dictionary of projects to a dictionary - # of parsers to lists of file strings. The file strings are - # used to get the SourceFiles from the manifest files. start_read_files = time.perf_counter() + # This is a dictionary of project_parser_files = {} saved_files = {} if self.saved_manifest: saved_files = self.saved_manifest.files - for project in self.all_projects.values(): - read_files(project, self.manifest.files, project_parser_files, saved_files) - orig_project_parser_files = project_parser_files + # This updates the "files" dictionary in self.manifest, and creates + # the partial_parser_files dictionary (see read_files.py). + # Read files creates a dictionary of projects to a dictionary + # of parsers to lists of file strings. The file strings are + # used to get the SourceFiles from the manifest files. + file_reader = ReadFilesFromFileSystem( + all_projects=self.all_projects, + files=self.manifest.files, + saved_files=saved_files, + ) + file_reader.read_files() + project_parser_files = orig_project_parser_files = file_reader.project_parser_files self._perf_info.path_count = len(self.manifest.files) self._perf_info.read_files_elapsed = time.perf_counter() - start_read_files diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 531e5f39560..152c2686492 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -1,6 +1,7 @@ import os import pathspec # type: ignore import pathlib +from dataclasses import dataclass, field from dbt.clients.system import load_file_contents from dbt.contracts.files import ( FilePath, @@ -10,11 +11,29 @@ AnySourceFile, SchemaSourceFile, ) - +from dbt.config import Project from dbt.parser.schemas import yaml_from_file, schema_file_keys, check_format_version from dbt.exceptions import ParsingError from dbt.parser.search import filesystem_search -from typing import Optional +from typing import Optional, Dict, List, Mapping + + +@dataclass +class InputFile: + path: str + contents: str + + +@dataclass +class ProjectFileDiff: + project_name: str + deleted_files: List[str] + changed_files: List + + +@dataclass +class FileDiff: + projects: List[ProjectFileDiff] # This loads the files contents and creates the SourceFile object @@ -131,9 +150,10 @@ def get_source_files(project, paths, extension, parse_file_type, saved_files, ig return fb_list -def read_files_for_parser(project, files, dirs, extensions, parse_ft, saved_files, ignore_spec): +def read_files_for_parser(project, files, parse_ft, file_type_info, saved_files, ignore_spec): + dirs = file_type_info["paths"] parser_files = [] - for extension in extensions: + for extension in file_type_info["extensions"]: source_files = get_source_files( project, dirs, extension, parse_ft, saved_files, ignore_spec ) @@ -153,104 +173,88 @@ def generate_dbt_ignore_spec(project_root): return ignore_spec -# This needs to read files for multiple projects, so the 'files' -# dictionary needs to be passed in. What determines the order of -# the various projects? Is the root project always last? Do the -# non-root projects need to be done separately in order? -def read_files(project, files, parser_files, saved_files): - dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root) - project_files = {} +@dataclass +class ReadFilesFromFileSystem: + all_projects: Mapping[str, Project] + # This is a reference to the "files" dictionary in the current manifest, so the + # manifest in implicitly updated by this code. + files: Dict[str, AnySourceFile] + # saved_files is only used to compare schema files + saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) + # project_parser_files = { + # "my_project": { + # "ModelParser": ["my_project://models/my_model.sql"] + # } + # } + # + project_parser_files: Dict = field(default_factory=dict) - project_files["MacroParser"] = read_files_for_parser( - project, - files, - project.macro_paths, - [".sql"], - ParseFileType.Macro, - saved_files, - dbt_ignore_spec, - ) + def read_files(self): + for project in self.all_projects.values(): + file_types = get_file_types_for_project(project) + self.read_files_for_project(project, file_types) - project_files["ModelParser"] = read_files_for_parser( - project, - files, - project.model_paths, - [".sql", ".py"], - ParseFileType.Model, - saved_files, - dbt_ignore_spec, - ) + def read_files_for_project(self, project, file_types): + dbt_ignore_spec = generate_dbt_ignore_spec(project.project_root) + project_files = self.project_parser_files[project.project_name] = {} - project_files["SnapshotParser"] = read_files_for_parser( - project, - files, - project.snapshot_paths, - [".sql"], - ParseFileType.Snapshot, - saved_files, - dbt_ignore_spec, - ) + for parse_ft, file_type_info in file_types.items(): + project_files[file_type_info["parser"]] = read_files_for_parser( + project, + self.files, + parse_ft, + file_type_info, + self.saved_files, + dbt_ignore_spec, + ) - project_files["AnalysisParser"] = read_files_for_parser( - project, - files, - project.analysis_paths, - [".sql"], - ParseFileType.Analysis, - saved_files, - dbt_ignore_spec, - ) - - project_files["SingularTestParser"] = read_files_for_parser( - project, - files, - project.test_paths, - [".sql"], - ParseFileType.SingularTest, - saved_files, - dbt_ignore_spec, - ) - - # all generic tests within /tests must be nested under a /generic subfolder - project_files["GenericTestParser"] = read_files_for_parser( - project, - files, - project.generic_test_paths, - [".sql"], - ParseFileType.GenericTest, - saved_files, - dbt_ignore_spec, - ) - - project_files["SeedParser"] = read_files_for_parser( - project, - files, - project.seed_paths, - [".csv"], - ParseFileType.Seed, - saved_files, - dbt_ignore_spec, - ) - - project_files["DocumentationParser"] = read_files_for_parser( - project, - files, - project.docs_paths, - [".md"], - ParseFileType.Documentation, - saved_files, - dbt_ignore_spec, - ) - - project_files["SchemaParser"] = read_files_for_parser( - project, - files, - project.all_source_paths, - [".yml", ".yaml"], - ParseFileType.Schema, - saved_files, - dbt_ignore_spec, - ) - # Store the parser files for this particular project - parser_files[project.project_name] = project_files +def get_file_types_for_project(project): + file_types = { + ParseFileType.Macro: { + "paths": project.macro_paths, + "extensions": [".sql"], + "parser": "MacroParser", + }, + ParseFileType.Model: { + "paths": project.model_paths, + "extensions": [".sql", ".py"], + "parser": "ModelParser", + }, + ParseFileType.Snapshot: { + "paths": project.snapshot_paths, + "extensions": [".sql"], + "parser": "SnapshotParser", + }, + ParseFileType.Analysis: { + "paths": project.analysis_paths, + "extensions": [".sql"], + "parser": "AnalysisParser", + }, + ParseFileType.SingularTest: { + "paths": project.test_paths, + "extensions": [".sql"], + "parser": "SingularTestParser", + }, + ParseFileType.GenericTest: { + "paths": project.generic_test_paths, + "extensions": [".sql"], + "parser": "GenericTestParser", + }, + ParseFileType.Seed: { + "paths": project.seed_paths, + "extensions": [".csv"], + "parser": "SeedParser", + }, + ParseFileType.Documentation: { + "paths": project.docs_paths, + "extensions": [".md"], + "parser": "DocumentationParser", + }, + ParseFileType.Schema: { + "paths": project.all_source_paths, + "extensions": [".yml", ".yaml"], + "parser": "SchemaParser", + }, + } + return file_types From e9d746988ee5c48d1b82d4145361908ead24051d Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Feb 2023 15:40:07 -0500 Subject: [PATCH 03/16] minor cleanup --- core/dbt/parser/manifest.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py index fbdd354a5f2..a261a1346c1 100644 --- a/core/dbt/parser/manifest.py +++ b/core/dbt/parser/manifest.py @@ -223,16 +223,12 @@ def get_full_manifest( # This is where the main action happens def load(self): start_read_files = time.perf_counter() - # This is a dictionary of - project_parser_files = {} - saved_files = {} - if self.saved_manifest: - saved_files = self.saved_manifest.files # This updates the "files" dictionary in self.manifest, and creates - # the partial_parser_files dictionary (see read_files.py). - # Read files creates a dictionary of projects to a dictionary + # the partial_parser_files dictionary (see read_files.py), + # which is a dictionary of projects to a dictionary # of parsers to lists of file strings. The file strings are # used to get the SourceFiles from the manifest files. + saved_files = self.saved_manifest.files if self.saved_manifest else {} file_reader = ReadFilesFromFileSystem( all_projects=self.all_projects, files=self.manifest.files, From 97498d684f4f28b8ee62e4ffdbccbde509c0e28a Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Feb 2023 19:31:34 -0500 Subject: [PATCH 04/16] A bunch of code in ReadFilesFromDiff --- core/dbt/clients/system.py | 7 +- core/dbt/events/proto_types.py | 27 +++--- core/dbt/events/types.proto | 21 +++-- core/dbt/events/types.py | 17 ++-- core/dbt/parser/read_files.py | 168 +++++++++++++++++++++++++++++++-- tests/unit/test_events.py | 1 - 6 files changed, 197 insertions(+), 44 deletions(-) diff --git a/core/dbt/clients/system.py b/core/dbt/clients/system.py index 6c72fadea52..ab5413695e9 100644 --- a/core/dbt/clients/system.py +++ b/core/dbt/clients/system.py @@ -16,7 +16,6 @@ from dbt.events.functions import fire_event from dbt.events.types import ( - SystemErrorRetrievingModTime, SystemCouldNotWrite, SystemExecutingCmd, SystemStdOut, @@ -75,11 +74,7 @@ def find_matching( relative_path = os.path.relpath(absolute_path, absolute_path_to_search) relative_path_to_root = os.path.join(relative_path_to_search, relative_path) - modification_time = 0.0 - try: - modification_time = os.path.getmtime(absolute_path) - except OSError: - fire_event(SystemErrorRetrievingModTime(path=absolute_path)) + modification_time = os.path.getmtime(absolute_path) if reobj.match(local_file) and ( not ignore_spec or not ignore_spec.match_file(relative_path_to_root) ): diff --git a/core/dbt/events/proto_types.py b/core/dbt/events/proto_types.py index 84c01cac101..471fb97dece 100644 --- a/core/dbt/events/proto_types.py +++ b/core/dbt/events/proto_types.py @@ -975,6 +975,20 @@ class ParseCmdOutMsg(betterproto.Message): data: "ParseCmdOut" = betterproto.message_field(2) +@dataclass +class InputFileDiffError(betterproto.Message): + """I002""" + + category: str = betterproto.string_field(1) + file_id: str = betterproto.string_field(2) + + +@dataclass +class InputFileDiffErrorMsg(betterproto.Message): + info: "EventInfo" = betterproto.message_field(1) + data: "InputFileDiffError" = betterproto.message_field(2) + + @dataclass class GenericTestFileParse(betterproto.Message): """I011""" @@ -2224,19 +2238,6 @@ class MainStackTraceMsg(betterproto.Message): data: "MainStackTrace" = betterproto.message_field(2) -@dataclass -class SystemErrorRetrievingModTime(betterproto.Message): - """Z004""" - - path: str = betterproto.string_field(1) - - -@dataclass -class SystemErrorRetrievingModTimeMsg(betterproto.Message): - info: "EventInfo" = betterproto.message_field(1) - data: "SystemErrorRetrievingModTime" = betterproto.message_field(2) - - @dataclass class SystemCouldNotWrite(betterproto.Message): """Z005""" diff --git a/core/dbt/events/types.proto b/core/dbt/events/types.proto index 1819f357a2b..a9c06411e08 100644 --- a/core/dbt/events/types.proto +++ b/core/dbt/events/types.proto @@ -773,6 +773,17 @@ message ParseCmdOutMsg { ParseCmdOut data = 2; } +// I002 +message InputFileDiffError { + string category = 1; + string file_id = 2; +} + +message InputFileDiffErrorMsg { + EventInfo info = 1; + InputFileDiffError data = 2; +} + // Skipping I002, I003, I004, I005, I006, I007, I008, I009, I010 @@ -1787,15 +1798,7 @@ message MainStackTraceMsg { MainStackTrace data = 2; } -// Z004 -message SystemErrorRetrievingModTime { - string path = 1; -} - -message SystemErrorRetrievingModTimeMsg { - EventInfo info = 1; - SystemErrorRetrievingModTime data = 2; -} +// Skipped Z004 // Z005 message SystemCouldNotWrite { diff --git a/core/dbt/events/types.py b/core/dbt/events/types.py index 0b005b65b25..6862f88307b 100644 --- a/core/dbt/events/types.py +++ b/core/dbt/events/types.py @@ -777,6 +777,15 @@ def message(self) -> str: return self.msg +@dataclass +class InputFileDiffError(DebugLevel, pt.InputFileDiffError): + def code(self): + return "I002" + + def message(self) -> str: + return f"Error process file diff: {self.category}, {self.file_id}" + + # Skipping I002, I003, I004, I005, I006, I007, I008, I009, I010 @@ -1873,13 +1882,7 @@ def message(self) -> str: return self.stack_trace -@dataclass -class SystemErrorRetrievingModTime(ErrorLevel, pt.SystemErrorRetrievingModTime): - def code(self): - return "Z004" - - def message(self) -> str: - return f"Error retrieving modification time for file {self.path}" +# Skipped Z004 @dataclass diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 152c2686492..8361dbed8e2 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -16,24 +16,22 @@ from dbt.exceptions import ParsingError from dbt.parser.search import filesystem_search from typing import Optional, Dict, List, Mapping +from dbt.events.types import InputFileDiffError +from dbt.events.functions import fire_event @dataclass class InputFile: path: str contents: str - - -@dataclass -class ProjectFileDiff: - project_name: str - deleted_files: List[str] - changed_files: List + modification_time: Optional[float] = None @dataclass class FileDiff: - projects: List[ProjectFileDiff] + deleted_files: List[str] + changed_files: List[InputFile] + added_files: List[InputFile] # This loads the files contents and creates the SourceFile object @@ -209,6 +207,160 @@ def read_files_for_project(self, project, file_types): ) +@dataclass +class ReadFilesFromDiff: + root_project_name: str + all_projects: Mapping[str, Project] + files: Dict[str, AnySourceFile] + file_diff: FileDiff + # Is saved_files required? Could a file diff contain the entire project? + saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) + project_parser_files: Dict = field(default_factory=dict) + project_file_types: Dict = field(default_factory=dict) + + def build_files(self): + # Copy the base file information from the existing manifest. + # We will do deletions, adds, changes from the file_diff to emulate + # a complete read of the project file system. + for file_id, source_file in self.saved_files.items(): + if isinstance(source_file, SchemaSourceFile): + file_cls = SchemaSourceFile + else: + file_cls = SourceFile + new_source_file = file_cls( + path=source_file.path, + checksum=source_file.checksum, + project_name=source_file.project_name, + parse_file_type=source_file.parse_file_type, + contents=source_file.contents, + ) + self.files[file_id] = new_source_file + + # Now that we have a copy of the files, remove deleted files + # For now, we assume that all files are in the root_project, until + # we've determined whether project name will be provided or deduced + # from the directory. + for input_file_path in self.file_diff.deleted: + project_name = self.get_project_name(input_file_path) + file_id = f"{project_name}://{input_file_path}" + if file_id in self.files: + self.files.pop(file_id) + else: + fire_event(InputFileDiffError(category="deleted file not found", file_id=file_id)) + + # Now we do the changes + for input_file in self.file_diff.changed: + project_name = self.get_project_name(input_file.path) + file_id = f"{project_name}://{input_file.path}" + if file_id in self.files: + # Get the existing source_file object and update the contents and mod time + source_file = self.files[file_id] + source_file.contents = input_file.contents + source_file.checksum = FileHash.from_contents(input_file.contents) + source_file.path.modification_time = input_file.modification_time + # Handle creation of dictionary version of schema file content + if isinstance(source_file, SchemaSourceFile) and source_file.contents: + dfy = yaml_from_file(source_file) + if dfy: + validate_yaml(source_file.path.original_file_path, dfy) + source_file.dfy = dfy + # TODO: ensure we have a file object even for empty files, such as schema files + + # Now the new files + for input_file in self.file_diff.added: + project_name = self.get_project_name(input_file.path) + # FilePath + # searched_path i.e. "models" + # relative_path i.e. the part after searched_path, or "model.sql" + # modification_time float, default 0.0... + # project_root + # We use PurePath because there's no actual filesystem to look at + pure_path = pathlib.PurePath(input_file["path"]) + extension = pure_path.suffix + searched_path = pure_path.parts[0] + # check what happens with generic tests... searched_path/relative_path + + relative_path_parts = pure_path.parts[1:] + relative_path = pathlib.PurePath.joinpaths(relative_path_parts) + # Create FilePath object + input_file_path = FilePath( + searched_path=searched_path, + relative_path=relative_path, + modification_time=input_file["modification_time"], + project_root=self.all_projects[project_name].project_root, + ) + + # Now use the extension and "searched_path" to determine which file_type + (file_types, file_type_lookup) = self.get_project_file_types(project_name) + parse_ft_for_extension = set() + parse_ft_for_path = set() + if extension in file_type_lookup["extensions"]: + parse_ft_for_extension = file_type_lookup["extensions"][extension] + if searched_path in file_type_lookup["paths"]: + parse_ft_for_path = file_type_lookup["paths"][searched_path] + if len(parse_ft_for_extension) == 0 or len(parse_ft_for_path) == 0: + fire_event(InputFileDiffError(category="not a project file", file_id=file_id)) + continue + parse_ft_set = parse_ft_for_extension.intersetion(parse_ft_for_path) + if ( + len(parse_ft_set) != 1 + ): # There should only be one result for a path/extension combination + fire_event( + InputFileDiffError( + category="unable to resolve diff file location", file_id=file_id + ) + ) + continue + parse_ft = parse_ft_set.pop() + source_file_cls = SourceFile + if parse_ft == ParseFileType.Schema: + source_file_cls = SchemaSourceFile + source_file = source_file_cls( + path=input_file_path, + contents=input_file.contents, + checksum=FileHash.from_contents(input_file.contents), + project_name=project_name, + parse_file_type=parse_ft, + ) + if source_file_cls == SchemaSourceFile: + dfy = yaml_from_file(source_file) + if dfy: + validate_yaml(source_file.path.original_file_path, dfy) + source_file.dfy = dfy + else: + # don't include in files because no content + continue + self.files[source_file.file_id] = source_file + + def get_project_name(self, path): + # just return root_project_name for now + return self.root_project_name + + def get_project_file_types(self, project_name): + if project_name not in self.project_file_types: + file_types = get_file_types_for_project(self.all_projects[project_name]) + file_type_lookup = self.get_file_type_lookup(file_types) + self.project_file_types[project_name] = { + "file_types": file_types, + "file_type_lookup": file_type_lookup, + } + file_types = self.project_file_types[project_name]["file_types"] + file_type_lookup = self.project_file_types[project_name]["file_type_lookup"] + + def get_file_type_lookup(self, file_types): + file_type_lookup = {"paths": {}, "extensions": {}} + for parse_ft, file_type in file_types: + for path in file_type["paths"]: + if path not in file_type_lookup["paths"]: + file_type_lookup["paths"][path] = set() + file_type_lookup["paths"][path].add(parse_ft) + for extension in file_type["extensions"]: + if extension not in file_type_lookup["extensions"]: + file_type_lookup["extensions"][extension] = set() + file_type_lookup["extensions"][extension].add(parse_ft) + return file_type_lookup + + def get_file_types_for_project(project): file_types = { ParseFileType.Macro: { diff --git a/tests/unit/test_events.py b/tests/unit/test_events.py index c7b9ca24d66..963ba1b056d 100644 --- a/tests/unit/test_events.py +++ b/tests/unit/test_events.py @@ -336,7 +336,6 @@ def test_event_codes(self): MainKeyboardInterrupt(), MainEncounteredError(exc=""), MainStackTrace(stack_trace=""), - SystemErrorRetrievingModTime(path=""), SystemCouldNotWrite(path="", reason="", exc=""), SystemExecutingCmd(cmd=[""]), SystemStdOut(bmsg=b""), From d73e500d07bd453c20cf230f1cd0fd8aebe91337 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Sat, 4 Feb 2023 12:20:12 -0500 Subject: [PATCH 05/16] initial test working --- core/dbt/parser/manifest.py | 44 +++++++++++++---- core/dbt/parser/read_files.py | 48 ++++++++++--------- core/dbt/tests/util.py | 5 ++ .../partial_parsing/test_file_diff.py | 22 +++++++++ 4 files changed, 88 insertions(+), 31 deletions(-) create mode 100644 tests/functional/partial_parsing/test_file_diff.py diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py index a261a1346c1..6359a43cc97 100644 --- a/core/dbt/parser/manifest.py +++ b/core/dbt/parser/manifest.py @@ -37,14 +37,19 @@ from dbt.node_types import NodeType from dbt.clients.jinja import get_rendered, MacroStack from dbt.clients.jinja_static import statically_extract_macro_calls -from dbt.clients.system import make_directory +from dbt.clients.system import make_directory, path_exists, read_json from dbt.config import Project, RuntimeConfig from dbt.context.docs import generate_runtime_docs_context from dbt.context.macro_resolver import MacroResolver, TestMacroNamespace from dbt.context.configured import generate_macro_context from dbt.context.providers import ParseProvider from dbt.contracts.files import FileHash, ParseFileType, SchemaSourceFile -from dbt.parser.read_files import ReadFilesFromFileSystem, load_source_file, FileDiff +from dbt.parser.read_files import ( + ReadFilesFromFileSystem, + load_source_file, + FileDiff, + ReadFilesFromDiff, +) from dbt.parser.partial import PartialParsing, special_override_macros from dbt.contracts.graph.manifest import ( Manifest, @@ -152,6 +157,7 @@ def __init__( ) -> None: self.root_project: RuntimeConfig = root_project self.all_projects: Mapping[str, Project] = all_projects + self.file_diff = file_diff self.manifest: Manifest = Manifest() self.new_manifest = self.manifest self.manifest.metadata = root_project.get_metadata() @@ -198,12 +204,19 @@ def get_full_manifest( adapter.clear_macro_manifest() macro_hook = adapter.connections.set_query_header + # Hack to test file_diffs + if os.environ.get("DBT_PP_FILE_DIFF_TEST"): + file_diff_path = "file_diff.json" + if path_exists(file_diff_path): + file_diff_dct = read_json(file_diff_path) + file_diff = FileDiff.from_dict(file_diff_dct) + with PARSING_STATE: # set up logbook.Processor for parsing # Start performance counting start_load_all = time.perf_counter() projects = config.load_dependencies() - loader = cls(config, projects, macro_hook) + loader = cls(config, projects, macro_hook=macro_hook, file_diff=file_diff) manifest = loader.load() @@ -223,18 +236,33 @@ def get_full_manifest( # This is where the main action happens def load(self): start_read_files = time.perf_counter() + # This updates the "files" dictionary in self.manifest, and creates # the partial_parser_files dictionary (see read_files.py), # which is a dictionary of projects to a dictionary # of parsers to lists of file strings. The file strings are # used to get the SourceFiles from the manifest files. saved_files = self.saved_manifest.files if self.saved_manifest else {} - file_reader = ReadFilesFromFileSystem( - all_projects=self.all_projects, - files=self.manifest.files, - saved_files=saved_files, - ) + if self.file_diff: + # We're getting files from a file diff + file_reader = ReadFilesFromDiff( + all_projects=self.all_projects, + files=self.manifest.files, + saved_files=saved_files, + root_project_name=self.root_project.project_name, + file_diff=self.file_diff, + ) + else: + # We're getting files from a file_diff structure + file_reader = ReadFilesFromFileSystem( + all_projects=self.all_projects, + files=self.manifest.files, + saved_files=saved_files, + ) + + # Set the files in the manifest and save the project_parser_files file_reader.read_files() + self.manifest.files = file_reader.files project_parser_files = orig_project_parser_files = file_reader.project_parser_files self._perf_info.path_count = len(self.manifest.files) self._perf_info.read_files_elapsed = time.perf_counter() - start_read_files diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 8361dbed8e2..d218a15e9c3 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -12,6 +12,7 @@ SchemaSourceFile, ) from dbt.config import Project +from dbt.dataclass_schema import dbtClassMixin from dbt.parser.schemas import yaml_from_file, schema_file_keys, check_format_version from dbt.exceptions import ParsingError from dbt.parser.search import filesystem_search @@ -21,17 +22,17 @@ @dataclass -class InputFile: +class InputFile(dbtClassMixin): path: str - contents: str - modification_time: Optional[float] = None + content: str + modification_time: float = 0.0 @dataclass -class FileDiff: - deleted_files: List[str] - changed_files: List[InputFile] - added_files: List[InputFile] +class FileDiff(dbtClassMixin): + deleted: List[str] + changed: List[InputFile] + added: List[InputFile] # This loads the files contents and creates the SourceFile object @@ -176,7 +177,7 @@ class ReadFilesFromFileSystem: all_projects: Mapping[str, Project] # This is a reference to the "files" dictionary in the current manifest, so the # manifest in implicitly updated by this code. - files: Dict[str, AnySourceFile] + files: Dict[str, AnySourceFile] = field(default_factory=dict) # saved_files is only used to compare schema files saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) # project_parser_files = { @@ -211,14 +212,14 @@ def read_files_for_project(self, project, file_types): class ReadFilesFromDiff: root_project_name: str all_projects: Mapping[str, Project] - files: Dict[str, AnySourceFile] file_diff: FileDiff + files: Dict[str, AnySourceFile] = field(default_factory=dict) # Is saved_files required? Could a file diff contain the entire project? saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) project_parser_files: Dict = field(default_factory=dict) project_file_types: Dict = field(default_factory=dict) - def build_files(self): + def read_files(self): # Copy the base file information from the existing manifest. # We will do deletions, adds, changes from the file_diff to emulate # a complete read of the project file system. @@ -255,8 +256,8 @@ def build_files(self): if file_id in self.files: # Get the existing source_file object and update the contents and mod time source_file = self.files[file_id] - source_file.contents = input_file.contents - source_file.checksum = FileHash.from_contents(input_file.contents) + source_file.contents = input_file.content + source_file.checksum = FileHash.from_contents(input_file.content) source_file.path.modification_time = input_file.modification_time # Handle creation of dictionary version of schema file content if isinstance(source_file, SchemaSourceFile) and source_file.contents: @@ -275,18 +276,18 @@ def build_files(self): # modification_time float, default 0.0... # project_root # We use PurePath because there's no actual filesystem to look at - pure_path = pathlib.PurePath(input_file["path"]) - extension = pure_path.suffix - searched_path = pure_path.parts[0] + input_file_path = pathlib.PurePath(input_file.path) + extension = input_file_path.suffix + searched_path = input_file_path.parts[0] # check what happens with generic tests... searched_path/relative_path - relative_path_parts = pure_path.parts[1:] - relative_path = pathlib.PurePath.joinpaths(relative_path_parts) + relative_path_parts = input_file_path.parts[1:] + relative_path = pathlib.PurePath("").joinpath(*relative_path_parts) # Create FilePath object input_file_path = FilePath( searched_path=searched_path, - relative_path=relative_path, - modification_time=input_file["modification_time"], + relative_path=str(relative_path), + modification_time=input_file.modification_time, project_root=self.all_projects[project_name].project_root, ) @@ -301,7 +302,7 @@ def build_files(self): if len(parse_ft_for_extension) == 0 or len(parse_ft_for_path) == 0: fire_event(InputFileDiffError(category="not a project file", file_id=file_id)) continue - parse_ft_set = parse_ft_for_extension.intersetion(parse_ft_for_path) + parse_ft_set = parse_ft_for_extension.intersection(parse_ft_for_path) if ( len(parse_ft_set) != 1 ): # There should only be one result for a path/extension combination @@ -317,8 +318,8 @@ def build_files(self): source_file_cls = SchemaSourceFile source_file = source_file_cls( path=input_file_path, - contents=input_file.contents, - checksum=FileHash.from_contents(input_file.contents), + contents=input_file.content, + checksum=FileHash.from_contents(input_file.content), project_name=project_name, parse_file_type=parse_ft, ) @@ -346,10 +347,11 @@ def get_project_file_types(self, project_name): } file_types = self.project_file_types[project_name]["file_types"] file_type_lookup = self.project_file_types[project_name]["file_type_lookup"] + return (file_types, file_type_lookup) def get_file_type_lookup(self, file_types): file_type_lookup = {"paths": {}, "extensions": {}} - for parse_ft, file_type in file_types: + for parse_ft, file_type in file_types.items(): for path in file_type["paths"]: if path not in file_type_lookup["paths"]: file_type_lookup["paths"][path] = set() diff --git a/core/dbt/tests/util.py b/core/dbt/tests/util.py index 245648ceb48..39274476adc 100644 --- a/core/dbt/tests/util.py +++ b/core/dbt/tests/util.py @@ -164,6 +164,11 @@ def get_artifact(*paths): return dct +def write_artifact(dct, *paths): + json_output = json.dumps(dct) + write_file(json_output, *paths) + + # For updating yaml config files def update_config_file(updates, *paths): current_yaml = read_file(*paths) diff --git a/tests/functional/partial_parsing/test_file_diff.py b/tests/functional/partial_parsing/test_file_diff.py new file mode 100644 index 00000000000..a2ca6628285 --- /dev/null +++ b/tests/functional/partial_parsing/test_file_diff.py @@ -0,0 +1,22 @@ +import os + +from dbt.tests.util import run_dbt, write_artifact + +first_file_diff = { + "deleted": [], + "changed": [], + "added": [{"path": "models/model_one.sql", "content": "select 1 as fun"}], +} + + +class TestFileDiffs: + def test_file_diffs(self, project): + + os.environ["DBT_PP_FILE_DIFF_TEST"] = "true" + + # We start with an empty project + results = run_dbt() + + write_artifact(first_file_diff, "file_diff.json") + results = run_dbt() + assert len(results) == 1 From 8f52b31c38b975002100de0cc966139559985f66 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Mon, 6 Feb 2023 08:45:22 -0500 Subject: [PATCH 06/16] add another model to test --- tests/functional/partial_parsing/test_file_diff.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/functional/partial_parsing/test_file_diff.py b/tests/functional/partial_parsing/test_file_diff.py index a2ca6628285..4d3db5486ad 100644 --- a/tests/functional/partial_parsing/test_file_diff.py +++ b/tests/functional/partial_parsing/test_file_diff.py @@ -9,6 +9,13 @@ } +second_file_diff = { + "deleted": [], + "changed": [], + "added": [{"path": "models/model_two.sql", "content": "select 123 as notfun"}], +} + + class TestFileDiffs: def test_file_diffs(self, project): @@ -20,3 +27,7 @@ def test_file_diffs(self, project): write_artifact(first_file_diff, "file_diff.json") results = run_dbt() assert len(results) == 1 + + write_artifact(second_file_diff, "file_diff.json") + results = run_dbt() + assert len(results) == 2 From 88f1982948d68ce66ef9167fbaba4ed133e3be7d Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Mon, 6 Feb 2023 08:47:57 -0500 Subject: [PATCH 07/16] Changie --- .changes/unreleased/Features-20230206-084749.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Features-20230206-084749.yaml diff --git a/.changes/unreleased/Features-20230206-084749.yaml b/.changes/unreleased/Features-20230206-084749.yaml new file mode 100644 index 00000000000..53d867ed403 --- /dev/null +++ b/.changes/unreleased/Features-20230206-084749.yaml @@ -0,0 +1,6 @@ +kind: Features +body: Enable diff based partial parsing +time: 2023-02-06T08:47:49.688889-05:00 +custom: + Author: gshank + Issue: "6592" From 5d028c0968c8141e8689e8e8070840de05dcd992 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Tue, 7 Feb 2023 15:07:59 -0500 Subject: [PATCH 08/16] minor updates --- core/dbt/contracts/files.py | 2 -- core/dbt/events/types.py | 2 +- core/dbt/parser/read_files.py | 11 +++++------ tests/unit/test_events.py | 1 + 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/core/dbt/contracts/files.py b/core/dbt/contracts/files.py index 93f12a1411e..a7f1812aa3c 100644 --- a/core/dbt/contracts/files.py +++ b/core/dbt/contracts/files.py @@ -61,8 +61,6 @@ def absolute_path(self) -> str: @property def original_file_path(self) -> str: - # this is mostly used for reporting errors. It doesn't show the project - # name, should it? return os.path.join(self.searched_path, self.relative_path) def seed_too_large(self) -> bool: diff --git a/core/dbt/events/types.py b/core/dbt/events/types.py index 6862f88307b..ced6f1a6f8f 100644 --- a/core/dbt/events/types.py +++ b/core/dbt/events/types.py @@ -783,7 +783,7 @@ def code(self): return "I002" def message(self) -> str: - return f"Error process file diff: {self.category}, {self.file_id}" + return f"Error processing file diff: {self.category}, {self.file_id}" # Skipping I002, I003, I004, I005, I006, I007, I008, I009, I010 diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index d218a15e9c3..f2e284537a2 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -31,6 +31,8 @@ class InputFile(dbtClassMixin): @dataclass class FileDiff(dbtClassMixin): deleted: List[str] + # Note: it would be possible to not distinguish between + # added and changed files, but we would lose some error handling. changed: List[InputFile] added: List[InputFile] @@ -68,16 +70,14 @@ def load_source_file( if not skip_loading_schema_file: file_contents = load_file_contents(path.absolute_path, strip=False) - source_file.checksum = FileHash.from_contents(file_contents) source_file.contents = file_contents.strip() + source_file.checksum = FileHash.from_contents(source_file.contents) if parse_file_type == ParseFileType.Schema and source_file.contents: dfy = yaml_from_file(source_file) if dfy: validate_yaml(source_file.path.original_file_path, dfy) source_file.dfy = dfy - else: - source_file = None return source_file @@ -175,8 +175,6 @@ def generate_dbt_ignore_spec(project_root): @dataclass class ReadFilesFromFileSystem: all_projects: Mapping[str, Project] - # This is a reference to the "files" dictionary in the current manifest, so the - # manifest in implicitly updated by this code. files: Dict[str, AnySourceFile] = field(default_factory=dict) # saved_files is only used to compare schema files saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) @@ -214,7 +212,8 @@ class ReadFilesFromDiff: all_projects: Mapping[str, Project] file_diff: FileDiff files: Dict[str, AnySourceFile] = field(default_factory=dict) - # Is saved_files required? Could a file diff contain the entire project? + # saved_files is used to construct a fresh copy of files, without + # additional information from parsing saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) project_parser_files: Dict = field(default_factory=dict) project_file_types: Dict = field(default_factory=dict) diff --git a/tests/unit/test_events.py b/tests/unit/test_events.py index 963ba1b056d..7cb98e0f8ff 100644 --- a/tests/unit/test_events.py +++ b/tests/unit/test_events.py @@ -178,6 +178,7 @@ def test_event_codes(self): FinishedRunningStats(stat_line="", execution="", execution_time=0), # I - Project parsing ====================== ParseCmdOut(msg="testing"), + InputFileDiffError(category="failed"), GenericTestFileParse(path=""), MacroFileParse(path=""), PartialParsingErrorProcessingFile(file=""), From 2846e493cce183f0743118679dffe0fcb382b20f Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Wed, 8 Feb 2023 11:20:39 -0500 Subject: [PATCH 09/16] Add some dependencies to test_file_diff.py to test finding project directories (not done) --- core/dbt/parser/read_files.py | 2 + .../partial_parsing/test_file_diff.py | 120 +++++++++++++++++- 2 files changed, 120 insertions(+), 2 deletions(-) diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index f2e284537a2..2cbb6974597 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -334,6 +334,8 @@ def read_files(self): def get_project_name(self, path): # just return root_project_name for now + for project_name, project in self.all_projects.items(): + print(f"--- project_name: {project_name}, project_root: {project.project_root}") return self.root_project_name def get_project_file_types(self, project_name): diff --git a/tests/functional/partial_parsing/test_file_diff.py b/tests/functional/partial_parsing/test_file_diff.py index 4d3db5486ad..729ccd45712 100644 --- a/tests/functional/partial_parsing/test_file_diff.py +++ b/tests/functional/partial_parsing/test_file_diff.py @@ -1,6 +1,72 @@ import os +import pytest from dbt.tests.util import run_dbt, write_artifact +from dbt.tests.fixtures.project import write_project_files + + +local_dependency__dbt_project_yml = """ +name: 'local_dep' +version: '1.0' +config-version: 2 + +profile: 'default' + +require-dbt-version: '>=0.1.0' + +target-path: "target" # directory which will store compiled SQL files +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + +seeds: + quote_columns: False + +""" + +local_dependency__models__schema_yml = """ +version: 2 +sources: + - name: seed_source + schema: schema + tables: + - name: "local_seed" + columns: + - name: id + tests: + - unique + +seeds: + - name: local_seed + - name: seed + config: + column_types: + id: integer + first_name: varchar(11) + email: varchar(31) + ip_address: varchar(15) + updated_at: timestamp without time zone + +""" + +local_dependency__models__model_to_import_sql = """ +select * from {{ ref('seed') }} + +""" + +local_dependency__seeds__seed_csv = """id +1 +""" + +# This seed is to make the dbt_integration_project work, but is actually installed +# by the local_dependency +integration_project__seed_csv = """id,first_name,email,ip_address,updated_at +1,Larry,lking0@miitbeian.gov.cn,69.135.206.194,2008-09-12 19:08:31 +2,Larry,lperkins1@toplist.cz,64.210.133.162,1978-05-09 04:15:14 +3,Anna,amontgomery2@miitbeian.gov.cn,168.104.64.114,2011-10-16 04:07:57 +4,Sandra,sgeorge3@livejournal.com,229.235.252.98,1973-07-19 10:52:43 +""" + first_file_diff = { "deleted": [], @@ -17,17 +83,67 @@ class TestFileDiffs: + # We need to be able to test changes to dependencies + @pytest.fixture(scope="class", autouse=True) + def setUp(self, project_root): + local_dependency_files = { + "dbt_project.yml": local_dependency__dbt_project_yml, + "models": { + "schema.yml": local_dependency__models__schema_yml, + "model_to_import.sql": local_dependency__models__model_to_import_sql, + }, + "seeds": { + "local_seed.csv": local_dependency__seeds__seed_csv, + "seed.csv": integration_project__seed_csv, + }, + } + write_project_files(project_root, "local_dependency", local_dependency_files) + + @pytest.fixture(scope="class") + def packages(self): + return { + "packages": [ + { + "git": "https://github.com/dbt-labs/dbt-integration-project", + "revision": "1.1", + }, + {"local": "local_dependency"}, + ] + } + + # @pytest.fixture(scope="class") + # def project_config_update(self): + # return { + # "seeds": { + # "quote_columns": False, + # "test": { + # "seed": { + # "+column_types": { + # "id": "integer", + # "first_name": "varchar(11)", + # "email": "varchar(31)", + # "ip_address": "varchar(15)", + # "updated_at": "timestamp without time zone", + # }, + # }, + # }, + # }, + # } + def test_file_diffs(self, project): os.environ["DBT_PP_FILE_DIFF_TEST"] = "true" + run_dbt(["deps"]) + run_dbt(["seed"]) + # We start with an empty project results = run_dbt() write_artifact(first_file_diff, "file_diff.json") results = run_dbt() - assert len(results) == 1 + assert len(results) == 5 write_artifact(second_file_diff, "file_diff.json") results = run_dbt() - assert len(results) == 2 + assert len(results) == 6 From 439b449600d57cb39712648174f1a88654229ce2 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Wed, 8 Feb 2023 16:54:46 -0500 Subject: [PATCH 10/16] Get information on packages, not working... --- core/dbt/parser/read_files.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 2cbb6974597..a9c4ba16dcb 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -11,6 +11,7 @@ AnySourceFile, SchemaSourceFile, ) +from dbt.contracts.project import LocalPackage from dbt.config import Project from dbt.dataclass_schema import dbtClassMixin from dbt.parser.schemas import yaml_from_file, schema_file_keys, check_format_version @@ -217,6 +218,7 @@ class ReadFilesFromDiff: saved_files: Dict[str, AnySourceFile] = field(default_factory=dict) project_parser_files: Dict = field(default_factory=dict) project_file_types: Dict = field(default_factory=dict) + local_package_dirs: Optional[List[str]] = None def read_files(self): # Copy the base file information from the existing manifest. @@ -333,9 +335,24 @@ def read_files(self): self.files[source_file.file_id] = source_file def get_project_name(self, path): - # just return root_project_name for now - for project_name, project in self.all_projects.items(): - print(f"--- project_name: {project_name}, project_root: {project.project_root}") + if self.local_package_dirs is None: + packages = self.all_projects[self.root_project_name].packages + self.local_package_dirs = [] + for package_spec in packages.packages: + if isinstance(package_spec, LocalPackage): + self.local_package_dirs.append(package_spec.local) + if len(self.local_package_dirs) == 0: + return self.root_project_name + + # Check whether this change is to a local package + input_file_path = pathlib.PurePath(path) + input_file_path_parts = input_file_path.parts + for local_package_dir in self.local_package_dirs: + if local_package_dir in input_file_path_parts: + pass + # we need the package_name of the local package_dir and + # there's currently no way to get it + return self.root_project_name def get_project_file_types(self, project_name): From a5aac4a4f3f1e3cd2f9961ed826e8707f0bb8bf5 Mon Sep 17 00:00:00 2001 From: Github Build Bot Date: Fri, 10 Feb 2023 22:06:56 +0000 Subject: [PATCH 11/16] Add generated CLI API docs --- .../docs/build/doctrees/environment.pickle | Bin 207366 -> 207366 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/core/dbt/docs/build/doctrees/environment.pickle b/core/dbt/docs/build/doctrees/environment.pickle index f999a716d9dd676cce7d8486fd6fbdde74859b15..6a147490f375de0f442ace19aae6ca9b04063a40 100644 GIT binary patch delta 32 mcmZp>!qawzXG60*>-zr>F1&5-kZ!qawzXG60*>zv>g` Date: Fri, 3 Mar 2023 10:30:58 -0500 Subject: [PATCH 12/16] Fix merge errors --- core/dbt/events/proto_types.py | 1 + tests/unit/test_events.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/core/dbt/events/proto_types.py b/core/dbt/events/proto_types.py index 273d26a6957..c7ad5fdf83f 100644 --- a/core/dbt/events/proto_types.py +++ b/core/dbt/events/proto_types.py @@ -976,6 +976,7 @@ class InputFileDiffErrorMsg(betterproto.Message): data: "InputFileDiffError" = betterproto.message_field(2) +@dataclass class InvalidValueForField(betterproto.Message): """I008""" diff --git a/tests/unit/test_events.py b/tests/unit/test_events.py index e70a095fa2b..3487ce737cb 100644 --- a/tests/unit/test_events.py +++ b/tests/unit/test_events.py @@ -172,6 +172,7 @@ def test_event_codes(self): types.HooksRunning(num_hooks=0, hook_type=""), types.FinishedRunningStats(stat_line="", execution="", execution_time=0), # I - Project parsing ====================== + types.InputFileDiffError(category="testing", file_id="my_file"), types.InvalidValueForField(field_name="test", field_value="test"), types.ValidationWarning(resource_type="model", field_name="access", node_name="my_macro"), types.ParsePerfInfoPath(path=""), @@ -333,7 +334,6 @@ def test_event_codes(self): types.MainKeyboardInterrupt(), types.MainEncounteredError(exc=""), types.MainStackTrace(stack_trace=""), - types.SystemErrorRetrievingModTime(path=""), types.SystemCouldNotWrite(path="", reason="", exc=""), types.SystemExecutingCmd(cmd=[""]), types.SystemStdOut(bmsg=b""), From dd1bd9216890761189081465bb3683e19c2fe895 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Mar 2023 11:08:38 -0500 Subject: [PATCH 13/16] Fix checksums in artifacts to match change in strip --- core/dbt/parser/read_files.py | 8 +++++--- tests/functional/artifacts/expected_manifest.py | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index a9c4ba16dcb..5780df94e3e 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -70,8 +70,10 @@ def load_source_file( skip_loading_schema_file = True if not skip_loading_schema_file: - file_contents = load_file_contents(path.absolute_path, strip=False) - source_file.contents = file_contents.strip() + # We strip the file_contents before generating the checksum because we want + # the checksum to match the stored file contents + file_contents = load_file_contents(path.absolute_path, strip=True) + source_file.contents = file_contents source_file.checksum = FileHash.from_contents(source_file.contents) if parse_file_type == ParseFileType.Schema and source_file.contents: @@ -117,7 +119,7 @@ def load_seed_source_file(match: FilePath, project_name) -> SourceFile: # We don't want to calculate a hash of this file. Use the path. source_file = SourceFile.big_seed(match) else: - file_contents = load_file_contents(match.absolute_path, strip=False) + file_contents = load_file_contents(match.absolute_path, strip=True) checksum = FileHash.from_contents(file_contents) source_file = SourceFile(path=match, checksum=checksum) source_file.contents = "" diff --git a/tests/functional/artifacts/expected_manifest.py b/tests/functional/artifacts/expected_manifest.py index a7ebf270ae6..5f058f5db09 100644 --- a/tests/functional/artifacts/expected_manifest.py +++ b/tests/functional/artifacts/expected_manifest.py @@ -167,7 +167,8 @@ def checksum_file(path): silly things if we just open(..., 'r').encode('utf-8'). """ with open(path, "rb") as fp: - hashed = hashlib.sha256(fp.read()).hexdigest() + # We strip the file contents because we want the checksum to match the store contents + hashed = hashlib.sha256(fp.read().strip()).hexdigest() return { "name": "sha256", "checksum": hashed, From d57e862416d8bfef517ebbecaa76a5037a5e9177 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Mar 2023 11:14:28 -0500 Subject: [PATCH 14/16] Remove reference to deleted_manifest --- core/dbt/parser/partial.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/dbt/parser/partial.py b/core/dbt/parser/partial.py index 571f95cfc03..83280d3a780 100644 --- a/core/dbt/parser/partial.py +++ b/core/dbt/parser/partial.py @@ -924,9 +924,7 @@ def delete_schema_group(self, schema_file, group_dict): group = self.saved_manifest.groups[unique_id] if group.name == group_name: self.schedule_nodes_for_parsing(self.saved_manifest.group_map[group.name]) - self.deleted_manifest.groups[unique_id] = self.saved_manifest.groups.pop( - unique_id - ) + self.saved_manifest.groups.pop(unique_id) schema_file.groups.remove(unique_id) # metrics are created only from schema files, but also can be referred to by other nodes From ce1b3ee28d3b0f025e1c6d6890576839e14b2b28 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Fri, 3 Mar 2023 13:00:32 -0500 Subject: [PATCH 15/16] Remove unused code exploring local dependency files --- core/dbt/parser/manifest.py | 2 +- core/dbt/parser/read_files.py | 21 +--- .../functional/artifacts/expected_manifest.py | 2 +- .../partial_parsing/test_file_diff.py | 116 +----------------- 4 files changed, 6 insertions(+), 135 deletions(-) diff --git a/core/dbt/parser/manifest.py b/core/dbt/parser/manifest.py index 483d7f7fa63..afe078181c4 100644 --- a/core/dbt/parser/manifest.py +++ b/core/dbt/parser/manifest.py @@ -262,7 +262,7 @@ def load(self): file_diff=self.file_diff, ) else: - # We're getting files from a file_diff structure + # We're getting files from the file system file_reader = ReadFilesFromFileSystem( all_projects=self.all_projects, files=self.manifest.files, diff --git a/core/dbt/parser/read_files.py b/core/dbt/parser/read_files.py index 5780df94e3e..92a91070594 100644 --- a/core/dbt/parser/read_files.py +++ b/core/dbt/parser/read_files.py @@ -11,7 +11,6 @@ AnySourceFile, SchemaSourceFile, ) -from dbt.contracts.project import LocalPackage from dbt.config import Project from dbt.dataclass_schema import dbtClassMixin from dbt.parser.schemas import yaml_from_file, schema_file_keys, check_format_version @@ -337,24 +336,8 @@ def read_files(self): self.files[source_file.file_id] = source_file def get_project_name(self, path): - if self.local_package_dirs is None: - packages = self.all_projects[self.root_project_name].packages - self.local_package_dirs = [] - for package_spec in packages.packages: - if isinstance(package_spec, LocalPackage): - self.local_package_dirs.append(package_spec.local) - if len(self.local_package_dirs) == 0: - return self.root_project_name - - # Check whether this change is to a local package - input_file_path = pathlib.PurePath(path) - input_file_path_parts = input_file_path.parts - for local_package_dir in self.local_package_dirs: - if local_package_dir in input_file_path_parts: - pass - # we need the package_name of the local package_dir and - # there's currently no way to get it - + # It's not currently possible to recognize any other project files, + # and it's an open issue how to handle deps. return self.root_project_name def get_project_file_types(self, project_name): diff --git a/tests/functional/artifacts/expected_manifest.py b/tests/functional/artifacts/expected_manifest.py index 5f058f5db09..1d36c538632 100644 --- a/tests/functional/artifacts/expected_manifest.py +++ b/tests/functional/artifacts/expected_manifest.py @@ -167,7 +167,7 @@ def checksum_file(path): silly things if we just open(..., 'r').encode('utf-8'). """ with open(path, "rb") as fp: - # We strip the file contents because we want the checksum to match the store contents + # We strip the file contents because we want the checksum to match the stored contents hashed = hashlib.sha256(fp.read().strip()).hexdigest() return { "name": "sha256", diff --git a/tests/functional/partial_parsing/test_file_diff.py b/tests/functional/partial_parsing/test_file_diff.py index 729ccd45712..a9f4c8fdd09 100644 --- a/tests/functional/partial_parsing/test_file_diff.py +++ b/tests/functional/partial_parsing/test_file_diff.py @@ -1,71 +1,6 @@ import os -import pytest from dbt.tests.util import run_dbt, write_artifact -from dbt.tests.fixtures.project import write_project_files - - -local_dependency__dbt_project_yml = """ -name: 'local_dep' -version: '1.0' -config-version: 2 - -profile: 'default' - -require-dbt-version: '>=0.1.0' - -target-path: "target" # directory which will store compiled SQL files -clean-targets: # directories to be removed by `dbt clean` - - "target" - - "dbt_packages" - -seeds: - quote_columns: False - -""" - -local_dependency__models__schema_yml = """ -version: 2 -sources: - - name: seed_source - schema: schema - tables: - - name: "local_seed" - columns: - - name: id - tests: - - unique - -seeds: - - name: local_seed - - name: seed - config: - column_types: - id: integer - first_name: varchar(11) - email: varchar(31) - ip_address: varchar(15) - updated_at: timestamp without time zone - -""" - -local_dependency__models__model_to_import_sql = """ -select * from {{ ref('seed') }} - -""" - -local_dependency__seeds__seed_csv = """id -1 -""" - -# This seed is to make the dbt_integration_project work, but is actually installed -# by the local_dependency -integration_project__seed_csv = """id,first_name,email,ip_address,updated_at -1,Larry,lking0@miitbeian.gov.cn,69.135.206.194,2008-09-12 19:08:31 -2,Larry,lperkins1@toplist.cz,64.210.133.162,1978-05-09 04:15:14 -3,Anna,amontgomery2@miitbeian.gov.cn,168.104.64.114,2011-10-16 04:07:57 -4,Sandra,sgeorge3@livejournal.com,229.235.252.98,1973-07-19 10:52:43 -""" first_file_diff = { @@ -83,53 +18,6 @@ class TestFileDiffs: - # We need to be able to test changes to dependencies - @pytest.fixture(scope="class", autouse=True) - def setUp(self, project_root): - local_dependency_files = { - "dbt_project.yml": local_dependency__dbt_project_yml, - "models": { - "schema.yml": local_dependency__models__schema_yml, - "model_to_import.sql": local_dependency__models__model_to_import_sql, - }, - "seeds": { - "local_seed.csv": local_dependency__seeds__seed_csv, - "seed.csv": integration_project__seed_csv, - }, - } - write_project_files(project_root, "local_dependency", local_dependency_files) - - @pytest.fixture(scope="class") - def packages(self): - return { - "packages": [ - { - "git": "https://github.com/dbt-labs/dbt-integration-project", - "revision": "1.1", - }, - {"local": "local_dependency"}, - ] - } - - # @pytest.fixture(scope="class") - # def project_config_update(self): - # return { - # "seeds": { - # "quote_columns": False, - # "test": { - # "seed": { - # "+column_types": { - # "id": "integer", - # "first_name": "varchar(11)", - # "email": "varchar(31)", - # "ip_address": "varchar(15)", - # "updated_at": "timestamp without time zone", - # }, - # }, - # }, - # }, - # } - def test_file_diffs(self, project): os.environ["DBT_PP_FILE_DIFF_TEST"] = "true" @@ -142,8 +30,8 @@ def test_file_diffs(self, project): write_artifact(first_file_diff, "file_diff.json") results = run_dbt() - assert len(results) == 5 + assert len(results) == 1 write_artifact(second_file_diff, "file_diff.json") results = run_dbt() - assert len(results) == 6 + assert len(results) == 2 From c764c9fa7ee6e26c4dd776bca55fc98daa1838f4 Mon Sep 17 00:00:00 2001 From: Gerda Shank Date: Tue, 7 Mar 2023 15:21:09 -0500 Subject: [PATCH 16/16] Finish removing unused log event --- core/dbt/events/proto_types.py | 13 ------------- core/dbt/events/types.proto | 10 +--------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/core/dbt/events/proto_types.py b/core/dbt/events/proto_types.py index c7ad5fdf83f..05d7fca6c17 100644 --- a/core/dbt/events/proto_types.py +++ b/core/dbt/events/proto_types.py @@ -2267,19 +2267,6 @@ class MainStackTraceMsg(betterproto.Message): data: "MainStackTrace" = betterproto.message_field(2) -@dataclass -class SystemErrorRetrievingModTime(betterproto.Message): - """Z004""" - - path: str = betterproto.string_field(1) - - -@dataclass -class SystemErrorRetrievingModTimeMsg(betterproto.Message): - info: "EventInfo" = betterproto.message_field(1) - data: "SystemErrorRetrievingModTime" = betterproto.message_field(2) - - @dataclass class SystemCouldNotWrite(betterproto.Message): """Z005""" diff --git a/core/dbt/events/types.proto b/core/dbt/events/types.proto index 93e495f3aba..fbfb6015287 100644 --- a/core/dbt/events/types.proto +++ b/core/dbt/events/types.proto @@ -1819,15 +1819,7 @@ message MainStackTraceMsg { MainStackTrace data = 2; } -// Z004 -message SystemErrorRetrievingModTime { - string path = 1; -} - -message SystemErrorRetrievingModTimeMsg { - EventInfo info = 1; - SystemErrorRetrievingModTime data = 2; -} +// skipping Z004 // Z005 message SystemCouldNotWrite {