diff --git a/.circleci/config.yml b/.circleci/config.yml index ff5451abfa4..95ff4f1d158 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2.1 jobs: unit: docker: &test_only - - image: fishtownanalytics/test-container:5 + - image: fishtownanalytics/test-container:7 environment: DBT_INVOCATION_ENV: circle steps: @@ -30,7 +30,7 @@ jobs: destination: dist integration-postgres-py36: docker: &test_and_postgres - - image: fishtownanalytics/test-container:5 + - image: fishtownanalytics/test-container:7 environment: DBT_INVOCATION_ENV: circle - image: postgres diff --git a/CHANGELOG.md b/CHANGELOG.md index efbd58c1dd2..6b2acc853d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,9 +10,12 @@ - Fixed a number of issues with globally-scoped vars ([#2473](https://github.com/fishtown-analytics/dbt/issues/2473), [#2472](https://github.com/fishtown-analytics/dbt/issues/2472), [#2469](https://github.com/fishtown-analytics/dbt/issues/2469), [#2477](https://github.com/fishtown-analytics/dbt/pull/2477)) - Fixed DBT Docker entrypoint ([#2470](https://github.com/fishtown-analytics/dbt/issues/2470), [#2475](https://github.com/fishtown-analytics/dbt/pull/2475)) - Fixed a performance regression that occurred even when a user was not using the relevant feature ([#2474](https://github.com/fishtown-analytics/dbt/issues/2474), [#2478](https://github.com/fishtown-analytics/dbt/pull/2478)) +- Substantial performance improvements for parsing on large projects, especially projects with many docs definition. ([#2480](https://github.com/fishtown-analytics/dbt/issues/2480), [#2481](https://github.com/fishtown-analytics/dbt/pull/2481)) +- Expose Snowflake query id in case of an exception raised by connector ([#2201](https://github.com/fishtown-analytics/dbt/issues/2201), [#2358](https://github.com/fishtown-analytics/dbt/pull/2358)) Contributors: - [@dmateusp](https://github.com/dmateusp) ([#2475](https://github.com/fishtown-analytics/dbt/pull/2475)) +- [@ChristianKohlberg](https://github.com/ChristianKohlberg) (#2358](https://github.com/fishtown-analytics/dbt/pull/2358)) ## dbt 0.17.0rc1 (May 12, 2020) @@ -67,7 +70,6 @@ Contributors: - Add a 'depends_on' attribute to the log record extra field ([#2316](https://github.com/fishtown-analytics/dbt/issues/2316), [#2341](https://github.com/fishtown-analytics/dbt/pull/2341)) - Added a '--no-browser' argument to "dbt docs serve" so you can serve docs in an environment that only has a CLI browser which would otherwise deadlock dbt ([#2004](https://github.com/fishtown-analytics/dbt/issues/2004), [#2364](https://github.com/fishtown-analytics/dbt/pull/2364)) - Snowflake now uses "describe table" to get the columns in a relation ([#2260](https://github.com/fishtown-analytics/dbt/issues/2260), [#2324](https://github.com/fishtown-analytics/dbt/pull/2324)) -- Expose Snowflake query id in case of an exception raised by connector ([#2201](https://github.com/fishtown-analytics/dbt/issues/2201), [#2358](https://github.com/fishtown-analytics/dbt/pull/2358)) - Sources (and therefore freshness tests) can be enabled and disabled via dbt_project.yml ([#2283](https://github.com/fishtown-analytics/dbt/issues/2283), [#2312](https://github.com/fishtown-analytics/dbt/pull/2312), [#2357](https://github.com/fishtown-analytics/dbt/pull/2357)) - schema.yml files are now fully rendered in a context that is aware of vars declared in from dbt_project.yml files ([#2269](https://github.com/fishtown-analytics/dbt/issues/2269), [#2357](https://github.com/fishtown-analytics/dbt/pull/2357)) - Sources from dependencies can be overridden in schema.yml files ([#2287](https://github.com/fishtown-analytics/dbt/issues/2287), [#2357](https://github.com/fishtown-analytics/dbt/pull/2357)) diff --git a/Dockerfile b/Dockerfile index 3c9b0544bfc..372fe145434 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ RUN apt-get update && \ apt-get install -y --no-install-recommends \ netcat postgresql curl git ssh software-properties-common \ make build-essential ca-certificates libpq-dev \ - libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit \ + libsasl2-dev libsasl2-2 libsasl2-modules-gssapi-mit libyaml-dev \ && \ add-apt-repository ppa:deadsnakes/ppa && \ apt-get install -y \ diff --git a/core/dbt/clients/jinja.py b/core/dbt/clients/jinja.py index 2e9b9b6f09f..6138e8902c8 100644 --- a/core/dbt/clients/jinja.py +++ b/core/dbt/clients/jinja.py @@ -493,6 +493,14 @@ def _requote_result(raw_value: str, rendered: str) -> str: return f'{quote_char}{rendered}{quote_char}' +# performance note: Local benmcharking (so take it with a big grain of salt!) +# on this indicates that it is is on average slightly slower than +# checking two separate patterns, but the standard deviation is smaller with +# one pattern. The time difference between the two was ~2 std deviations, which +# is small enough that I've just chosen the more readable option. +_HAS_RENDER_CHARS_PAT = re.compile(r'({[{%]|[}%]})') + + def get_rendered( string: str, ctx: Dict[str, Any], @@ -500,6 +508,18 @@ def get_rendered( capture_macros: bool = False, native: bool = False, ) -> str: + # performance optimization: if there are no jinja control characters in the + # string, we can just return the input. Fall back to jinja if the type is + # not a string or if native rendering is enabled (so '1' -> 1, etc...) + # If this is desirable in the native env as well, we could handle the + # native=True case by passing the input string to ast.literal_eval, like + # the native renderer does. + if ( + not native and + isinstance(string, str) and + _HAS_RENDER_CHARS_PAT.search(string) is None + ): + return string template = get_template( string, ctx, diff --git a/core/dbt/clients/yaml_helper.py b/core/dbt/clients/yaml_helper.py index c4b9f9f5adb..904d8aa6043 100644 --- a/core/dbt/clients/yaml_helper.py +++ b/core/dbt/clients/yaml_helper.py @@ -1,8 +1,17 @@ +from typing import Any + import dbt.exceptions import yaml import yaml.scanner +# the C version is faster, but it doesn't always exist +YamlLoader: Any +try: + from yaml import CSafeLoader as YamlLoader +except ImportError: + from yaml import SafeLoader as YamlLoader + YAML_ERROR_MESSAGE = """ Syntax error near line {line_number} @@ -44,9 +53,13 @@ def contextualized_yaml_error(raw_contents, error): raw_error=error) +def safe_load(contents): + return yaml.load(contents, Loader=YamlLoader) + + def load_yaml_text(contents): try: - return yaml.safe_load(contents) + return safe_load(contents) except (yaml.scanner.ScannerError, yaml.YAMLError) as e: if hasattr(e, 'problem_mark'): error = contextualized_yaml_error(contents, e) diff --git a/core/dbt/context/base.py b/core/dbt/context/base.py index ed8fa800bae..33c9681c15d 100644 --- a/core/dbt/context/base.py +++ b/core/dbt/context/base.py @@ -7,6 +7,7 @@ from dbt import flags from dbt import tracking from dbt.clients.jinja import undefined_error, get_rendered +from dbt.clients import yaml_helper from dbt.contracts.graph.compiled import CompiledResource from dbt.exceptions import raise_compiler_error, MacroReturn from dbt.logger import GLOBAL_LOGGER as logger @@ -383,7 +384,7 @@ def fromyaml(value: str, default: Any = None) -> Any: -- ["good"] """ try: - return yaml.safe_load(value) + return yaml_helper.safe_load(value) except (AttributeError, ValueError, yaml.YAMLError): return default diff --git a/core/dbt/context/providers.py b/core/dbt/context/providers.py index 57f61369eb8..5c59591217b 100644 --- a/core/dbt/context/providers.py +++ b/core/dbt/context/providers.py @@ -25,6 +25,7 @@ ParsedMacro, ParsedSourceDefinition, ParsedSeedNode ) from dbt.exceptions import ( + CompilationException, InternalException, ValidationException, RuntimeException, @@ -130,6 +131,19 @@ def _repack_args( else: return [package, name] + def validate_args(self, name: str, package: Optional[str]): + if not isinstance(name, str): + raise CompilationException( + f'The name argument to ref() must be a string, got ' + f'{type(name)}' + ) + + if package is not None and not isinstance(package, str): + raise CompilationException( + f'The package argument to ref() must be a string or None, got ' + f'{type(package)}' + ) + def __call__(self, *args: str) -> RelationProxy: name: str package: Optional[str] = None @@ -140,6 +154,7 @@ def __call__(self, *args: str) -> RelationProxy: package, name = args else: ref_invalid_args(self.model, args) + self.validate_args(name, package) return self.resolve(name, package) @@ -148,12 +163,25 @@ class BaseSourceResolver(BaseResolver): def resolve(self, source_name: str, table_name: str): pass + def validate_args(self, source_name: str, table_name: str): + if not isinstance(source_name, str): + raise CompilationException( + f'The source name (first) argument to source() must be a ' + f'string, got {type(source_name)}' + ) + if not isinstance(table_name, str): + raise CompilationException( + f'The table name (second) argument to source() must be a ' + f'string, got {type(table_name)}' + ) + def __call__(self, *args: str) -> RelationProxy: if len(args) != 2: raise_compiler_error( f"source() takes exactly two arguments ({len(args)} given)", self.model ) + self.validate_args(args[0], args[1]) return self.resolve(args[0], args[1]) diff --git a/core/dbt/contracts/graph/manifest.py b/core/dbt/contracts/graph/manifest.py index e33257aaa2b..2df6c0e8dec 100644 --- a/core/dbt/contracts/graph/manifest.py +++ b/core/dbt/contracts/graph/manifest.py @@ -1,3 +1,4 @@ +import abc import enum import hashlib import os @@ -34,6 +35,138 @@ NodeEdgeMap = Dict[str, List[str]] MacroKey = Tuple[str, str] SourceKey = Tuple[str, str] +PackageName = str +DocName = str +RefName = str +UniqueID = str + + +K_T = TypeVar('K_T') +V_T = TypeVar('V_T') + + +class PackageAwareCache(Generic[K_T, V_T]): + def __init__(self, manifest: 'Manifest'): + self.storage: Dict[K_T, Dict[PackageName, UniqueID]] = {} + self._manifest = manifest + self.populate() + + @abc.abstractmethod + def populate(self): + pass + + @abc.abstractmethod + def perform_lookup(self, unique_id: UniqueID) -> V_T: + pass + + def find_cached_value( + self, key: K_T, package: Optional[PackageName] + ) -> Optional[V_T]: + unique_id = self.find_unique_id_for_package(key, package) + if unique_id is not None: + return self.perform_lookup(unique_id) + return None + + def find_unique_id_for_package( + self, key: K_T, package: Optional[PackageName] + ) -> Optional[UniqueID]: + if key not in self.storage: + return None + + pkg_dct: Mapping[PackageName, UniqueID] = self.storage[key] + + if package is None: + if not pkg_dct: + return None + else: + return next(iter(pkg_dct.values())) + elif package in pkg_dct: + return pkg_dct[package] + else: + return None + + +class DocCache(PackageAwareCache[DocName, ParsedDocumentation]): + def add_doc(self, doc: ParsedDocumentation): + if doc.name not in self.storage: + self.storage[doc.name] = {} + self.storage[doc.name][doc.package_name] = doc.unique_id + + def populate(self): + for doc in self._manifest.docs.values(): + self.add_doc(doc) + + def perform_lookup( + self, unique_id: UniqueID + ) -> ParsedDocumentation: + if unique_id not in self._manifest.docs: + raise dbt.exceptions.InternalException( + f'Doc {unique_id} found in cache but not found in manifest' + ) + return self._manifest.docs[unique_id] + + +class SourceCache(PackageAwareCache[SourceKey, ParsedSourceDefinition]): + def add_source(self, source: ParsedSourceDefinition): + key = (source.source_name, source.name) + if key not in self.storage: + self.storage[key] = {} + + self.storage[key][source.package_name] = source.unique_id + + def populate(self): + for source in self._manifest.sources.values(): + self.add_source(source) + + def perform_lookup( + self, unique_id: UniqueID + ) -> ParsedSourceDefinition: + if unique_id not in self._manifest.sources: + raise dbt.exceptions.InternalException( + f'Source {unique_id} found in cache but not found in manifest' + ) + return self._manifest.sources[unique_id] + + +class RefableCache(PackageAwareCache[RefName, NonSourceNode]): + # refables are actually unique, so the Dict[PackageName, UniqueID] will + # only ever have exactly one value, but doing 3 dict lookups instead of 1 + # is not a big deal at all and retains consistency + def __init__(self, manifest: 'Manifest'): + self._cached_types = set(NodeType.refable()) + super().__init__(manifest) + + def add_node(self, node: NonSourceNode): + if node.resource_type in self._cached_types: + if node.name not in self.storage: + self.storage[node.name] = {} + self.storage[node.name][node.package_name] = node.unique_id + + def populate(self): + for node in self._manifest.nodes.values(): + self.add_node(node) + + def perform_lookup( + self, unique_id: UniqueID + ) -> NonSourceNode: + if unique_id not in self._manifest.nodes: + raise dbt.exceptions.InternalException( + f'Node {unique_id} found in cache but not found in manifest' + ) + return self._manifest.nodes[unique_id] + + +def _search_packages( + current_project: str, + node_package: str, + target_package: Optional[str] = None, +) -> List[Optional[str]]: + if target_package is not None: + return [target_package] + elif current_project == node_package: + return [current_project, None] + else: + return [current_project, node_package, None] @dataclass @@ -393,6 +526,9 @@ class Disabled(Generic[D]): target: D +MaybeDocumentation = Optional[ParsedDocumentation] + + MaybeParsedSource = Optional[Union[ ParsedSourceDefinition, Disabled[ParsedSourceDefinition], @@ -437,6 +573,9 @@ class Manifest: files: MutableMapping[str, SourceFile] metadata: ManifestMetadata = field(default_factory=ManifestMetadata) flat_graph: Dict[str, Any] = field(default_factory=dict) + _docs_cache: Optional[DocCache] = None + _sources_cache: Optional[SourceCache] = None + _refs_cache: Optional[RefableCache] = None @classmethod def from_macros( @@ -500,39 +639,6 @@ def find_disabled_source_by_name( assert isinstance(result, ParsedSourceDefinition) return result - def find_docs_by_name( - self, name: str, package: Optional[str] = None - ) -> Optional[ParsedDocumentation]: - searcher: NameSearcher = NameSearcher( - name, package, [NodeType.Documentation] - ) - result = searcher.search(self.docs.values()) - return result - - def find_refable_by_name( - self, name: str, package: Optional[str] - ) -> Optional[NonSourceNode]: - """Find any valid target for "ref()" in the graph by its name and - package name, or None for any package. - """ - searcher: NameSearcher = NameSearcher( - name, package, NodeType.refable() - ) - result = searcher.search(self.nodes.values()) - return result - - def find_source_by_name( - self, source_name: str, table_name: str, package: Optional[str] - ) -> Optional[ParsedSourceDefinition]: - """Find any valid target for "source()" in the graph by its name and - package name, or None for any package. - """ - - name = f'{source_name}.{table_name}' - searcher: NameSearcher = NameSearcher(name, package, [NodeType.Source]) - result = searcher.search(self.sources.values()) - return result - def _find_macros_by_name( self, name: str, @@ -647,6 +753,10 @@ def add_nodes(self, new_nodes: Mapping[str, NonSourceNode]): if unique_id in self.nodes: raise_duplicate_resource_name(node, self.nodes[unique_id]) self.nodes[unique_id] = node + # fixup the cache if it exists. + if self._refs_cache is not None: + if node.resource_type in NodeType.refable(): + self._refs_cache.add_node(node) def patch_macros( self, patches: MutableMapping[MacroKey, ParsedMacroPatch] @@ -766,6 +876,30 @@ def expect(self, unique_id: str) -> CompileResultNode: 'Expected node {} not found in manifest'.format(unique_id) ) + @property + def docs_cache(self) -> DocCache: + if self._docs_cache is not None: + return self._docs_cache + cache = DocCache(self) + self._docs_cache = cache + return cache + + @property + def source_cache(self) -> SourceCache: + if self._sources_cache is not None: + return self._sources_cache + cache = SourceCache(self) + self._sources_cache = cache + return cache + + @property + def refs_cache(self) -> RefableCache: + if self._refs_cache is not None: + return self._refs_cache + cache = RefableCache(self) + self._refs_cache = cache + return cache + def resolve_ref( self, target_model_name: str, @@ -773,37 +907,27 @@ def resolve_ref( current_project: str, node_package: str, ) -> MaybeNonSource: - if target_model_package is not None: - return self.find_refable_by_name( - target_model_name, - target_model_package) - - target_model: Optional[NonSourceNode] = None - disabled_target: Optional[NonSourceNode] = None - - # first pass: look for models in the current_project - # second pass: look for models in the node's package - # final pass: look for models in any package - if current_project == node_package: - candidates = [current_project, None] - else: - candidates = [current_project, node_package, None] - for candidate in candidates: - target_model = self.find_refable_by_name( - target_model_name, - candidate) - if target_model is not None and target_model.config.enabled: - return target_model + node: Optional[NonSourceNode] = None + disabled: Optional[NonSourceNode] = None + + candidates = _search_packages( + current_project, node_package, target_model_package + ) + for pkg in candidates: + node = self.refs_cache.find_cached_value(target_model_name, pkg) + + if node is not None and node.config.enabled: + return node # it's possible that the node is disabled - if disabled_target is None: - disabled_target = self.find_disabled_by_name( - target_model_name, candidate + if disabled is None: + disabled = self.find_disabled_by_name( + target_model_name, pkg ) - if disabled_target is not None: - return Disabled(disabled_target) + if disabled is not None: + return Disabled(disabled) return None def resolve_source( @@ -813,27 +937,24 @@ def resolve_source( current_project: str, node_package: str ) -> MaybeParsedSource: - candidate_targets = [current_project, node_package, None] + key = (target_source_name, target_table_name) + candidates = _search_packages(current_project, node_package) - target_source: Optional[ParsedSourceDefinition] = None - disabled_target: Optional[ParsedSourceDefinition] = None + source: Optional[ParsedSourceDefinition] = None + disabled: Optional[ParsedSourceDefinition] = None - for candidate in candidate_targets: - target_source = self.find_source_by_name( - target_source_name, - target_table_name, - candidate - ) - if target_source is not None and target_source.config.enabled: - return target_source + for pkg in candidates: + source = self.source_cache.find_cached_value(key, pkg) + if source is not None and source.config.enabled: + return source - if disabled_target is None: - disabled_target = self.find_disabled_source_by_name( - target_source_name, target_table_name, candidate + if disabled is None: + disabled = self.find_disabled_source_by_name( + target_source_name, target_table_name, pkg ) - if disabled_target is not None: - return Disabled(disabled_target) + if disabled is not None: + return Disabled(disabled) return None def resolve_doc( @@ -847,22 +968,15 @@ def resolve_doc( resolve_ref except the is_enabled checks are unnecessary as docs are always enabled. """ - if package is not None: - return self.find_docs_by_name( - name, package - ) + candidates = _search_packages( + current_project, node_package, package + ) - candidate_targets = [ - current_project, - node_package, - None, - ] - target_doc = None - for candidate in candidate_targets: - target_doc = self.find_docs_by_name(name, candidate) - if target_doc is not None: - break - return target_doc + for pkg in candidates: + result = self.docs_cache.find_cached_value(name, pkg) + if result is not None: + return result + return None @dataclass diff --git a/core/dbt/parser/docs.py b/core/dbt/parser/docs.py index b09bf38d837..43b84009526 100644 --- a/core/dbt/parser/docs.py +++ b/core/dbt/parser/docs.py @@ -1,17 +1,17 @@ from typing import Iterable -import jinja2.runtime +import re -from dbt.clients.jinja import get_template -from dbt.contracts.graph.unparsed import UnparsedDocumentationFile +from dbt.clients.jinja import get_rendered from dbt.contracts.graph.parsed import ParsedDocumentation -from dbt.exceptions import CompilationException, InternalException from dbt.node_types import NodeType from dbt.parser.base import Parser from dbt.parser.search import ( - FullBlock, FileBlock, FilesystemSearcher, BlockSearcher + BlockContents, FileBlock, FilesystemSearcher, BlockSearcher ) -from dbt.utils import deep_merge, DOCS_PREFIX + + +SHOULD_PARSE_RE = re.compile(r'{[{%]') class DocumentationParser(Parser[ParsedDocumentation]): @@ -35,58 +35,28 @@ def generate_unique_id(self, resource_name: str) -> str: # need to be part of the unique ID. return '{}.{}'.format(self.project.project_name, resource_name) - # TODO: could this class just render() the tag.contents() and skip this - # whole extra module.__dict__.items() thing? - def _parse_template_docs(self, template, docfile): - for key, item in template.module.__dict__.items(): - if type(item) != jinja2.runtime.Macro: - continue - - if not key.startswith(DOCS_PREFIX): - continue - - name = key.replace(DOCS_PREFIX, '') - - unique_id = self.generate_unique_id(name) - - merged = deep_merge( - docfile.to_dict(), - { - 'name': name, - 'unique_id': unique_id, - 'block_contents': item().strip(), - } - ) - merged.pop('file_contents', None) - yield ParsedDocumentation.from_dict(merged) + def parse_block( + self, block: BlockContents + ) -> Iterable[ParsedDocumentation]: + unique_id = self.generate_unique_id(block.name) + contents = get_rendered(block.contents, {}).strip() - def parse_block(self, block: FullBlock) -> Iterable[ParsedDocumentation]: - base_node = UnparsedDocumentationFile( + doc = ParsedDocumentation( root_path=self.project.project_root, path=block.file.path.relative_path, original_file_path=block.path.original_file_path, package_name=self.project.project_name, - # set contents to the actual internal contents of the block - file_contents=block.contents, + unique_id=unique_id, + name=block.name, + block_contents=contents, ) - try: - template = get_template(block.contents, {}) - except CompilationException as e: - e.add_node(base_node) - raise - all_docs = list(self._parse_template_docs(template, base_node)) - if len(all_docs) != 1: - raise InternalException( - 'Got {} docs in an extracted docs block: block parser ' - 'mismatched with jinja'.format(len(all_docs)) - ) - return all_docs + return [doc] def parse_file(self, file_block: FileBlock): - searcher: Iterable[FullBlock] = BlockSearcher( + searcher: Iterable[BlockContents] = BlockSearcher( source=[file_block], allowed_blocks={'docs'}, - source_tag_factory=FullBlock, + source_tag_factory=BlockContents, ) for block in searcher: for parsed in self.parse_block(block): diff --git a/core/dbt/task/run_operation.py b/core/dbt/task/run_operation.py index 6b318fa005c..107fd60dd90 100644 --- a/core/dbt/task/run_operation.py +++ b/core/dbt/task/run_operation.py @@ -28,10 +28,8 @@ def _get_kwargs(self) -> Dict[str, Any]: return dbt.utils.parse_cli_vars(self.args.args) def compile_manifest(self) -> None: - # skip building a linker, but do make sure to build the flat graph if self.manifest is None: raise InternalException('manifest was None in compile_manifest') - self.manifest.build_flat_graph() def _run_unsafe(self) -> agate.Table: adapter = get_adapter(self.config) diff --git a/core/dbt/task/runnable.py b/core/dbt/task/runnable.py index 525e1216c83..d75040f644c 100644 --- a/core/dbt/task/runnable.py +++ b/core/dbt/task/runnable.py @@ -72,7 +72,6 @@ def compile_manifest(self): 'compile_manifest called before manifest was loaded' ) self.linker = compile_manifest(self.config, self.manifest) - self.manifest.build_flat_graph() def _runtime_initialize(self): self.load_manifest() diff --git a/core/dbt/tracking.py b/core/dbt/tracking.py index 8d7dbc2f24e..133a6c13838 100644 --- a/core/dbt/tracking.py +++ b/core/dbt/tracking.py @@ -1,5 +1,6 @@ from typing import Optional +from dbt.clients import yaml_helper from dbt.logger import GLOBAL_LOGGER as logger from dbt import version as dbt_version from snowplow_tracker import Subject, Tracker, Emitter, logger as sp_logger @@ -144,7 +145,7 @@ def get_cookie(self): else: with open(self.cookie_path, "r") as fh: try: - user = yaml.safe_load(fh) + user = yaml_helper.safe_load(fh) if user is None: user = self.set_cookie() except yaml.reader.ReaderError: diff --git a/test/integration/060_persist_docs_tests/test_persist_docs.py b/test/integration/060_persist_docs_tests/test_persist_docs.py index 94bfe5715f3..c5a6cc1689e 100644 --- a/test/integration/060_persist_docs_tests/test_persist_docs.py +++ b/test/integration/060_persist_docs_tests/test_persist_docs.py @@ -1,4 +1,5 @@ from test.integration.base import DBTIntegrationTest, use_profile +import os import json @@ -19,7 +20,10 @@ def _assert_common_comments(self, *comments): assert '\n' in comment assert 'Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting' in comment assert '/* comment */' in comment - assert '--\n' in comment + if os.name == 'nt': + assert '--\r\n' in comment or '--\n' in comment + else: + assert '--\n' in comment def _assert_has_table_comments(self, table_node): table_comment = table_node['metadata']['comment'] diff --git a/test/unit/test_manifest.py b/test/unit/test_manifest.py index d6f77eaf083..c846c218077 100644 --- a/test/unit/test_manifest.py +++ b/test/unit/test_manifest.py @@ -700,7 +700,7 @@ def MockNode(package, name, resource_type=NodeType.Model, kwargs={}): __class__=cls, resource_type=resource_type, package_name=package, - unique_id=f'macro.{package}.{name}', + unique_id=f'{str(resource_type)}.{package}.{name}', search_name=name, **kwargs ) @@ -1096,9 +1096,14 @@ def id_nodes(arg): _refable_parameter_sets(), ids=id_nodes, ) -def test_find_refable_by_name(nodes, sources, package, expected): +def test_resolve_ref(nodes, sources, package, expected): manifest = make_manifest(nodes=nodes, sources=sources) - result = manifest.find_refable_by_name(name='my_model', package=package) + result = manifest.resolve_ref( + target_model_name='my_model', + target_model_package=package, + current_project='root', + node_package='root', + ) if expected is None: assert result is expected else: @@ -1112,7 +1117,7 @@ def test_find_refable_by_name(nodes, sources, package, expected): def _source_parameter_sets(): sets = [ # empties - FindNodeSpec(nodes=[], sources=[], package=None, expected=None), + FindNodeSpec(nodes=[], sources=[], package='dep', expected=None), FindNodeSpec(nodes=[], sources=[], package='root', expected=None), ] sets.extend( @@ -1123,7 +1128,7 @@ def _source_parameter_sets(): package=project, expected=None, ) - for project in ('root', None) for name in ('my_source', 'my_table') + for project in ('root', 'dep') for name in ('my_source', 'my_table') ) # exists in root alongside nodes with name parts sets.extend( @@ -1133,7 +1138,7 @@ def _source_parameter_sets(): package=project, expected=('root', 'my_source', 'my_table'), ) - for project in ('root', None) + for project in ('root', 'dep') ) sets.extend( # wrong source name @@ -1143,7 +1148,7 @@ def _source_parameter_sets(): package=project, expected=None, ) - for project in ('root', None) + for project in ('root', 'dep') ) sets.extend( # wrong table name @@ -1153,15 +1158,15 @@ def _source_parameter_sets(): package=project, expected=None, ) - for project in ('root', None) + for project in ('root', 'dep') ) sets.append( - # wrong project name (should not be found in 'root') + # should be found by the package=None search FindNodeSpec( nodes=[], sources=[MockSource('other', 'my_source', 'my_table')], package='root', - expected=None, + expected=('other', 'my_source', 'my_table'), ) ) sets.extend( @@ -1172,7 +1177,7 @@ def _source_parameter_sets(): package=project, expected=('root', 'my_source', 'my_table'), ) - for project in ('root', None) + for project in ('root', 'dep') ) return sets @@ -1183,9 +1188,14 @@ def _source_parameter_sets(): _source_parameter_sets(), ids=id_nodes, ) -def test_find_source_by_name(nodes, sources, package, expected): +def test_resolve_source(nodes, sources, package, expected): manifest = make_manifest(nodes=nodes, sources=sources) - result = manifest.find_source_by_name(source_name='my_source', table_name='my_table', package=package) + result = manifest.resolve_source( + target_source_name='my_source', + target_table_name='my_table', + current_project=package, + node_package='dep', + ) if expected is None: assert result is expected else: @@ -1225,9 +1235,9 @@ def _docs_parameter_sets(): _docs_parameter_sets(), ids=id_nodes, ) -def test_find_doc_by_name(docs, package, expected): +def test_resolve_doc(docs, package, expected): manifest = make_manifest(docs=docs) - result = manifest.find_docs_by_name(name='my_doc', package=package) + result = manifest.resolve_doc(name='my_doc', package=package, current_project='root', node_package='root') if expected is None: assert result is expected else: diff --git a/test/unit/test_parser.py b/test/unit/test_parser.py index 183d1a37e1b..6824bd3e3a6 100644 --- a/test/unit/test_parser.py +++ b/test/unit/test_parser.py @@ -28,15 +28,60 @@ from dbt.contracts.graph.parsed import ( ParsedModelNode, ParsedMacro, ParsedNodePatch, ParsedSourceDefinition, DependsOn, ColumnInfo, ParsedDataTestNode, ParsedSnapshotNode, - ParsedAnalysisNode, ParsedDocumentation, UnpatchedSourceDefinition -) -from dbt.contracts.graph.unparsed import ( - FreshnessThreshold, ExternalTable, Docs + ParsedAnalysisNode, ParsedDocumentation, UnpatchedSourceDefinition, + ParsedSeedNode ) +from dbt.contracts.graph.unparsed import Docs from .utils import config_from_parts_or_dicts, normalize, generate_name_macros +def MockSource(package, source_name, name, **kwargs): + src = mock.MagicMock( + __class__=ParsedSourceDefinition, + resource_type=NodeType.Source, + source_name=source_name, + package_name=package, + unique_id=f'source.{package}.{source_name}.{name}', + search_name=f'{source_name}.{name}', + **kwargs + ) + src.name = name + return src + + +def MockNode(package, name, resource_type=NodeType.Model, **kwargs): + if resource_type == NodeType.Model: + cls = ParsedModelNode + elif resource_type == NodeType.Seed: + cls = ParsedSeedNode + else: + raise ValueError(f'I do not know how to handle {resource_type}') + node = mock.MagicMock( + __class__=cls, + resource_type=resource_type, + package_name=package, + unique_id=f'{str(resource_type)}.{package}.{name}', + search_name=name, + **kwargs + ) + node.name = name + return node + + +def MockDocumentation(package, name, **kwargs): + doc = mock.MagicMock( + __class__=ParsedDocumentation, + resource_type=NodeType.Documentation, + package_name=package, + search_name=name, + unique_id=f'{package}.{name}', + **kwargs + ) + doc.name = name + return doc + + def get_abs_os_path(unix_path): return normalize(os.path.abspath(unix_path)) @@ -791,56 +836,44 @@ def setUp(self): super().setUp() x_depends_on = mock.MagicMock() y_depends_on = mock.MagicMock() - x_uid = 'model.project.x' - y_uid = 'model.otherproject.y' - src_uid = 'source.thirdproject.src.tbl' - self.x_node = mock.MagicMock( - __class__=ParsedModelNode, - package_name='project', - search_name='x', + self.x_node = MockNode( + package='project', + name='x', config=mock.MagicMock(enabled=True), refs=[], sources=[['src', 'tbl']], - unique_id=x_uid, - resource_type=NodeType.Model, depends_on=x_depends_on, description='other_project: {{ doc("otherproject", "my_doc") }}', ) - self.y_node = mock.MagicMock( - __class__=ParsedModelNode, - package_name='otherproject', - search_name='y', + self.y_node = MockNode( + package='otherproject', + name='y', config=mock.MagicMock(enabled=True), refs=[['x']], sources=[], - unique_id=y_uid, - resource_type=NodeType.Model, depends_on=y_depends_on, description='{{ doc("my_doc") }}', ) - self.src_node = mock.MagicMock( - __class__=ParsedSourceDefinition, - package_name='project', - search_name='src.tbl', + self.src_node = MockSource( + package='thirdproject', + source_name='src', + name='tbl', config=mock.MagicMock(enabled=True), - resource_type=NodeType.Source, - unique_id=src_uid, + ) + self.doc = MockDocumentation( + package='otherproject', + name='my_doc', + block_contents='some docs', ) nodes = { - x_uid: self.x_node, - y_uid: self.y_node, + self.x_node.unique_id: self.x_node, + self.y_node.unique_id: self.y_node, } sources = { - src_uid: self.src_node, + self.src_node.unique_id: self.src_node, } docs = { - 'otherproject.my_doc': mock.MagicMock( - __class__=ParsedDocumentation, - resource_type=NodeType.Documentation, - search_name='my_doc', - package_name='otherproject', - block_contents='some docs', - ) + self.doc.unique_id: self.doc, } self.manifest = Manifest( nodes=nodes, sources=sources, macros={}, docs=docs, disabled=[], files={}, generated_at=mock.MagicMock()