Skip to content

Commit

Permalink
Fix regression in ignoring symlinks (#23535)
Browse files Browse the repository at this point in the history
  • Loading branch information
ianbuss authored May 20, 2022
1 parent 7ab5ea7 commit 8494fc7
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 12 deletions.
32 changes: 21 additions & 11 deletions airflow/utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@ class _IgnoreRule(Protocol):

@staticmethod
def compile(pattern: str, base_dir: Path, definition_file: Path) -> Optional['_IgnoreRule']:
pass
"""
Build an ignore rule from the supplied pattern where base_dir
and definition_file should be absolute paths.
"""

@staticmethod
def match(path: Path, rules: List['_IgnoreRule']) -> bool:
pass
"""Match a candidate absolute path against a list of rules"""


class _RegexpIgnoreRule(NamedTuple):
Expand All @@ -57,19 +60,18 @@ class _RegexpIgnoreRule(NamedTuple):
def compile(pattern: str, base_dir: Path, definition_file: Path) -> Optional[_IgnoreRule]:
"""Build an ignore rule from the supplied regexp pattern and log a useful warning if it is invalid"""
try:
return _RegexpIgnoreRule(re.compile(pattern), base_dir.resolve())
return _RegexpIgnoreRule(re.compile(pattern), base_dir)
except re.error as e:
log.warning("Ignoring invalid regex '%s' from %s: %s", pattern, definition_file, e)
return None

@staticmethod
def match(path: Path, rules: List[_IgnoreRule]) -> bool:
"""Match a list of ignore rules against the supplied path"""
test_path: Path = path.resolve()
for rule in rules:
if not isinstance(rule, _RegexpIgnoreRule):
raise ValueError(f"_RegexpIgnoreRule cannot match rules of type: {type(rule)}")
if rule.pattern.search(str(test_path.relative_to(rule.base_dir))) is not None:
if rule.pattern.search(str(path.relative_to(rule.base_dir))) is not None:
return True
return False

Expand All @@ -95,21 +97,20 @@ def compile(pattern: str, _, definition_file: Path) -> Optional[_IgnoreRule]:
# > If there is a separator at the beginning or middle (or both) of the pattern, then the
# > pattern is relative to the directory level of the particular .gitignore file itself.
# > Otherwise the pattern may also match at any level below the .gitignore level.
relative_to = definition_file.resolve().parent
relative_to = definition_file.parent
ignore_pattern = GitWildMatchPattern(pattern)
return _GlobIgnoreRule(ignore_pattern.regex, pattern, ignore_pattern.include, relative_to)

@staticmethod
def match(path: Path, rules: List[_IgnoreRule]) -> bool:
"""Match a list of ignore rules against the supplied path"""
test_path: Path = path.resolve()
matched = False
for r in rules:
if not isinstance(r, _GlobIgnoreRule):
raise ValueError(f"_GlobIgnoreRule cannot match rules of type: {type(r)}")
rule: _GlobIgnoreRule = r # explicit typing to make mypy play nicely
rel_path = str(test_path.relative_to(rule.relative_to) if rule.relative_to else test_path.name)
if rule.raw_pattern.endswith("/") and test_path.is_dir():
rel_path = str(path.relative_to(rule.relative_to) if rule.relative_to else path.name)
if rule.raw_pattern.endswith("/") and path.is_dir():
# ensure the test path will potentially match a directory pattern if it is a directory
rel_path += "/"
if rule.include is not None and rule.pattern.match(rel_path) is not None:
Expand Down Expand Up @@ -208,10 +209,11 @@ def _find_path_from_directory(
:return: a generator of file paths which should not be ignored.
"""
# A Dict of patterns, keyed using resolved, absolute paths
patterns_by_dir: Dict[Path, List[_IgnoreRule]] = {}

for root, dirs, files in os.walk(base_dir_path, followlinks=True):
patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root), [])
patterns: List[_IgnoreRule] = patterns_by_dir.get(Path(root).resolve(), [])

ignore_file_path = Path(root) / ignore_file_name
if ignore_file_path.is_file():
Expand All @@ -233,7 +235,15 @@ def _find_path_from_directory(

dirs[:] = [subdir for subdir in dirs if not ignore_rule_type.match(Path(root) / subdir, patterns)]

patterns_by_dir.update({Path(root) / sd: patterns.copy() for sd in dirs})
# explicit loop for infinite recursion detection since we are following symlinks in this walk
for sd in dirs:
dirpath = (Path(root) / sd).resolve()
if dirpath in patterns_by_dir:
raise RuntimeError(
"Detected recursive loop when walking DAG directory "
+ f"{base_dir_path}: {dirpath} has appeared more than once."
)
patterns_by_dir.update({dirpath: patterns.copy()})

for file in files:
if file == ignore_file_name:
Expand Down
56 changes: 55 additions & 1 deletion tests/utils/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@
# specific language governing permissions and limitations
# under the License.

import os
import os.path
import unittest
from pathlib import Path
from unittest import mock

import pytest

from airflow.utils.file import correct_maybe_zipped, find_path_from_directory, open_maybe_zipped
from tests.models import TEST_DAGS_FOLDER

Expand Down Expand Up @@ -77,7 +81,24 @@ def test_open_maybe_zipped_archive(self):
assert isinstance(content, str)


class TestListPyFilesPath(unittest.TestCase):
class TestListPyFilesPath:
@pytest.fixture()
def test_dir(self, tmp_path):
# create test tree with symlinks
source = os.path.join(tmp_path, "folder")
target = os.path.join(tmp_path, "symlink")
py_file = os.path.join(source, "hello_world.py")
ignore_file = os.path.join(tmp_path, ".airflowignore")
os.mkdir(source)
os.symlink(source, target)
# write ignore files
with open(ignore_file, 'w') as f:
f.write("folder")
# write sample pyfile
with open(py_file, 'w') as f:
f.write("print('hello world')")
return tmp_path

def test_find_path_from_directory_regex_ignore(self):
should_ignore = [
"test_invalid_cron.py",
Expand Down Expand Up @@ -110,3 +131,36 @@ def test_find_path_from_directory_glob_ignore(self):
assert len(list(filter(lambda file: os.path.basename(file) in should_not_ignore, files))) == len(
should_not_ignore
)

def test_find_path_from_directory_respects_symlinks_regexp_ignore(self, test_dir):
ignore_list_file = ".airflowignore"
found = list(find_path_from_directory(test_dir, ignore_list_file))

assert os.path.join(test_dir, "symlink", "hello_world.py") in found
assert os.path.join(test_dir, "folder", "hello_world.py") not in found

def test_find_path_from_directory_respects_symlinks_glob_ignore(self, test_dir):
ignore_list_file = ".airflowignore"
found = list(find_path_from_directory(test_dir, ignore_list_file, ignore_file_syntax="glob"))

assert os.path.join(test_dir, "symlink", "hello_world.py") in found
assert os.path.join(test_dir, "folder", "hello_world.py") not in found

def test_find_path_from_directory_fails_on_recursive_link(self, test_dir):
# add a recursive link
recursing_src = os.path.join(test_dir, "folder2", "recursor")
recursing_tgt = os.path.join(test_dir, "folder2")
os.mkdir(recursing_tgt)
os.symlink(recursing_tgt, recursing_src)

ignore_list_file = ".airflowignore"

try:
list(find_path_from_directory(test_dir, ignore_list_file, ignore_file_syntax="glob"))
assert False, "Walking a self-recursive tree should fail"
except RuntimeError as err:
assert (
str(err)
== f"Detected recursive loop when walking DAG directory {test_dir}: "
+ f"{Path(recursing_tgt).resolve()} has appeared more than once."
)

0 comments on commit 8494fc7

Please sign in to comment.