scrape_role_files: add support for nested role directories

Changing the way how roles are scraped. Previously directories present in default roles directory were treated as roles (dir name is role name). Now scraper enters each directory looking for one of 8 specified subdirs. When at least one such subdir is found, its parent is treated as role, else subdir is added to search list (this is equivalent of traversing repository) Adjusting tests to utilize nested role directories
bmwcarit · Feb 8, 2024 · 1dc0fed · 1dc0fed
1 parent bb31afe
commit 1dc0fed
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 35 deletions.
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -118,6 +118,14 @@ def repo_data():
             },
             "changelog_file": None,
         },
+        "foobar/baz": {
+            "last_changed": "2018-09-17 15:15:15",
+            "readme_file": {
+                "path": "roles/foobar/baz/README.rst",
+                "content": raw_file("repo_files/roles/foobar/baz/README.rst"),
+            },
+            "changelog_file": None,
+        },
     }
 
     return repo, tenants, job_files, role_files

diff --git a/tests/scraper/test_integration.py b/tests/scraper/test_integration.py
@@ -73,6 +73,7 @@ class MockContents:
     DIR = "dir"
 
     def __init__(self, path, type):
+        self.name = path.split("/")[-1]
         self.path = path
         self.type = type
 
@@ -101,9 +102,10 @@ class MockGitHubRepository(GitHubRepository):
             "zuul.d/jobs.yaml": MOCKED_JOB_CONTENT,
             "roles": {"docker-run": MockContents("roles/docker-run", MockContents.DIR)},
             "roles/docker-run": {
+                "tasks": MockContents("roles/docker-run/tasks", MockContents.DIR),
                 "README.rst": MockContents(
                     "roles/docker-run/README.rst", MockContents.FILE
-                )
+                ),
             },
             "roles/docker-run/README.rst": MOCKED_ROLE_DESCRIPTION,
             "roles/ignored": MockContents("not a valid role", MockContents.FILE),
@@ -117,25 +119,40 @@ class MockGitHubRepository(GitHubRepository):
                 "bar": MockContents("roles/bar", MockContents.DIR),
                 "foobar": MockContents("roles/foobar", MockContents.DIR),
                 "empty-dir": MockContents("roles/empty-dir", MockContents.DIR),
+                "foobaz": MockContents("roles/foobaz", MockContents.DIR),
             },
             "roles/foo": {
-                "README.md": MockContents("roles/foo/README.md", MockContents.FILE)
+                "defaults": MockContents("roles/foo/defaults", MockContents.DIR),
+                "README.md": MockContents("roles/foo/README.md", MockContents.FILE),
             },
             "roles/bar": {
-                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE)
+                "library": MockContents("roles/bar/library", MockContents.DIR),
+                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE),
             },
             "roles/foobar": {
-                "README": MockContents("roles/foobar/README", MockContents.FILE)
+                "handlers": MockContents("roles/foobar/handlers", MockContents.DIR),
+                "README": MockContents("roles/foobar/README", MockContents.FILE),
             },
             "roles/empty-dir": {
+                "meta": MockContents("roles/empty-dir/meta", MockContents.DIR),
                 "README.whatever": MockContents(
                     "roles/empty-dir/README.whatever", MockContents.FILE
-                )
+                ),
+            },
+            "roles/foobaz": {
+                "baz": MockContents("roles/foobaz/baz", MockContents.DIR),
+            },
+            "roles/foobaz/baz": {
+                "vars": MockContents("roles/foobaz/baz/vars", MockContents.DIR),
+                "README.rst": MockContents(
+                    "roles/foobaz/baz/README.rst", MockContents.FILE
+                ),
             },
             "roles/foo/README.md": "# Just some Markdown",
             "roles/bar/README.txt": "Just some simple text\nwith a line break",
             "roles/foobar/README": "Simple text in a file without extension",
-            "roles/empty-dir/REAMDE.whatever": "This file won't be checked out",
+            "roles/empty-dir/README.whatever": "This file won't be checked out",
+            "roles/foobaz/baz/README.rst": "Simple readme file from nested role",
         },
         "orga1/repo3": {
             REPO_ROOT: {
@@ -215,19 +232,24 @@ def test_scrape():
         "orga1/repo2": (
             {},
             {
-                "foo": {
+                "bar": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/foo/README.md",
-                        "content": "# Just some Markdown",
+                        "path": "roles/bar/README.txt",
+                        "content": "Just some simple text\nwith a line break",
                     },
                     "changelog_file": None,
                 },
-                "bar": {
+                "empty-dir": {
+                    "last_changed": "2018-09-17 15:15:15",
+                    "readme_file": None,
+                    "changelog_file": None,
+                },
+                "foo": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/bar/README.txt",
-                        "content": "Just some simple text\nwith a line break",
+                        "path": "roles/foo/README.md",
+                        "content": "# Just some Markdown",
                     },
                     "changelog_file": None,
                 },
@@ -239,9 +261,12 @@ def test_scrape():
                     },
                     "changelog_file": None,
                 },
-                "empty-dir": {
+                "foobaz/baz": {
                     "last_changed": "2018-09-17 15:15:15",
-                    "readme_file": None,
+                    "readme_file": {
+                        "path": "roles/foobaz/baz/README.rst",
+                        "content": "Simple readme file from nested role",
+                    },
                     "changelog_file": None,
                 },
             },

diff --git a/tests/scraper/test_repo_parser.py b/tests/scraper/test_repo_parser.py
@@ -47,6 +47,7 @@ def test_parse(repo_data):
     job_5 = jobs[4]
     role_1 = [r for r in roles if r["role_name"] == "foo"][0]
     role_2 = [r for r in roles if r["role_name"] == "bar"][0]
+    role_3 = [r for r in roles if r["role_name"] == "foobar/baz"][0]
 
     expected_job_1 = {
         "job_name": "my-cool-new-job",
@@ -215,6 +216,39 @@ def test_parse(repo_data):
         "last_updated": "2018-09-17 15:15:15",
     }
 
+    expected_role_3 = {
+        "role_name": "foobar/baz",
+        "repo": "my/project",
+        "tenants": ["foo", "bar"],
+        "description": (
+            "Yet another simple description\n\n"
+            "**Role variables**\n\n"
+            ".. zuul:rolevar:: my_mandatory_variable\n\n"
+            "   This variable is mandatory.\n"
+        ),
+        "description_html": (
+            "<p>Yet another simple description</p>\n"
+            "<p><strong>Role variables</strong></p>\n"
+            '<dl class="zuul rolevar">\n'
+            '<dt class="sig sig-object zuul" id="rolevar-my_mandatory_variable">\n'
+            '<span class="sig-name descname">'
+            '<span class="pre">my_mandatory_variable</span>'
+            "</span>"
+            '<a class="headerlink" href="#rolevar-my_mandatory_variable" '
+            'title="Link to this definition">'
+            "¶</a><br /></dt>\n"
+            "<dd><p>This variable is mandatory.</p>\n"
+            "</dd></dl>\n"
+            "\n"
+        ),
+        "url": "https://github/my/project/tree/master/roles/foobar/baz",
+        "private": False,
+        "platforms": [],
+        "reusable": False,
+        "scrape_time": scrape_time,
+        "last_updated": "2018-09-17 15:15:15",
+    }
+
     # NOTE (fschmidt): Without the skip_empty flag, empty (= None) keys will
     # be stripped from the resulting dict.
     assert job_1.to_dict(skip_empty=False) == expected_job_1
@@ -224,6 +258,7 @@ def test_parse(repo_data):
     assert job_5.to_dict(skip_empty=False) == expected_job_5
     assert role_1.to_dict(skip_empty=False) == expected_role_1
     assert role_2.to_dict(skip_empty=False) == expected_role_2
+    assert role_3.to_dict(skip_empty=False) == expected_role_3
 
 
 def test_parse_reusable_repo(repo_data):

diff --git a/tests/testdata/repo_files/roles/foobar/baz/README.rst b/tests/testdata/repo_files/roles/foobar/baz/README.rst
@@ -0,0 +1,7 @@
+Yet another simple description
+
+**Role variables**
+
+.. zuul:rolevar:: my_mandatory_variable
+
+   This variable is mandatory.
diff --git a/zubbi/scraper/scraper.py b/zubbi/scraper/scraper.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+from pathlib import Path
 
 from zubbi.scraper.exceptions import CheckoutError
 
@@ -29,6 +30,19 @@
 
 ROLES_DIRECTORY = "roles"
 
+# Role needs to contains at least one of those directories to be considered as role
+# https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_reuse_roles.html#role-directory-structure
+ROLE_MANDATORY_DIRS = [
+    "tasks",
+    "handlers",
+    "library",
+    "defaults",
+    "vars",
+    "files",
+    "templates",
+    "meta",
+]
+
 REPO_ROOT = "/"
 
 
@@ -146,29 +160,44 @@ def get_file_info(self, path):
 
     def scrape_role_files(self):
         role_files = {}
-        # We are only interested in the role name (=> directory name)
-        # and the README and CHANGELOG files. Thus, we don't need to
-        # iterate recursively over the roles directory as those files
-        # should be on the top-level per role.
+        # Roles might be grouped in some parent directory, so we need to
+        # recursively check nested directories.
+
+        # Listing repository files is based on the example in pyGithub doc:
+        # https://pygithub.readthedocs.io/en/v2.1.1/examples/Repository.html#get-all-of-the-contents-of-the-repository-recursively
         try:
-            roles = self.repo.directory_contents(ROLES_DIRECTORY)
-            for role_name, remote_file in roles.items():
+            # Ansible requires roles to be defined in a certain directory structure.
+            # Thus, we are only interested in directories
+            dirs_to_search = [
+                item
+                for item in self.repo.directory_contents(ROLES_DIRECTORY).values()
+                if item.type == "dir"
+            ]
+            while dirs_to_search:
                 try:
-                    if remote_file.type != "dir":
-                        # Ansible requires the role to be defined in a
-                        # certain directory structure. Thus, we can
-                        # ignore it in case it's not a directory.
-                        continue
-                    last_changed = self.repo.last_changed(remote_file.path)
-                    existing_files = self.repo.directory_contents(remote_file.path)
-                    # Skip empty directories or files
-                    if not existing_files:
+                    dir = dirs_to_search.pop(0)
+                    dir_items = self.repo.directory_contents(dir.path)
+
+                    subdirs = [
+                        item for item in dir_items.values() if item.type == "dir"
+                    ]
+                    # When directory does not contain one of role mandatory directories
+                    # it is not a role and its subdirectories (if any) should be further
+                    # scanned.
+                    # This is done by appending those subdirectories to 'dirs_to_search'
+                    # search list (this implements the recursive search)
+                    if set([d.name for d in subdirs]).isdisjoint(ROLE_MANDATORY_DIRS):
+                        dirs_to_search.extend(subdirs)
                         continue
-                    readme_file = self.find_matching_file(README_FILES, existing_files)
-                    changelog_file = self.find_matching_file(
-                        CHANGELOG_FILES, existing_files
-                    )
-                    role_files[role_name] = {
+
+                    # Once the role is found, we are only interested in the timestamp of
+                    # the latest update (the last git change), README and CHANGELOG files
+                    # Those files should be on the top-level per role.
+                    last_changed = self.repo.last_changed(dir.path)
+                    readme_file = self.find_matching_file(README_FILES, dir_items)
+                    changelog_file = self.find_matching_file(CHANGELOG_FILES, dir_items)
+                    # role name is the directory path relative to ROLES_DIRECTORY
+                    role_files[str(Path(dir.path).relative_to(ROLES_DIRECTORY))] = {
                         "last_changed": last_changed,
                         "readme_file": readme_file,
                         "changelog_file": changelog_file,
@@ -178,7 +207,8 @@ def scrape_role_files(self):
         except CheckoutError as e:
             LOGGER.debug(e)
 
-        return role_files
+        # sort keys (role names) alphabetically
+        return {key: value for key, value in sorted(role_files.items())}
 
     def find_matching_file(self, file_filter, existing_files):
         for filename, file_content in existing_files.items():