From 1dc0fed620621a2e8a437603f42e03fb18fa3479 Mon Sep 17 00:00:00 2001
From: Krzysztof Swietlicki <Krzysztof.Swietlicki@partner.bmw.de>
Date: Wed, 10 Jan 2024 19:02:40 +0100
Subject: [PATCH] scrape_role_files: add support for nested role directories

Changing the way how roles are scraped. Previously directories present in
default roles directory were treated as roles (dir name is role name).

Now scraper enters each directory looking for one of 8 specified subdirs.
When at least one such subdir is found, its parent is treated as role, else
subdir is added to search list (this is equivalent of traversing repository)

Adjusting tests to utilize nested role directories
---
 tests/conftest.py                             |  8 +++
 tests/scraper/test_integration.py             | 53 ++++++++++----
 tests/scraper/test_repo_parser.py             | 35 +++++++++
 .../repo_files/roles/foobar/baz/README.rst    |  7 ++
 zubbi/scraper/scraper.py                      | 72 +++++++++++++------
 5 files changed, 140 insertions(+), 35 deletions(-)
 create mode 100644 tests/testdata/repo_files/roles/foobar/baz/README.rst

diff --git a/tests/conftest.py b/tests/conftest.py
index b82aa5a..eb7d50f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -118,6 +118,14 @@ def repo_data():
             },
             "changelog_file": None,
         },
+        "foobar/baz": {
+            "last_changed": "2018-09-17 15:15:15",
+            "readme_file": {
+                "path": "roles/foobar/baz/README.rst",
+                "content": raw_file("repo_files/roles/foobar/baz/README.rst"),
+            },
+            "changelog_file": None,
+        },
     }
 
     return repo, tenants, job_files, role_files
diff --git a/tests/scraper/test_integration.py b/tests/scraper/test_integration.py
index dc583ef..66f2038 100644
--- a/tests/scraper/test_integration.py
+++ b/tests/scraper/test_integration.py
@@ -73,6 +73,7 @@ class MockContents:
     DIR = "dir"
 
     def __init__(self, path, type):
+        self.name = path.split("/")[-1]
         self.path = path
         self.type = type
 
@@ -101,9 +102,10 @@ class MockGitHubRepository(GitHubRepository):
             "zuul.d/jobs.yaml": MOCKED_JOB_CONTENT,
             "roles": {"docker-run": MockContents("roles/docker-run", MockContents.DIR)},
             "roles/docker-run": {
+                "tasks": MockContents("roles/docker-run/tasks", MockContents.DIR),
                 "README.rst": MockContents(
                     "roles/docker-run/README.rst", MockContents.FILE
-                )
+                ),
             },
             "roles/docker-run/README.rst": MOCKED_ROLE_DESCRIPTION,
             "roles/ignored": MockContents("not a valid role", MockContents.FILE),
@@ -117,25 +119,40 @@ class MockGitHubRepository(GitHubRepository):
                 "bar": MockContents("roles/bar", MockContents.DIR),
                 "foobar": MockContents("roles/foobar", MockContents.DIR),
                 "empty-dir": MockContents("roles/empty-dir", MockContents.DIR),
+                "foobaz": MockContents("roles/foobaz", MockContents.DIR),
             },
             "roles/foo": {
-                "README.md": MockContents("roles/foo/README.md", MockContents.FILE)
+                "defaults": MockContents("roles/foo/defaults", MockContents.DIR),
+                "README.md": MockContents("roles/foo/README.md", MockContents.FILE),
             },
             "roles/bar": {
-                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE)
+                "library": MockContents("roles/bar/library", MockContents.DIR),
+                "README.txt": MockContents("roles/bar/README.txt", MockContents.FILE),
             },
             "roles/foobar": {
-                "README": MockContents("roles/foobar/README", MockContents.FILE)
+                "handlers": MockContents("roles/foobar/handlers", MockContents.DIR),
+                "README": MockContents("roles/foobar/README", MockContents.FILE),
             },
             "roles/empty-dir": {
+                "meta": MockContents("roles/empty-dir/meta", MockContents.DIR),
                 "README.whatever": MockContents(
                     "roles/empty-dir/README.whatever", MockContents.FILE
-                )
+                ),
+            },
+            "roles/foobaz": {
+                "baz": MockContents("roles/foobaz/baz", MockContents.DIR),
+            },
+            "roles/foobaz/baz": {
+                "vars": MockContents("roles/foobaz/baz/vars", MockContents.DIR),
+                "README.rst": MockContents(
+                    "roles/foobaz/baz/README.rst", MockContents.FILE
+                ),
             },
             "roles/foo/README.md": "# Just some Markdown",
             "roles/bar/README.txt": "Just some simple text\nwith a line break",
             "roles/foobar/README": "Simple text in a file without extension",
-            "roles/empty-dir/REAMDE.whatever": "This file won't be checked out",
+            "roles/empty-dir/README.whatever": "This file won't be checked out",
+            "roles/foobaz/baz/README.rst": "Simple readme file from nested role",
         },
         "orga1/repo3": {
             REPO_ROOT: {
@@ -215,19 +232,24 @@ def test_scrape():
         "orga1/repo2": (
             {},
             {
-                "foo": {
+                "bar": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/foo/README.md",
-                        "content": "# Just some Markdown",
+                        "path": "roles/bar/README.txt",
+                        "content": "Just some simple text\nwith a line break",
                     },
                     "changelog_file": None,
                 },
-                "bar": {
+                "empty-dir": {
+                    "last_changed": "2018-09-17 15:15:15",
+                    "readme_file": None,
+                    "changelog_file": None,
+                },
+                "foo": {
                     "last_changed": "2018-09-17 15:15:15",
                     "readme_file": {
-                        "path": "roles/bar/README.txt",
-                        "content": "Just some simple text\nwith a line break",
+                        "path": "roles/foo/README.md",
+                        "content": "# Just some Markdown",
                     },
                     "changelog_file": None,
                 },
@@ -239,9 +261,12 @@ def test_scrape():
                     },
                     "changelog_file": None,
                 },
-                "empty-dir": {
+                "foobaz/baz": {
                     "last_changed": "2018-09-17 15:15:15",
-                    "readme_file": None,
+                    "readme_file": {
+                        "path": "roles/foobaz/baz/README.rst",
+                        "content": "Simple readme file from nested role",
+                    },
                     "changelog_file": None,
                 },
             },
diff --git a/tests/scraper/test_repo_parser.py b/tests/scraper/test_repo_parser.py
index 8434775..baf5201 100644
--- a/tests/scraper/test_repo_parser.py
+++ b/tests/scraper/test_repo_parser.py
@@ -47,6 +47,7 @@ def test_parse(repo_data):
     job_5 = jobs[4]
     role_1 = [r for r in roles if r["role_name"] == "foo"][0]
     role_2 = [r for r in roles if r["role_name"] == "bar"][0]
+    role_3 = [r for r in roles if r["role_name"] == "foobar/baz"][0]
 
     expected_job_1 = {
         "job_name": "my-cool-new-job",
@@ -215,6 +216,39 @@ def test_parse(repo_data):
         "last_updated": "2018-09-17 15:15:15",
     }
 
+    expected_role_3 = {
+        "role_name": "foobar/baz",
+        "repo": "my/project",
+        "tenants": ["foo", "bar"],
+        "description": (
+            "Yet another simple description\n\n"
+            "**Role variables**\n\n"
+            ".. zuul:rolevar:: my_mandatory_variable\n\n"
+            "   This variable is mandatory.\n"
+        ),
+        "description_html": (
+            "<p>Yet another simple description</p>\n"
+            "<p><strong>Role variables</strong></p>\n"
+            '<dl class="zuul rolevar">\n'
+            '<dt class="sig sig-object zuul" id="rolevar-my_mandatory_variable">\n'
+            '<span class="sig-name descname">'
+            '<span class="pre">my_mandatory_variable</span>'
+            "</span>"
+            '<a class="headerlink" href="#rolevar-my_mandatory_variable" '
+            'title="Link to this definition">'
+            "¶</a><br /></dt>\n"
+            "<dd><p>This variable is mandatory.</p>\n"
+            "</dd></dl>\n"
+            "\n"
+        ),
+        "url": "https://github/my/project/tree/master/roles/foobar/baz",
+        "private": False,
+        "platforms": [],
+        "reusable": False,
+        "scrape_time": scrape_time,
+        "last_updated": "2018-09-17 15:15:15",
+    }
+
     # NOTE (fschmidt): Without the skip_empty flag, empty (= None) keys will
     # be stripped from the resulting dict.
     assert job_1.to_dict(skip_empty=False) == expected_job_1
@@ -224,6 +258,7 @@ def test_parse(repo_data):
     assert job_5.to_dict(skip_empty=False) == expected_job_5
     assert role_1.to_dict(skip_empty=False) == expected_role_1
     assert role_2.to_dict(skip_empty=False) == expected_role_2
+    assert role_3.to_dict(skip_empty=False) == expected_role_3
 
 
 def test_parse_reusable_repo(repo_data):
diff --git a/tests/testdata/repo_files/roles/foobar/baz/README.rst b/tests/testdata/repo_files/roles/foobar/baz/README.rst
new file mode 100644
index 0000000..7ecbbd3
--- /dev/null
+++ b/tests/testdata/repo_files/roles/foobar/baz/README.rst
@@ -0,0 +1,7 @@
+Yet another simple description
+
+**Role variables**
+
+.. zuul:rolevar:: my_mandatory_variable
+
+   This variable is mandatory.
diff --git a/zubbi/scraper/scraper.py b/zubbi/scraper/scraper.py
index b272b55..00219d6 100644
--- a/zubbi/scraper/scraper.py
+++ b/zubbi/scraper/scraper.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import logging
+from pathlib import Path
 
 from zubbi.scraper.exceptions import CheckoutError
 
@@ -29,6 +30,19 @@
 
 ROLES_DIRECTORY = "roles"
 
+# Role needs to contains at least one of those directories to be considered as role
+# https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_reuse_roles.html#role-directory-structure
+ROLE_MANDATORY_DIRS = [
+    "tasks",
+    "handlers",
+    "library",
+    "defaults",
+    "vars",
+    "files",
+    "templates",
+    "meta",
+]
+
 REPO_ROOT = "/"
 
 
@@ -146,29 +160,44 @@ def get_file_info(self, path):
 
     def scrape_role_files(self):
         role_files = {}
-        # We are only interested in the role name (=> directory name)
-        # and the README and CHANGELOG files. Thus, we don't need to
-        # iterate recursively over the roles directory as those files
-        # should be on the top-level per role.
+        # Roles might be grouped in some parent directory, so we need to
+        # recursively check nested directories.
+
+        # Listing repository files is based on the example in pyGithub doc:
+        # https://pygithub.readthedocs.io/en/v2.1.1/examples/Repository.html#get-all-of-the-contents-of-the-repository-recursively
         try:
-            roles = self.repo.directory_contents(ROLES_DIRECTORY)
-            for role_name, remote_file in roles.items():
+            # Ansible requires roles to be defined in a certain directory structure.
+            # Thus, we are only interested in directories
+            dirs_to_search = [
+                item
+                for item in self.repo.directory_contents(ROLES_DIRECTORY).values()
+                if item.type == "dir"
+            ]
+            while dirs_to_search:
                 try:
-                    if remote_file.type != "dir":
-                        # Ansible requires the role to be defined in a
-                        # certain directory structure. Thus, we can
-                        # ignore it in case it's not a directory.
-                        continue
-                    last_changed = self.repo.last_changed(remote_file.path)
-                    existing_files = self.repo.directory_contents(remote_file.path)
-                    # Skip empty directories or files
-                    if not existing_files:
+                    dir = dirs_to_search.pop(0)
+                    dir_items = self.repo.directory_contents(dir.path)
+
+                    subdirs = [
+                        item for item in dir_items.values() if item.type == "dir"
+                    ]
+                    # When directory does not contain one of role mandatory directories
+                    # it is not a role and its subdirectories (if any) should be further
+                    # scanned.
+                    # This is done by appending those subdirectories to 'dirs_to_search'
+                    # search list (this implements the recursive search)
+                    if set([d.name for d in subdirs]).isdisjoint(ROLE_MANDATORY_DIRS):
+                        dirs_to_search.extend(subdirs)
                         continue
-                    readme_file = self.find_matching_file(README_FILES, existing_files)
-                    changelog_file = self.find_matching_file(
-                        CHANGELOG_FILES, existing_files
-                    )
-                    role_files[role_name] = {
+
+                    # Once the role is found, we are only interested in the timestamp of
+                    # the latest update (the last git change), README and CHANGELOG files
+                    # Those files should be on the top-level per role.
+                    last_changed = self.repo.last_changed(dir.path)
+                    readme_file = self.find_matching_file(README_FILES, dir_items)
+                    changelog_file = self.find_matching_file(CHANGELOG_FILES, dir_items)
+                    # role name is the directory path relative to ROLES_DIRECTORY
+                    role_files[str(Path(dir.path).relative_to(ROLES_DIRECTORY))] = {
                         "last_changed": last_changed,
                         "readme_file": readme_file,
                         "changelog_file": changelog_file,
@@ -178,7 +207,8 @@ def scrape_role_files(self):
         except CheckoutError as e:
             LOGGER.debug(e)
 
-        return role_files
+        # sort keys (role names) alphabetically
+        return {key: value for key, value in sorted(role_files.items())}
 
     def find_matching_file(self, file_filter, existing_files):
         for filename, file_content in existing_files.items():