Add additional extraction options

This change primarily adds two new extraction options:

- max-pages: This will extract up to the specified number of pages.
- skip-percent: This will skip the specified percentage of all pages.

skip-percent is useful when you want to get periodic samples instead of pulling every page for entities that have thousands of pages.
SatelliteQE · Jun 21, 2024 · 83470eb · 83470eb
1 parent 5bbdd0c
commit 83470eb
Show file tree

Hide file tree

Showing 4 changed files with 60 additions and 23 deletions.
diff --git a/candore/__init__.py b/candore/__init__.py
@@ -21,7 +21,7 @@ def __init__(self, settings):
     def list_endpoints(self):
         return self.api_lister.lister_endpoints()
 
-    async def save_all_entities(self, mode, output_file, full):
+    async def save_all_entities(self, mode, output_file, full, max_pages=None, skip_percent=None):
         """Save all the entities to a json file
 
         :param mode: Pre or Post
@@ -36,6 +36,8 @@ async def save_all_entities(self, mode, output_file, full):
         async with Extractor(settings=self.settings, apilister=self.api_lister) as extractor:
             if full:
                 extractor.full = True
+            extractor.max_pages = max_pages
+            extractor.skip_percent = skip_percent
             data = await extractor.extract_all_entities()
 
         if not data:

diff --git a/candore/cli.py b/candore/cli.py
@@ -44,11 +44,21 @@ def apis(ctx):
 @click.option("--mode", type=str, help="The mode must be 'pre' or 'post'")
 @click.option("-o", "--output", type=str, help="The output file name")
 @click.option("--full", is_flag=True, help="Extract data from all the pages of a component")
+@click.option("--max-pages", type=int, help="The maximum number of pages to extract per entity")
+@click.option("--skip-percent", type=int, help="The percentage of pages to skip per entity")
 @click.pass_context
-def extract(ctx, mode, output, full):
+def extract(ctx, mode, output, full, max_pages, skip_percent):
     loop = asyncio.get_event_loop()
     candore_obj = ctx.parent.candore
-    loop.run_until_complete(candore_obj.save_all_entities(mode=mode, output_file=output, full=full))
+    loop.run_until_complete(
+        candore_obj.save_all_entities(
+            mode=mode,
+            output_file=output,
+            full=full,
+            max_pages=max_pages,
+            skip_percent=skip_percent,
+        )
+    )
 
 
 @candore.command(help="Compare pre and post upgrade data")
@@ -86,9 +96,11 @@ def compare(ctx, pre, post, inverse, output, report_type, record_evs):
     "e.g entity/5/description",
 )
 @click.option(
-    "--data-file", type=str, help="The data file from which to search the data on a given path"
+    "--data-file",
+    type=str,
+    help="The data file from which to search the data on a given path",
 )
-@click.option("--delimiter", type=str, default='/', help="Settings file path. Default is '/'")
+@click.option("--delimiter", type=str, default="/", help="Settings file path. Default is '/'")
 @click.pass_context
 def reader(ctx, path, data_file, delimiter):
     candore_obj = ctx.parent.candore

diff --git a/candore/modules/comparator.py b/candore/modules/comparator.py
@@ -1,7 +1,9 @@
 import json
 
-from candore.modules.variations import Variations, Constants
-from candore.utils import last_index_of_element, is_list_contains_dict
+from candore.modules.variations import Constants
+from candore.modules.variations import Variations
+from candore.utils import is_list_contains_dict
+from candore.utils import last_index_of_element
 
 
 class Comparator:
@@ -29,7 +31,10 @@ def record_variation(self, pre, post, var_details=None):
         big_key = [str(itm) for itm in self.big_key]
         full_path = "/".join(big_key)
         var_full_path = "/".join([itm for itm in self.big_key if not isinstance(itm, int)])
-        if var_full_path in self.variations.expected_variations or var_full_path in self.variations.skipped_variations:
+        if (
+            var_full_path in self.variations.expected_variations
+            or var_full_path in self.variations.skipped_variations
+        ):
             if self.record_evs:
                 variation = {
                     "pre": pre,
@@ -48,7 +53,10 @@ def record_constants(self, pre, post, var_details=None):
         big_key = [str(itm) for itm in self.big_key]
         full_path = "/".join(big_key)
         var_full_path = "/".join([itm for itm in self.big_key if not isinstance(itm, int)])
-        if var_full_path in self.constants.expected_constants or var_full_path in self.constants.skipped_constants:
+        if (
+            var_full_path in self.constants.expected_constants
+            or var_full_path in self.constants.skipped_constants
+        ):
             if self.record_evs:
                 variation = {
                     "pre": pre,
@@ -95,7 +103,7 @@ def _is_data_type_list_contains_dict(self, pre, post):
                         )
                 else:
                     key = list(pre_entity.keys())[0]
-                    if pre_entity[key] == post_entity[key]:
+                    if pre_entity[key] == post_entity.get(key):
                         self.compare_all_pres_with_posts(
                             pre_entity[key], post_entity[key], unique_key=key
                         )
@@ -105,7 +113,6 @@ def _is_data_type_list_contains_dict(self, pre, post):
                 self.remove_path(pre_entity[list(pre_entity.keys())[0]])
 
     def _is_data_type_list(self, pre, post, unique_key=""):
-
         def custom_key(elem):
             return 'None' if elem is None else str(elem)
 
@@ -121,9 +128,9 @@ def custom_key(elem):
     def compare_all_pres_with_posts(self, pre_data, post_data, unique_key="", var_details=None):
         if unique_key:
             self.big_key.append(unique_key)
-        if type(pre_data) is dict:
+        if isinstance(pre_data, dict):
             self._is_data_type_dict(pre_data, post_data, unique_key=unique_key)
-        elif type(pre_data) is list:
+        elif isinstance(pre_data, list):
             self._is_data_type_list(pre_data, post_data, unique_key=unique_key)
         else:
             if pre_data != post_data:

diff --git a/candore/modules/extractor.py b/candore/modules/extractor.py
@@ -1,4 +1,5 @@
 import asyncio  # noqa: F401
+import math
 from functools import cached_property
 
 import aiohttp
@@ -68,9 +69,18 @@ async def fetch_page(self, page, _request):
             page_entities = await self.paged_results(**_request)
             return page_entities
 
-    async def fetch_all_pages(self, total_pages, _request):
+    async def fetch_all_pages(self, total_pages, _request, max_pages=None, skip_percent=None):
+        if max_pages:
+            stop = min(total_pages, max_pages)
+        else:
+            stop = total_pages
+        if skip_percent:
+            step = stop // math.ceil(stop * (100 - skip_percent) / 100)
+        else:
+            step = 1
         tasks = []
-        for page in range(2, total_pages + 1):
+        print(f"Fetching {len(list(range(1, stop, step)))} more page(s).")
+        for page in range(1, stop, step):
             task = asyncio.ensure_future(self.fetch_page(page, _request))
             tasks.append(task)
         responses = await asyncio.gather(*tasks)
@@ -96,15 +106,21 @@ async def fetch_component_entities(self, **comp_params):
                     return entity_data
             else:
                 return entity_data
-        # If the entity has multiple pages, fetch them all
-        if self.full:
-            total_pages = results.get("total") // results.get("per_page") + 1
-            if total_pages > 1:
-                print(f"Endpoint {endpoint} has {total_pages} pages.")
+        total_pages = results.get("total") // results.get("per_page") + 1
+        if total_pages > 1:
+            print(f"Endpoint {endpoint} has {total_pages} pages.")
+            # If the entity has multiple pages, fetch them all
+            if self.full:
                 pages_data = await self.fetch_all_pages(total_pages, _request)
-                for page_entities in pages_data:
-                    if page_entities:
-                        entity_data.extend(page_entities)
+            elif self.max_pages or self.skip_percent:
+                pages_data = await self.fetch_all_pages(
+                    total_pages, _request, self.max_pages, self.skip_percent
+                )
+            else:
+                return entity_data
+            for page_entities in pages_data:
+                if page_entities:
+                    entity_data.extend(page_entities)
         return entity_data
 
     async def dependency_ids(self, dependency):