Skip to content

Commit

Permalink
Add additional extraction options
Browse files Browse the repository at this point in the history
This change primarily adds two new extraction options:
- max-pages: Extract at most the specified number of pages per entity
- skip-percent: Skip the specified percentage of pages per entity

skip-percent is useful when you want periodic samples instead of
pulling every page for entities that have thousands of pages.
  • Loading branch information
JacobCallahan committed Jun 21, 2024
1 parent 5bbdd0c commit 83470eb
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 23 deletions.
4 changes: 3 additions & 1 deletion candore/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def __init__(self, settings):
def list_endpoints(self):
return self.api_lister.lister_endpoints()

async def save_all_entities(self, mode, output_file, full):
async def save_all_entities(self, mode, output_file, full, max_pages=None, skip_percent=None):
"""Save all the entities to a json file
:param mode: Pre or Post
Expand All @@ -36,6 +36,8 @@ async def save_all_entities(self, mode, output_file, full):
async with Extractor(settings=self.settings, apilister=self.api_lister) as extractor:
if full:
extractor.full = True
extractor.max_pages = max_pages
extractor.skip_percent = skip_percent
data = await extractor.extract_all_entities()

if not data:
Expand Down
20 changes: 16 additions & 4 deletions candore/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,21 @@ def apis(ctx):
@click.option("--mode", type=str, help="The mode must be 'pre' or 'post'")
@click.option("-o", "--output", type=str, help="The output file name")
@click.option("--full", is_flag=True, help="Extract data from all the pages of a component")
@click.option("--max-pages", type=int, help="The maximum number of pages to extract per entity")
@click.option("--skip-percent", type=int, help="The percentage of pages to skip per entity")
@click.pass_context
def extract(ctx, mode, output, full):
def extract(ctx, mode, output, full, max_pages, skip_percent):
loop = asyncio.get_event_loop()
candore_obj = ctx.parent.candore
loop.run_until_complete(candore_obj.save_all_entities(mode=mode, output_file=output, full=full))
loop.run_until_complete(
candore_obj.save_all_entities(
mode=mode,
output_file=output,
full=full,
max_pages=max_pages,
skip_percent=skip_percent,
)
)


@candore.command(help="Compare pre and post upgrade data")
Expand Down Expand Up @@ -86,9 +96,11 @@ def compare(ctx, pre, post, inverse, output, report_type, record_evs):
"e.g entity/5/description",
)
@click.option(
"--data-file", type=str, help="The data file from which to search the data on a given path"
"--data-file",
type=str,
help="The data file from which to search the data on a given path",
)
@click.option("--delimiter", type=str, default='/', help="Settings file path. Default is '/'")
@click.option("--delimiter", type=str, default="/", help="Settings file path. Default is '/'")
@click.pass_context
def reader(ctx, path, data_file, delimiter):
candore_obj = ctx.parent.candore
Expand Down
23 changes: 15 additions & 8 deletions candore/modules/comparator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import json

from candore.modules.variations import Variations, Constants
from candore.utils import last_index_of_element, is_list_contains_dict
from candore.modules.variations import Constants
from candore.modules.variations import Variations
from candore.utils import is_list_contains_dict
from candore.utils import last_index_of_element


class Comparator:
Expand Down Expand Up @@ -29,7 +31,10 @@ def record_variation(self, pre, post, var_details=None):
big_key = [str(itm) for itm in self.big_key]
full_path = "/".join(big_key)
var_full_path = "/".join([itm for itm in self.big_key if not isinstance(itm, int)])
if var_full_path in self.variations.expected_variations or var_full_path in self.variations.skipped_variations:
if (
var_full_path in self.variations.expected_variations
or var_full_path in self.variations.skipped_variations
):
if self.record_evs:
variation = {
"pre": pre,
Expand All @@ -48,7 +53,10 @@ def record_constants(self, pre, post, var_details=None):
big_key = [str(itm) for itm in self.big_key]
full_path = "/".join(big_key)
var_full_path = "/".join([itm for itm in self.big_key if not isinstance(itm, int)])
if var_full_path in self.constants.expected_constants or var_full_path in self.constants.skipped_constants:
if (
var_full_path in self.constants.expected_constants
or var_full_path in self.constants.skipped_constants
):
if self.record_evs:
variation = {
"pre": pre,
Expand Down Expand Up @@ -95,7 +103,7 @@ def _is_data_type_list_contains_dict(self, pre, post):
)
else:
key = list(pre_entity.keys())[0]
if pre_entity[key] == post_entity[key]:
if pre_entity[key] == post_entity.get(key):
self.compare_all_pres_with_posts(
pre_entity[key], post_entity[key], unique_key=key
)
Expand All @@ -105,7 +113,6 @@ def _is_data_type_list_contains_dict(self, pre, post):
self.remove_path(pre_entity[list(pre_entity.keys())[0]])

def _is_data_type_list(self, pre, post, unique_key=""):

def custom_key(elem):
return 'None' if elem is None else str(elem)

Expand All @@ -121,9 +128,9 @@ def custom_key(elem):
def compare_all_pres_with_posts(self, pre_data, post_data, unique_key="", var_details=None):
if unique_key:
self.big_key.append(unique_key)
if type(pre_data) is dict:
if isinstance(pre_data, dict):
self._is_data_type_dict(pre_data, post_data, unique_key=unique_key)
elif type(pre_data) is list:
elif isinstance(pre_data, list):
self._is_data_type_list(pre_data, post_data, unique_key=unique_key)
else:
if pre_data != post_data:
Expand Down
36 changes: 26 additions & 10 deletions candore/modules/extractor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio # noqa: F401
import math
from functools import cached_property

import aiohttp
Expand Down Expand Up @@ -68,9 +69,18 @@ async def fetch_page(self, page, _request):
page_entities = await self.paged_results(**_request)
return page_entities

async def fetch_all_pages(self, total_pages, _request):
async def fetch_all_pages(self, total_pages, _request, max_pages=None, skip_percent=None):
if max_pages:
stop = min(total_pages, max_pages)
else:
stop = total_pages
if skip_percent:
step = stop // math.ceil(stop * (100 - skip_percent) / 100)
else:
step = 1
tasks = []
for page in range(2, total_pages + 1):
print(f"Fetching {len(list(range(1, stop, step)))} more page(s).")
for page in range(1, stop, step):
task = asyncio.ensure_future(self.fetch_page(page, _request))
tasks.append(task)
responses = await asyncio.gather(*tasks)
Expand All @@ -96,15 +106,21 @@ async def fetch_component_entities(self, **comp_params):
return entity_data
else:
return entity_data
# If the entity has multiple pages, fetch them all
if self.full:
total_pages = results.get("total") // results.get("per_page") + 1
if total_pages > 1:
print(f"Endpoint {endpoint} has {total_pages} pages.")
total_pages = results.get("total") // results.get("per_page") + 1
if total_pages > 1:
print(f"Endpoint {endpoint} has {total_pages} pages.")
# If the entity has multiple pages, fetch them all
if self.full:
pages_data = await self.fetch_all_pages(total_pages, _request)
for page_entities in pages_data:
if page_entities:
entity_data.extend(page_entities)
elif self.max_pages or self.skip_percent:
pages_data = await self.fetch_all_pages(
total_pages, _request, self.max_pages, self.skip_percent
)
else:
return entity_data
for page_entities in pages_data:
if page_entities:
entity_data.extend(page_entities)
return entity_data

async def dependency_ids(self, dependency):
Expand Down

0 comments on commit 83470eb

Please sign in to comment.