diff --git a/src/__init__.py b/src/__init__.py index 11b8225a..6422102f 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -29,11 +29,11 @@ from src.views.statistics.references import References from src.views.statistics.xhtml import Xhtml -# new stuff apr 2024 -from src.views.v2.article_cache_view_v2 import ArticleCacheV2 # new stuff jan 2024 from src.views.v2.article_view_v2 import ArticleV2 from src.views.version import Version +# new stuff apr 2024 +from src.views.v2.article_cache_view_v2 import ArticleCacheV2 # new stuff jun 2024 from src.views.v2.editref_v2 import EditRefV2 diff --git a/src/models/v2/job/editref_job_v2.py b/src/models/v2/job/editref_job_v2.py index 7f3a7b72..b5f3cc7d 100644 --- a/src/models/v2/job/editref_job_v2.py +++ b/src/models/v2/job/editref_job_v2.py @@ -1,30 +1,41 @@ -# import re -# from urllib.parse import quote, unquote +import re +from urllib.parse import quote, unquote # # import requests # # import config # from src.models.exceptions import MissingInformationError, WikipediaApiFetchError -# from src.models.wikimedia.enums import WikimediaDomain +from src.models.wikimedia.enums import WikimediaDomain +from src import MissingInformationError from src.models.v2.job import JobV2 class EditRefJobV2(JobV2): """job that supports EditRefV2 endpoint""" - target: str = "" - replace: str = "" + old_ref: str = "" + new_ref: str = "" + source: str = "" + wiki_page_url: str = "" + + wiki_lang: str = "" + wiki_domain: WikimediaDomain = WikimediaDomain.wikipedia + wiki_page_title: str = "" + wiki_revision: str = "" + + regex_wiki_url = r"https?://(\w+)\.(\w+\.\w+)/wiki/(.+)" + # NB this should be elevated to a globalish constant + + @property + def quoted_title(self): + if not self.wiki_page_title: + raise MissingInformationError("self.wiki_page_title is empty") + return quote(self.wiki_page_title, safe="") # these following (commented) functions might be useful when we # have a wikipage id rather than a string to describe source - # @property - # def quoted_title(self): - # if not self.title: - # raise MissingInformationError("self.title was empty") - # return quote(self.title, safe="") - # def get_mediawiki_ids(self) -> None: # from src import app # @@ -88,11 +99,35 @@ class EditRefJobV2(JobV2): # if not matches: # app.logger.error("Not a supported Wikimedia URL") - def validate_fields(self): """ any parameter checking done here... + + must have at least "source" or "wiki_page_url" defined """ - # self.__extract_url__() # may want to do something to parse wikipage id in the future - pass + from src import app + + if not self.source: + if not self.wiki_page_url: + app.logger.error('Parameters must contain a valid "source" or valid "wiki_page_url" value. ' + f"source: {self.source}, wiki_page_url: {self.wiki_page_url}") + raise MissingInformationError( + f'Parameters must contain a valid "source" or valid "wiki_page_url" value.' + ) + + # extract wiki parts from wiki_page_url? + my_url = unquote(self.wiki_page_url) + + matches = re.match(self.regex_wiki_url, my_url) + if matches: + groups = matches.groups() + self.wiki_lang = groups[0] + self.wiki_domain = WikimediaDomain(groups[1]) + self.wiki_page_title = groups[2] + else: + app.logger.error(f"{self.wiki_page_url} is not a supported Wikimedia URL") + raise MissingInformationError( + f"wiki_page_url parameter value ({self.wiki_page_url}) is invalid." + ) + diff --git a/src/models/v2/schema/editref_schema_v2.py b/src/models/v2/schema/editref_schema_v2.py index cbd96e64..bb6c55bf 100644 --- a/src/models/v2/schema/editref_schema_v2.py +++ b/src/models/v2/schema/editref_schema_v2.py @@ -8,9 +8,10 @@ class EditRefSchemaV2(BaseSchemaV2): # Defines expected parameters for EditRefV2 endpoint # - default parameters are defined in BaseSchemaV2 - target = fields.Str(required=True) - replace = fields.Str(required=True) - source = fields.Str(required=True) + old_ref = fields.Str(required=True) + new_ref = fields.Str(required=True) + source = fields.Str(required=False) + wiki_page_url = fields.Str(required=False) # noinspection PyUnusedLocal @post_load diff --git a/src/models/wikimedia/wikipedia/reference/generic.py b/src/models/wikimedia/wikipedia/reference/generic.py index 27660e21..4195a11c 100644 --- a/src/models/wikimedia/wikipedia/reference/generic.py +++ b/src/models/wikimedia/wikipedia/reference/generic.py @@ -420,6 +420,7 @@ def extract_and_check(self) -> None: self.__extract_templates_and_parameters__() self.__extract_reference_urls__() + self.__extract_unique_first_level_domains__() self.__generate_reference_id__() diff --git a/src/views/statistics/__init__.py b/src/views/statistics/__init__.py index 6493b8a2..d4960a4e 100644 --- a/src/views/statistics/__init__.py +++ b/src/views/statistics/__init__.py @@ -41,7 +41,7 @@ def __validate_and_get_job__(self): def __validate__(self): from src import app - app.logger.debug("StatisticsView::__validate__") + app.logger.debug("==> StatisticsView::__validate__") errors = self.schema.validate(request.args) if errors: @@ -52,7 +52,7 @@ def __validate__(self): def __parse_into_job__(self): from src import app - app.logger.debug("__parse_into_job__: running") + app.logger.debug("==> StatisticsView::__parse_into_job__") # app.logger.debug(request.args) if not self.schema: raise MissingInformationError() @@ -61,6 +61,6 @@ def __parse_into_job__(self): if not self.job: # this seems to be the case when there are no arguments, as in the # /version endpoint. Seems to be harmless not having a valid job property - console.print("self.job is null") + app.logger.info("StatisticsView: self.job is null") console.print(self.job) diff --git a/src/views/v2/editref_v2.py b/src/views/v2/editref_v2.py index 87a9b1ea..5d34d7ad 100644 --- a/src/views/v2/editref_v2.py +++ b/src/views/v2/editref_v2.py @@ -4,6 +4,11 @@ from typing import Any, Optional, Tuple import traceback +from dateutil.parser import isoparse + +import config +import requests + from src.models.exceptions import MissingInformationError, WikipediaApiFetchError from src.models.v2.schema.editref_schema_v2 import EditRefSchemaV2 @@ -26,9 +31,10 @@ class EditRefV2(StatisticsViewV2): replaces search string with replace string in source string, and returns results """ - schema = EditRefSchemaV2() # overrides StatisticsViewV2's schema property - job: EditRefJobV2 # overrides StatisticsViewV2's job property + schema = EditRefSchemaV2() # Defines expected parameters; Overrides StatisticsViewV2's "schema" property + job: EditRefJobV2 # Holds usable variables, seeded from schema. Overrides StatisticsViewV2's "job" + source_text = "" replaced_data = "" def get(self): @@ -39,7 +45,7 @@ def get(self): from src import app app.logger.debug("==> EditRefV2::get") - return self.__process_data__() + return self.__process_data__(method="get") def post(self): """ @@ -57,18 +63,39 @@ def __process_data__(self, method="get"): try: self.__validate_and_get_job__(method) # inherited from StatisticsViewV2 # - # validates via schema (a marshmallow feature) and sets job values wia schema's values + # validates schema params (a marshmallow feature), and sets job properties based on schema's values + """ + url = ( + f"https://{self.job.lang}.{self.job.domain.value}/" + f"w/rest.php/v1/page/{self.job.quoted_title}" + ) + headers = {"User-Agent": config.user_agent} + response = requests.get(url, headers=headers) + # console.print(response.json()) + if response.status_code == 200: + data = response.json() + self.job.revision = int(data["latest"]["id"]) + self.revision_isodate = isoparse(data["latest"]["timestamp"]) + self.revision_timestamp = round(self.revision_isodate.timestamp()) + self.page_id = int(data["id"]) + # logger.debug(f"Got pageid: {self.page_id}") + self.wikitext = data["source"] + """ + # set up source_text + self.__setup_source_text__() # setup source_text to be # set up results - self.__replace_data__() # sets self.replaced_data + self.__replace_data__() # self.replaced_data holds newly edited source # and return results - return { - "target": self.job.target, - "replace": self.job.replace, - "source": self.job.source, - "result": self.replaced_data - } + # return { + # "old_ref": self.job.old_ref, + # "new_ref": self.job.new_ref, + # # "source": self.job.source, + # "result": self.replaced_data + # } + return self.replaced_data + except MissingInformationError as e: app.logger.debug("after EditRefV2::self.__validate_and_get_job__ MissingInformationError exception") @@ -80,10 +107,68 @@ def __process_data__(self, method="get"): traceback.print_exc() return {"error": f"General Error: {str(e)}"}, 500 + def __setup_source_text__(self): + from src import app + app.logger.debug("==> EditRefV2::__setup_source_text__") + + """ + set source_text to: + job.source if non-empty + fetched wikitext based on wiki_page_url otherwise + - EXCEPTION No Wiki Page + - EXCEPTION General + """ + if self.job.source: + self.source_text = self.job.source + + else: + # grab wikitext from wiki_page_url + + url = ( + f"https://{self.job.wiki_lang}.{self.job.wiki_domain.value}/" + f"w/rest.php/v1/page/{self.job.quoted_title}" + ) + headers = {"User-Agent": config.user_agent} + response = requests.get(url, headers=headers) + + # console.print(response.json()) + app.logger.debug(f"==> EditRefV2::__setup_source_text__: url to grab is: {url}") + + if response.status_code == 200: + + data = response.json() + + self.job.wiki_revision = int(data["latest"]["id"]) + self.revision_isodate = isoparse(data["latest"]["timestamp"]) + self.revision_timestamp = round(self.revision_isodate.timestamp()) + self.page_id = int(data["id"]) + + self.source_text = data["source"] + + else: + # raise an exception because wiki page fetch was unsuccessful + app.logger.error(f"==> EditRefV2::__setup_source_text__: wikitext fetch was unsuccessful " + f"({self.job.wiki_page_url})") + def __replace_data__(self): + # takes source_text and applies replacement transformations on it from src import app app.logger.debug("==> EditRefV2::__replace_data__") - self.replaced_data = self.job.source.replace(self.job.target, self.job.replace) + app.logger.debug("==>") + app.logger.debug("==>") + app.logger.debug("==>") + + app.logger.debug("==>") + app.logger.debug("==> SOURCE") + app.logger.debug("==>") + app.logger.debug(self.source_text) + + self.replaced_data = self.source_text.replace(self.job.old_ref, self.job.new_ref) + + app.logger.debug("==>") + app.logger.debug("==> REPLACED") + app.logger.debug("==>") + app.logger.debug(self.replaced_data) diff --git a/src/views/v2/statistics/__init__.py b/src/views/v2/statistics/__init__.py index 78a1ccd7..0ad66511 100644 --- a/src/views/v2/statistics/__init__.py +++ b/src/views/v2/statistics/__init__.py @@ -45,11 +45,12 @@ def __read_from_cache__(self): def __validate_and_get_job__(self, method="get"): """ Validates request params, whether from GET or POST, and, - if successful, pulls those param values into job object + if successful, pulls param values into job's properties """ from src import app app.logger.debug(f"==> StatisticsViewV2::__validate_and_get_job__({method})") + # use args if GET, form if POST request_args = request.args if (method == "get") else request.form self.__validate__(request_args)