Skip to content

Commit

Permalink
code complete for editref endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
mojomonger committed Jul 2, 2024
1 parent 7722e6b commit 2769dc3
Show file tree
Hide file tree
Showing 17 changed files with 269 additions and 47 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "Internet Archive Reference Inventory (IARI)"
version = "4.4.0"
version = "4.4.1"
description = "API capable of fetching, extracting, transforming and storing reference information from Wikipedia articles, websites and PDFs as structured data."
authors = [
"Chris Lombardi <[email protected]>",
Expand Down
12 changes: 9 additions & 3 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
# new stuff jan 2024
from src.views.v2.article_view_v2 import ArticleV2
from src.views.version import Version
# new stuff jun 2024
from src.views.v2.editref_v2 import EditRefV2

logging.basicConfig(level=config.loglevel)
logger = logging.getLogger(__name__)
Expand All @@ -52,15 +54,16 @@ def add_cors_headers(response):
# let's see if we can distinguish which server we are on
server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server')

# Register the function as a after_request handler
# Register the function as an after_request handler
app.after_request(add_cors_headers)

# We use a prefix here to enable us to stabilize the api over time
# and bump the version when making breaking changes
api = Api(app, prefix="/v2")

# Here we link together the API views and endpoint urls
# api.add_resource(LookupByWikidataQid, "/wikidata-qid/<string:qid>")
# link the API views to respective endpoint urls
api.add_resource(EditRefV2, "/editref")

api.add_resource(ArticleV2, "/article")
api.add_resource(ArticleCacheV2, "/article_cache")

Expand All @@ -75,6 +78,9 @@ def add_cors_headers(response):
api.add_resource(Reference, "/statistics/reference/<string:reference_id>")
api.add_resource(Pdf, "/statistics/pdf")
api.add_resource(Xhtml, "/statistics/xhtml")

# api.add_resource(LookupByWikidataQid, "/wikidata-qid/<string:qid>")

# return app_
# api.add_resource(
# AddJobToQueue, "/add-job"
Expand Down
1 change: 1 addition & 0 deletions src/models/v2/file_io/article_file_io_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


class ArticleFileIoV2(FileIo):

data: Optional[Dict[str, Any]] = None
subfolder = "articlesV2/"
job: Optional[ArticleJobV2]
Expand Down
98 changes: 98 additions & 0 deletions src/models/v2/job/editref_job_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# import re
# from urllib.parse import quote, unquote
#
# import requests
#
# import config
# from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
# from src.models.wikimedia.enums import WikimediaDomain
from src.models.v2.job import JobV2


class EditRefJobV2(JobV2):
    """Job model backing the EditRefV2 endpoint.

    Carries the three string parameters required for a plain
    search-and-replace edit of reference wikitext:

        target  - substring to look for
        replace - substring substituted for each occurrence of target
        source  - the text being edited
    """

    target: str = ""
    replace: str = ""
    source: str = ""

    # NOTE: the first draft of this file kept commented-out helpers here
    # (quoted_title, get_mediawiki_ids, __urldecode_url__, __extract_url__)
    # for resolving a wiki page id/revision from a URL via the MediaWiki
    # REST API (w/rest.php/v1/page/<title>).  They may become relevant
    # again if this endpoint ever accepts a wikipage id instead of a raw
    # source string — see git history for the full sketches.

    def validate_fields(self):
        """Hook for parameter validation; currently a no-op.

        Invoked by the schema right after the job object is constructed.
        Add any cross-field checks here (e.g. URL/wikipage-id parsing)
        when the endpoint grows beyond plain string replacement.
        """
        pass
29 changes: 29 additions & 0 deletions src/models/v2/schema/editref_schema_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from marshmallow import fields, post_load

from src.models.v2.job.editref_job_v2 import EditRefJobV2
from src.models.v2.schema import BaseSchemaV2


class EditRefSchemaV2(BaseSchemaV2):
    """Marshmallow schema describing the request parameters of the
    EditRefV2 endpoint; shared/default parameters come from BaseSchemaV2."""

    target = fields.Str(required=True)
    replace = fields.Str(required=True)
    source = fields.Str(required=True)

    # noinspection PyUnusedLocal
    @post_load
    def return_job_object(self, data, **kwargs) -> EditRefJobV2:  # type: ignore # dead: disable
        """Marshmallow post-load hook: turn validated request args into a job.

        ``**kwargs`` must stay in the signature even though it looks unused:
        marshmallow passes extra keyword arguments (``many``, ``partial``)
        to load hooks.
        """
        from src import app

        app.logger.debug("==> EditRefSchemaV2::@post_load:return_job_object")

        edit_job = EditRefJobV2(**data)
        edit_job.validate_fields()

        # field values on edit_job could still be adjusted here before returning
        return edit_job
2 changes: 1 addition & 1 deletion src/models/v2/wikimedia/wikipedia/url_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class WikipediaUrlV2(BaseModel):
"""models a Wikipedia URL
It uses BaseModel to avoid the cache attribute (vsIariBaseModel),
It uses BaseModel to avoid the cache attribute (vs IariBaseModel),
so we can output it via the API easily WTF?
We do not perform any checking or lookup here that requires HTTP requests.
Expand Down
Empty file removed src/models/v3/wikipedia/section.py
Empty file.
2 changes: 1 addition & 1 deletion src/views/check_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CheckDoi(StatisticsWriteView):

job: Optional[CheckDoiJob] = None
schema: CheckDoiSchema = CheckDoiSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class CheckUrl(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: Schema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_url_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CheckUrlArchive(StatisticsWriteView):

job: Optional[UrlArchiveJob] = None
schema: Schema = UrlArchiveSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class CheckUrls(StatisticsWriteView):
"""
job: Optional[UrlsJob] = None
schema: Schema = UrlsSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class StatisticsView(Resource):
job: Optional[Job]

time_of_analysis: Optional[datetime] = None
serving_from_json: bool = False
# ### serving_from_json: bool = False

io: Optional[FileIo] = None

Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Pdf(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: UrlSchema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/xhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Xhtml(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: UrlSchema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
89 changes: 89 additions & 0 deletions src/views/v2/editref_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# from flask_restful import Resource, abort # type: ignore
# from marshmallow import Schema
from datetime import datetime
from typing import Any, Optional, Tuple
import traceback

from src.models.exceptions import MissingInformationError, WikipediaApiFetchError

from src.models.v2.schema.editref_schema_v2 import EditRefSchemaV2
from src.models.v2.job.editref_job_v2 import EditRefJobV2

# from src.models.v2.file_io.article_file_io_v2 import ArticleFileIoV2
# from src.models.v2.wikimedia.wikipedia.analyzer_v2 import WikipediaAnalyzerV2
# from src.models.wikimedia.enums import AnalyzerReturnValues, WikimediaDomain
from src.views.v2.statistics import StatisticsViewV2


from src.helpers.get_version import get_poetry_version


class EditRefV2(StatisticsViewV2):
    # TODO Since no setup_io is needed for this endpoint, we could maybe
    # base this on an "Execution" view? or a generic "Action" view?

    """
    Replaces every occurrence of the target string with the replace string
    in the source string, and returns the result.

    Parameters (validated by EditRefSchemaV2 from GET args or POST body):
        target  - substring to search for
        replace - substring substituted for each occurrence of target
        source  - text to perform the replacement on
    """

    schema = EditRefSchemaV2()  # overrides StatisticsViewV2's schema property
    job: EditRefJobV2  # overrides StatisticsViewV2's job property

    # result of the replacement; filled in by __replace_data__
    replaced_data = ""

    def get(self):
        """
        flask GET entrypoint for returning editref results
        must return a tuple: (Any,response_code)
        """
        from src import app
        app.logger.debug("==> EditRefV2::get")

        return self.__process_data__()

    def post(self):
        """
        flask POST entrypoint for returning editref results
        must return a tuple: (Any,response_code)
        """
        from src import app
        app.logger.debug("==> EditRefV2::post")

        return self.__process_data__(method="post")

    def __process_data__(self, method="get"):
        """
        Validate request parameters, perform the replacement, and build
        the (payload, http_status) response tuple shared by GET and POST.
        """
        from src import app
        try:
            self.__validate_and_get_job__(method)  # inherited from StatisticsViewV2
            #
            # validates via schema (a marshmallow feature) and sets job values via schema's values

            # set up results
            self.__replace_data__()  # sets self.replaced_data

            # return results; explicit 200 honors the documented
            # "(Any, response_code)" contract of the entrypoints above
            return {
                "target": self.job.target,
                "replace": self.job.replace,
                "source": self.job.source,
                "result": self.replaced_data
            }, 200

        except MissingInformationError as e:
            # TODO(review): a missing request parameter is arguably a client
            # error (400) rather than 500 — confirm with API consumers
            # before changing the status code
            app.logger.debug("after EditRefV2::self.__validate_and_get_job__ MissingInformationError exception")
            traceback.print_exc()
            return {"error": f"Missing Information Error: {str(e)}"}, 500

        except Exception as e:
            app.logger.debug("after EditRefV2::self.__validate_and_get_job__ exception")
            traceback.print_exc()
            return {"error": f"General Error: {str(e)}"}, 500

    def __replace_data__(self):
        """Compute job.source with every occurrence of job.target replaced
        by job.replace, storing the result in self.replaced_data."""
        from src import app
        app.logger.debug("==> EditRefV2::__replace_data__")

        self.replaced_data = self.job.source.replace(self.job.target, self.job.replace)

Loading

0 comments on commit 2769dc3

Please sign in to comment.