Skip to content

Commit

Permalink
code complete for editref endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
mojomonger committed Jul 2, 2024
1 parent 7722e6b commit 2769dc3
Show file tree
Hide file tree
Showing 17 changed files with 269 additions and 47 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "Internet Archive Reference Inventory (IARI)"
version = "4.4.0"
version = "4.4.1"
description = "API capable of fetching, extracting, transforming and storing reference information from Wikipedia articles, websites and PDFs as structured data."
authors = [
"Chris Lombardi <[email protected]>",
Expand Down
12 changes: 9 additions & 3 deletions src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
# new stuff jan 2024
from src.views.v2.article_view_v2 import ArticleV2
from src.views.version import Version
# new stuff jun 2024
from src.views.v2.editref_v2 import EditRefV2

logging.basicConfig(level=config.loglevel)
logger = logging.getLogger(__name__)
Expand All @@ -52,15 +54,16 @@ def add_cors_headers(response):
# let's see if we can distinguish which server we are on
server_name = os.getenv('FLASK_SERVER_NAME', 'Unknown Server')

# Register the function as a after_request handler
# Register the function as an after_request handler
app.after_request(add_cors_headers)

# We use a prefix here to enable us to stabilize the api over time
# and bump the version when making breaking changes
api = Api(app, prefix="/v2")

# Here we link together the API views and endpoint urls
# api.add_resource(LookupByWikidataQid, "/wikidata-qid/<string:qid>")
# link the API views to respective endpoint urls
api.add_resource(EditRefV2, "/editref")

api.add_resource(ArticleV2, "/article")
api.add_resource(ArticleCacheV2, "/article_cache")

Expand All @@ -75,6 +78,9 @@ def add_cors_headers(response):
api.add_resource(Reference, "/statistics/reference/<string:reference_id>")
api.add_resource(Pdf, "/statistics/pdf")
api.add_resource(Xhtml, "/statistics/xhtml")

# api.add_resource(LookupByWikidataQid, "/wikidata-qid/<string:qid>")

# return app_
# api.add_resource(
# AddJobToQueue, "/add-job"
Expand Down
1 change: 1 addition & 0 deletions src/models/v2/file_io/article_file_io_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@


class ArticleFileIoV2(FileIo):

data: Optional[Dict[str, Any]] = None
subfolder = "articlesV2/"
job: Optional[ArticleJobV2]
Expand Down
98 changes: 98 additions & 0 deletions src/models/v2/job/editref_job_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# import re
# from urllib.parse import quote, unquote
#
# import requests
#
# import config
# from src.models.exceptions import MissingInformationError, WikipediaApiFetchError
# from src.models.wikimedia.enums import WikimediaDomain
from src.models.v2.job import JobV2


class EditRefJobV2(JobV2):
    """Job model backing the EditRefV2 endpoint.

    Carries the three string parameters required for a plain
    search-and-replace edit of reference wikitext:

        target  - substring to look for
        replace - substring substituted for each occurrence of target
        source  - the text being edited
    """

    target: str = ""
    replace: str = ""
    source: str = ""

    # NOTE: the first draft of this file kept commented-out helpers here
    # (quoted_title, get_mediawiki_ids, __urldecode_url__, __extract_url__)
    # for resolving a wiki page id/revision from a URL via the MediaWiki
    # REST API (w/rest.php/v1/page/<title>).  They may become relevant
    # again if this endpoint ever accepts a wikipage id instead of a raw
    # source string — see git history for the full sketches.

    def validate_fields(self):
        """Hook for parameter validation; currently a no-op.

        Invoked by the schema right after the job object is constructed.
        Add any cross-field checks here (e.g. URL/wikipage-id parsing)
        when the endpoint grows beyond plain string replacement.
        """
        pass
29 changes: 29 additions & 0 deletions src/models/v2/schema/editref_schema_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from marshmallow import fields, post_load

from src.models.v2.job.editref_job_v2 import EditRefJobV2
from src.models.v2.schema import BaseSchemaV2


class EditRefSchemaV2(BaseSchemaV2):
    """Marshmallow schema describing the request parameters of the
    EditRefV2 endpoint; shared/default parameters come from BaseSchemaV2."""

    target = fields.Str(required=True)
    replace = fields.Str(required=True)
    source = fields.Str(required=True)

    # noinspection PyUnusedLocal
    @post_load
    def return_job_object(self, data, **kwargs) -> EditRefJobV2:  # type: ignore # dead: disable
        """Marshmallow post-load hook: turn validated request args into a job.

        ``**kwargs`` must stay in the signature even though it looks unused:
        marshmallow passes extra keyword arguments (``many``, ``partial``)
        to load hooks.
        """
        from src import app

        app.logger.debug("==> EditRefSchemaV2::@post_load:return_job_object")

        edit_job = EditRefJobV2(**data)
        edit_job.validate_fields()

        # field values on edit_job could still be adjusted here before returning
        return edit_job
2 changes: 1 addition & 1 deletion src/models/v2/wikimedia/wikipedia/url_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
class WikipediaUrlV2(BaseModel):
"""models a Wikipedia URL
It uses BaseModel to avoid the cache attribute (vsIariBaseModel),
It uses BaseModel to avoid the cache attribute (vs IariBaseModel),
so we can output it via the API easily WTF?
We do not perform any checking or lookup here that requires HTTP requests.
Expand Down
Empty file removed src/models/v3/wikipedia/section.py
Empty file.
2 changes: 1 addition & 1 deletion src/views/check_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CheckDoi(StatisticsWriteView):

job: Optional[CheckDoiJob] = None
schema: CheckDoiSchema = CheckDoiSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class CheckUrl(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: Schema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_url_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CheckUrlArchive(StatisticsWriteView):

job: Optional[UrlArchiveJob] = None
schema: Schema = UrlArchiveSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/check_urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class CheckUrls(StatisticsWriteView):
"""
job: Optional[UrlsJob] = None
schema: Schema = UrlsSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class StatisticsView(Resource):
job: Optional[Job]

time_of_analysis: Optional[datetime] = None
serving_from_json: bool = False
# ### serving_from_json: bool = False

io: Optional[FileIo] = None

Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class Pdf(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: UrlSchema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
2 changes: 1 addition & 1 deletion src/views/statistics/xhtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Xhtml(StatisticsWriteView):

job: Optional[UrlJob] = None
schema: UrlSchema = UrlSchema()
serving_from_json: bool = False
# ### serving_from_json: bool = False
headers: Optional[Dict[str, Any]] = None
# {
# "Access-Control-Allow-Origin": "*",
Expand Down
89 changes: 89 additions & 0 deletions src/views/v2/editref_v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# from flask_restful import Resource, abort # type: ignore
# from marshmallow import Schema
from datetime import datetime
from typing import Any, Optional, Tuple
import traceback

from src.models.exceptions import MissingInformationError, WikipediaApiFetchError

from src.models.v2.schema.editref_schema_v2 import EditRefSchemaV2
from src.models.v2.job.editref_job_v2 import EditRefJobV2

# from src.models.v2.file_io.article_file_io_v2 import ArticleFileIoV2
# from src.models.v2.wikimedia.wikipedia.analyzer_v2 import WikipediaAnalyzerV2
# from src.models.wikimedia.enums import AnalyzerReturnValues, WikimediaDomain
from src.views.v2.statistics import StatisticsViewV2


from src.helpers.get_version import get_poetry_version


class EditRefV2(StatisticsViewV2):
    # TODO Since no setup_io is needed for this endpoint, we could maybe
    # base this on an "Execution" view? or a generic "Action" view?

    """
    Replaces every occurrence of the target string with the replace string
    in the source string, and returns the result.

    Parameters (validated by EditRefSchemaV2 from GET args or POST body):
        target  - substring to search for
        replace - substring substituted for each occurrence of target
        source  - text to perform the replacement on
    """

    schema = EditRefSchemaV2()  # overrides StatisticsViewV2's schema property
    job: EditRefJobV2  # overrides StatisticsViewV2's job property

    # result of the replacement; filled in by __replace_data__
    replaced_data = ""

    def get(self):
        """
        flask GET entrypoint for returning editref results
        must return a tuple: (Any,response_code)
        """
        from src import app
        app.logger.debug("==> EditRefV2::get")

        return self.__process_data__()

    def post(self):
        """
        flask POST entrypoint for returning editref results
        must return a tuple: (Any,response_code)
        """
        from src import app
        app.logger.debug("==> EditRefV2::post")

        return self.__process_data__(method="post")

    def __process_data__(self, method="get"):
        """
        Validate request parameters, perform the replacement, and build
        the (payload, http_status) response tuple shared by GET and POST.
        """
        from src import app
        try:
            self.__validate_and_get_job__(method)  # inherited from StatisticsViewV2
            #
            # validates via schema (a marshmallow feature) and sets job values via schema's values

            # set up results
            self.__replace_data__()  # sets self.replaced_data

            # return results; explicit 200 honors the documented
            # "(Any, response_code)" contract of the entrypoints above
            return {
                "target": self.job.target,
                "replace": self.job.replace,
                "source": self.job.source,
                "result": self.replaced_data
            }, 200

        except MissingInformationError as e:
            # TODO(review): a missing request parameter is arguably a client
            # error (400) rather than 500 — confirm with API consumers
            # before changing the status code
            app.logger.debug("after EditRefV2::self.__validate_and_get_job__ MissingInformationError exception")
            traceback.print_exc()
            return {"error": f"Missing Information Error: {str(e)}"}, 500

        except Exception as e:
            app.logger.debug("after EditRefV2::self.__validate_and_get_job__ exception")
            traceback.print_exc()
            return {"error": f"General Error: {str(e)}"}, 500

    def __replace_data__(self):
        """Compute job.source with every occurrence of job.target replaced
        by job.replace, storing the result in self.replaced_data."""
        from src import app
        app.logger.debug("==> EditRefV2::__replace_data__")

        self.replaced_data = self.job.source.replace(self.job.target, self.job.replace)

Loading

0 comments on commit 2769dc3

Please sign in to comment.