From e5e00ccdd5e9cd9ead8ef0f0b408036e8eb58ba3 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 8 Nov 2023 14:21:14 -0500 Subject: [PATCH] more api cleanup, documentation, polishing --- bedhost/const.py | 2 +- bedhost/helpers.py | 8 +- bedhost/main.py | 127 +++++++++++++------- bedhost/routers/bed_api.py | 231 ++++-------------------------------- bedhost/templates/page.html | 53 +++++++++ docs/README.md | 3 + docs/about.md | 6 + docs/guide.md | 35 ++++++ docs/index.md | 9 ++ 9 files changed, 216 insertions(+), 258 deletions(-) create mode 100644 bedhost/templates/page.html create mode 100644 docs/README.md create mode 100644 docs/about.md create mode 100644 docs/guide.md create mode 100644 docs/index.md diff --git a/bedhost/const.py b/bedhost/const.py index bb061416..bd29c720 100644 --- a/bedhost/const.py +++ b/bedhost/const.py @@ -27,7 +27,7 @@ TEMPLATES_PATH = os.path.join( os.path.dirname(os.path.abspath(__file__)), TEMPLATES_DIRNAME ) -STATIC_DIRNAME = "static" +STATIC_DIRNAME = "../docs" STATIC_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), STATIC_DIRNAME) UI_PATH = os.path.join(os.path.dirname(__file__), "static", "bedhost-ui") diff --git a/bedhost/helpers.py b/bedhost/helpers.py index 536088d9..739350e8 100644 --- a/bedhost/helpers.py +++ b/bedhost/helpers.py @@ -2,7 +2,7 @@ from bbconf import BedBaseConf from fastapi.staticfiles import StaticFiles -from starlette.responses import FileResponse, RedirectResponse +from starlette.responses import FileResponse, RedirectResponse, JSONResponse from typing import List, Union from urllib import parse @@ -236,3 +236,9 @@ def configure(bbconf_file_path, app): f"Using remote files for serving. 
Prefix: {bbc.config[CFG_REMOTE_KEY]['http']['prefix']}" ) return bbc + + +def drs_response(status_code, msg): + """Helper function to make quick DRS responses""" + content = {"status_code": status_code, "msg": msg} + return JSONResponse(status_code=status_code, content=content) diff --git a/bedhost/main.py b/bedhost/main.py index ffa9b871..db2c554d 100644 --- a/bedhost/main.py +++ b/bedhost/main.py @@ -4,7 +4,7 @@ from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse +from fastapi.responses import JSONResponse, HTMLResponse from typing import Dict from urllib.parse import urlparse from fastapi import Response, HTTPException @@ -14,7 +14,13 @@ from . import _LOGGER -from .helpers import FileResponse, configure, attach_routers, get_openapi_version +from .helpers import ( + FileResponse, + configure, + attach_routers, + get_openapi_version, + drs_response, +) from .cli import build_parser from .const import ( ALL_VERSIONS, @@ -26,11 +32,36 @@ SERVER_VERSION, ) +tags_metadata = [ + { + "name": "home", + "description": "General landing page and service info", + }, + { + "name": "objects", + "description": "Download BED files or BEDSET files via [GA4GH DRS standard](https://ga4gh.github.io/data-repository-service-schemas/). 
For details, see [BEDbase Developer Guide](/docs/guide).", + + }, + { + "name": "bed", + "description": "Endpoints for retrieving metadata for BED records", + }, + { + "name": "bedset", + "description": "Endpoints for retrieving metadata for BEDSET records", + }, + { + "name": "search", + "description": "Discovery-oriented endpoints for finding records of interest", + }, +] + app = FastAPI( title=PKG_NAME, description="BED file/sets statistics and image server API", version=SERVER_VERSION, docs_url="/docs", + openapi_tags=tags_metadata, ) origins = [ @@ -49,16 +80,34 @@ allow_headers=["*"], ) +import markdown +from fastapi.templating import Jinja2Templates +templates = Jinja2Templates(directory="bedhost/templates", autoescape=False) -@app.get("/", summary="API intro page", tags=["General endpoints"]) -async def index(): +@app.get("/", summary="API intro page", tags=["home"]) +async def index(request: Request): """ Display the index UI page """ - return FileResponse(os.path.join(STATIC_PATH, "index.html")) + return render_markdown("index.md", request) + + +@app.get("/docs/changelog", summary="Release notes", response_class=HTMLResponse, tags=["home"]) +async def changelog(request: Request): + return render_markdown("changelog.md", request) +@app.get("/docs/guide", summary="Developer guide", response_class=HTMLResponse, tags=["home"]) +async def guide(request: Request): + return render_markdown("guide.md", request) -@app.get("/service-info", summary="GA4GH service info", tags=["General endpoints"]) +def render_markdown(filename: str, request: Request): + with open(os.path.join(STATIC_PATH, filename), "r", encoding="utf-8") as input_file: + text = input_file.read() + content = markdown.markdown(text) + return templates.TemplateResponse("page.html", {"request": request, "content": content}) + + +@app.get("/service-info", summary="GA4GH service info", tags=["home"]) async def service_info(): """ Returns information about this service, such as versions, name, etc. 
@@ -86,14 +135,10 @@ async def service_info(): } return JSONResponse(content=ret) - -DRS_ENDPOINTS_LABEL = "objects -- download files via DRS" - - @app.get( "/objects/{object_id}", summary="Get DRS object metadata", - tags=[DRS_ENDPOINTS_LABEL], + tags=["objects"], ) async def get_drs_object_metadata(object_id: str, req: Request): """ @@ -108,8 +153,8 @@ async def get_drs_object_metadata(object_id: str, req: Request): @app.get( "/objects/{object_id}/access/{access_id}", - summary="Get URL where you can retrive files", - tags=[DRS_ENDPOINTS_LABEL], + summary="Get URL where you can retrieve files", + tags=["objects"], ) async def get_object_bytes_url(object_id: str, access_id: str): """ @@ -127,7 +172,7 @@ async def get_object_bytes_url(object_id: str, access_id: str): @app.get( "/objects/{object_id}/access/{access_id}/bytes", summary="Download actual files", - tags=[DRS_ENDPOINTS_LABEL], + tags=["objects"], ) async def get_object_bytes(object_id: str, access_id: str): """ @@ -144,7 +189,7 @@ async def get_object_bytes(object_id: str, access_id: str): @app.get( "/objects/{object_id}/access/{access_id}/thumbnail", summary="Download thumbnail", - tags=[DRS_ENDPOINTS_LABEL], + tags=["objects"], ) async def get_object_thumbnail(object_id: str, access_id: str): """ @@ -168,7 +213,13 @@ def parse_bedbase_drs_object_id(object_id: str): """ Parse bedbase object id into its components """ - record_type, record_id, result_id = object_id.split(".") + try: + record_type, record_id, result_id = object_id.split(".") + except ValueError: + raise HTTPException( + status_code=400, + detail=f"Object ID {object_id} is malformed. 
Should be of the form ..", + ) if record_type not in ["bed", "bedset"]: raise HTTPException( status_code=400, detail=f"Object type {record_type} is incorrect" @@ -183,43 +234,29 @@ def parse_bedbase_drs_object_id(object_id: str): # General-purpose exception handlers (so we don't have to write try/catch blocks in every endpoint) @app.exception_handler(MissingThumbnailError) -async def exception_handler_MissingThumbnailError( - request: Request, exc: MissingThumbnailError -): - return JSONResponse( - status_code=404, - content={"msg": "No thumbnail for this object.", "status_code": 404}, - ) +async def exc_handler_MissingThumbnailError(req: Request, exc: MissingThumbnailError): + return drs_response(404, "No thumbnail for this object.") -@app.exception_handler(IncorrectAccessMethodError) -async def exception_handler_IncorrectAccessMethodError( - request: Request, exc: IncorrectAccessMethodError -): - return JSONResponse( - status_code=404, - content={"msg": "Requested access URL was not found.", "status_code": 404}, - ) +@app.exception_handler(BadAccessMethodError) +async def exc_handler_BadAccessMethodError(req: Request, exc: BadAccessMethodError): + return drs_response(404, "Requested access URL was not found.") @app.exception_handler(ColumnNotFoundError) -async def exception_handler_ColumnNotFoundError( - request: Request, exc: ColumnNotFoundError -): - return JSONResponse( - status_code=404, - content={"msg": "Malformed result identifier.", "status_code": 404}, - ) +async def exc_handler_ColumnNotFoundError(req: Request, exc: ColumnNotFoundError): + _LOGGER.error(f"ColumnNotFoundError: {exc}") + return drs_response(404, "Malformed result identifier.") @app.exception_handler(RecordNotFoundError) -async def exception_handler_RecordNotFoundError( - request: Request, exc: RecordNotFoundError -): - return JSONResponse( - status_code=404, - content={"msg": "Record not found.", "status_code": 404}, - ) +async def exc_handler_RecordNotFoundError(req: Request, exc: 
RecordNotFoundError): + return drs_response(404, "Record not found.") + + +@app.exception_handler(MissingObjectError) +async def exc_handler_MissingObjectError(req: Request, exc: MissingObjectError): + return drs_response(404, "Object not found.") def main(): diff --git a/bedhost/routers/bed_api.py b/bedhost/routers/bed_api.py index 6028a0d6..50b0d874 100644 --- a/bedhost/routers/bed_api.py +++ b/bedhost/routers/bed_api.py @@ -38,32 +38,32 @@ async def get_bed_genome_assemblies(): """ Returns available genome assemblies in the database """ - return bbc.bed.retrieve_distinct(columns=["genome"]) + return bbc.bed.select_distinct(columns=["genome"]) -@router.get("/count", response_model=int) -async def get_bedfile_count(): +@router.get("/count", summary="Number of BED records in the database", response_model=int) +async def count_bed_record(): """ - Returns the number of bedfiles available in the database + Returns the number of bed records available in the database """ return int(bbc.bed.record_count) -@router.get("/schema", response_model=Dict) +@router.get("/schema", summary="Schema for BED records", response_model=Dict) async def get_bed_schema(): """ - Get bedfiles pipestat schema + Get pipestat schema for BED records used by this database """ - d = bbc.bed.schema.original_schema + d = bbc.bed.schema.resolved_schema return d -@router.get("/example") -async def get_bed_example(): +@router.get("/example", summary="Get metadata for an example BED record", response_model=Dict) +async def get_example_bed_record(): return bbc.bed.select_records(limit=1)["records"][0] -@router.get("/list", summary="List all bedfiles, paged.") +@router.get("/list", summary="Paged list of all BED records") async def list_beds(limit: int = 1000, token: str = None): """ To get the first page, leave token field empty. 
The response will include a @@ -73,17 +73,7 @@ async def list_beds(limit: int = 1000, token: str = None): return x -@router.get("/{record_id}/{result_id}/{access_id}") -async def get_bed_object_uri(record_id: str, result_id: str, access_id: str): - try: - record_type = "bed" - path = bbc.get_object_uri(record_type, record_id, result_id, access_id) - return Response(path, media_type="text/plain") - except RecordNotFoundError as e: - raise HTTPException(status_code=404, detail="Record not found") - - -@router.get("/{bed_id}/metadata", response_model=DBResponse) +@router.get("/{bed_id}/metadata", summary="Get metadata for a single BED record") async def get_bed_metadata( bed_id: str = BedDigest, attr_id: Optional[str] = Query( @@ -91,11 +81,11 @@ async def get_bed_metadata( ), ): """ - Returns metadata from selected columns for selected bedfile + Returns metadata from selected columns for selected BED record """ # TODO: should accept a list of columns try: - values = bbc.bed.retrieve(bed_id, attr_id) + values = bbc.bed.retrieve_one(bed_id, attr_id) if not isinstance(values, dict) or attr_id: values = { attr_id: values, @@ -109,74 +99,11 @@ async def get_bed_metadata( _LOGGER.warning("No records matched the query") colnames = [] values = [[]] + return values return {"columns": colnames, "data": values} -# UCSC tool expects a head respond. 
So we added head request -@router.head("/{bed_id}/file/{file_id}", include_in_schema=False) -@router.get("/{bed_id}/file/{file_id}", include_in_schema=False) -async def get_bytes_of_bedfile( - bed_id: str, - file_id: str, -): - res = bbc.bed_retrieve(bed_id, file_id) - path = bbc.get_prefixed_uri(res["path"]) - return bbc.serve_file(path) - - -@router.get("/{bed_id}/file_path/{file_id}", include_in_schema=False) -async def get_uri_for_bedfile( - bed_id: str, - file_id: str, - remote_class: RemoteClassEnum = Query( - RemoteClassEnum("http"), description="Remote data provider class" - ), -): - try: - res = bbc.bed.retrieve(bed_id, file_id) - except KeyError: - raise HTTPException(status_code=404, detail="Record or attribute not found") - - path = bbc.get_prefixed_uri(res["path"], remote_class.value) - return Response(path, media_type="text/plain") - - -@router.get("/{bed_id}/img/{image_id}", include_in_schema=False) -async def get_image_for_bedfile( - bed_id: str, - image_id: str, - format: FIG_FORMAT = Query("pdf", description="Figure file format"), -): - """ - Returns the specified image associated with the specified bed file. 
- """ - img = bbc.bed_retrieve(bed_id, image_id) - identifier = img["path" if format == "pdf" else "thumbnail_path"] - path = bbc.get_prefixed_uri(identifier) - return bbc.serve_file(path) - - -@router.get("/{bed_id}/img_path/{image_id}", include_in_schema=False) -async def get_image_path_for_bedfile( - bed_id: str, - image_id: str, - format: Annotated[ - Optional[FIG_FORMAT], Query(description="Figure file format") - ] = "pdf", - remote_class: Annotated[ - Optional[RemoteClassEnum], Query(description="Remote data provider class") - ] = RemoteClassEnum("http"), -): - """ - Returns the bedfile plot with provided ID in provided format - """ - img = bbc.bed_retrieve(bed_id, image_id) - identifier = img["path" if format == "pdf" else "thumbnail_path"] - path = bbc.get_prefixed_uri(identifier, remote_class.value) - return Response(path, media_type="text/plain") - - -@router.get("/{bed_id}/regions/{chr_num}", response_class=PlainTextResponse) +@router.get("/{bed_id}/regions/{chr_num}", summary="Get regions from a BED file that overlap a query region.", response_class=PlainTextResponse) def get_regions_for_bedfile( bed_id: str = BedDigest, chr_num: str = chromosome_number, @@ -190,23 +117,15 @@ def get_regions_for_bedfile( """ Returns the queried regions with provided ID and optional query parameters """ - hit = bbc.bed.retrieve(record_identifier=bed_id, result_identifier="bigbedfile") - if isinstance(hit, dict): - file = hit.get("bigbedfile") + res = bbc.bed.retrieve_one(bed_id, "bigbedfile") + if isinstance(res, dict): + file = res.get("bigbedfile") else: raise HTTPException( status_code=404, detail="ERROR: bigBed file doesn't exists. Can't query." 
) - remote = True if CFG_REMOTE_KEY in bbc.config else False - - path = ( - os.path.join(bbc.config[CFG_REMOTE_KEY]["http"]["prefix"], file["path"]) - if remote - else os.path.join( - bbc.config[CFG_PATH_KEY][CFG_PATH_PIPELINE_OUTPUT_KEY], file["path"] - ) - ) - + path = bbc.get_prefixed_uri(res["path"], access_id="http") + _LOGGER.debug(path) cmd = ["bigBedToBed"] if chr_num: cmd.append(f"-chrom={chr_num}") @@ -238,113 +157,3 @@ def get_regions_for_bedfile( raise HTTPException( status_code=500, detail="ERROR: bigBedToBed is not installed." ) - - -@router.get( - "/search_by_genome_coordinates/regions/{chr_num}/{start}/{end}", - response_model=DBResponse, - include_in_schema=True, -) -async def get_regions_for_bedfile( - start: Annotated[int, Path(description="start coordinate", example=1103243)], - end: Annotated[int, Path(description="end coordinate", example=2103332)], - chr_num: str = chromosome_number, -): - """ - Returns the list of BED files have regions overlapping given genome coordinates - """ - with tempfile.NamedTemporaryFile(mode="w+") as f: - f.write(f"{chr_num}\t{start}\t{end}\n") - - bed_files = await get_all_bed_metadata( - ids=["name", "record_identifier", "bedfile"] - ) - - colnames = ["name", "record_identifier", "overlapped_regions"] - values = [] - for bed in bed_files["data"]: - name = bed[0] - bed_id = bed[1] - remote = True if CFG_REMOTE_KEY in bbc.config else False - path = ( - os.path.join( - bbc.config[CFG_REMOTE_KEY]["http"]["prefix"], bed[2]["path"] - ) - if remote - else os.path.join( - bbc.config[CFG_PATH_KEY][CFG_PATH_PIPELINE_OUTPUT_KEY], - bed[2]["path"], - ) - ) - - cmd = [ - "bedIntersect", - f.name, - path, - "stdout", - ] - - _LOGGER.info(f"Command: {' '.join(map(str, cmd))} | wc -l") - - try: - ct_process = subprocess.Popen( - ["wc", "-l"], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - universal_newlines=True, - ) - - subprocess.Popen( - cmd, - stdout=ct_process.stdin, - text=True, - ) - if 
int(ct_process.communicate()[0].rstrip("\n")) != 0: - values.append( - [name, bed_id, int(ct_process.communicate()[0].rstrip("\n"))] - ) - - except FileNotFoundError: - _LOGGER.warning("bedIntersect is not installed.") - raise HTTPException( - status_code=500, detail="ERROR: bedIntersect is not installed." - ) - return {"columns": colnames, "data": values} - - -# should it be deleted, or disabled for public use? -@router.get("/all/metadata") -async def get_all_bed_metadata( - ids: Annotated[ - Optional[List[str]], Query(description="Bedfiles table column name") - ] = None, - limit: Annotated[ - Optional[int], Query(description="number of rows returned by the query") - ] = 10, -): - """ - Get bedfiles metadata for selected columns - """ - if ids and "record_identifier" not in ids: - ids.append("record_identifier") - try: - # TODO: remove backend dependency - res = bbc.bed.backend.select(columns=ids, limit=limit) - except AttributeError: - raise HTTPException( - status_code=404, detail=f"Table results for {ids} not found" - ) - - if res: - if ids: - colnames = ids - values = [list(x) if isinstance(x, tuple) else list(x) for x in res] - else: - colnames = list(res[0].__dict__.keys())[1:] - values = [list(x.__dict__.values())[1:] for x in res] - else: - _LOGGER.warning(f"No records matched the query") - return {"columns": [], "data": [[]]} - - _LOGGER.debug(f"Serving data for columns: {colnames}") - return {"columns": colnames, "data": values} diff --git a/bedhost/templates/page.html b/bedhost/templates/page.html new file mode 100644 index 00000000..f1e3dea7 --- /dev/null +++ b/bedhost/templates/page.html @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + BEDbase API + + + + +
+
+ {{ content }} +
+
+
+
+
+ databio + databio + databio +
+
+
+ + + diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..6abffb6e --- /dev/null +++ b/docs/README.md @@ -0,0 +1,3 @@ +# Docs + +These markdown files are hosted by the service to make some simple documentation for the API. \ No newline at end of file diff --git a/docs/about.md b/docs/about.md new file mode 100644 index 00000000..529dfcfc --- /dev/null +++ b/docs/about.md @@ -0,0 +1,6 @@ +# about + +testing about + +[here is a link](/docs) + diff --git a/docs/guide.md b/docs/guide.md new file mode 100644 index 00000000..1c8899c6 --- /dev/null +++ b/docs/guide.md @@ -0,0 +1,35 @@ +# Developer Guide + +## Introduction + +### Data types + +BEDbase stores two types of data, which we call *records*. They are 1. BEDs, and 2. BEDsets. BEDsets are simply collections of BEDs. Each record in the database is either a BED or a BEDset. + +### Endpoint organization + +The endpoints are divided into 3 groups: + +1. `/bed` endpoints are used to interact with metadata for BED records. +2. `/bedset` endpoints are used to interact with metadata for BEDset records. +3. `/objects` endpoints are used to download metadata and get URLs to retrieve the underlying data itself. These endpoints implement the [GA4GH DRS standard](https://ga4gh.github.io/data-repository-service-schemas/). + +Therefore, to get information and statistics about BED or BEDset records, or what is contained in the database, look through the `/bed` and `/bedset` endpoints. But if you need to write a tool that gets the actual underlying files, then you'll need to use the `/objects` endpoints. The type of identifier used in each case differs. + +## Record identifiers vs. object identifiers + +Each record has an identifier. For example, `eaf9ee97241f300f1c7e76e1f945141f` is a BED identifier. You can use this identifier for the metadata endpoints. To download files, you'll need something slightly different -- you need an *object identifier*. 
This is because each BED record includes multiple files, such as the original BED file, the BigBed file, analysis plots, and so on. To download a file, you will construct what we call the `object_id`, which identifies the specific file. + +## How to construct object identifiers + +Object IDs take the form `<record_type>.<record_id>.<result_id>`. An example of an object_id for a BED file is `bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile` + +So, you can get information about this object like this: + +`GET` [/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile](/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile) + +Or, you can get a URL to download the actual file with: + +`GET` [/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http](/objects/bed.eaf9ee97241f300f1c7e76e1f945141f.bedfile/access/http) + + diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..1b6ebf05 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,9 @@ +# BEDbase API + +Welcome to the BEDbase API. You might be looking for: + +- [API OpenAPI documentation](/docs) +- [BEDbase API changelog](/docs/changelog) +- [Developer Guide and FAQ](/docs/guide) +- [bedbase.org user interface](https://bedbase.org) +- [Sheffield lab of computational biology](https://databio.org)