Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/improve api #13

Merged
merged 4 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 68 additions & 54 deletions challenge/api.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,13 @@
import sys
from datetime import datetime, timezone

import fastapi
import pandas as pd
from fastapi import HTTPException
from pydantic import BaseModel
from pydantic import BaseModel, validator

from challenge.model import DelayModel


def print_to_file(whatever: object) -> None:
    """Append the printed form of ``whatever`` to ``file.txt``.

    Debug helper: the value is rendered exactly as ``print`` renders it,
    followed by a newline, and appended to ``file.txt`` in the current
    working directory.

    Args:
        whatever: Any object; it is formatted by ``print``.
    """
    # Point ``print`` at the file handle instead of binding the handle to
    # ``sys.stdout``. Using ``with open(...) as sys.stdout`` would leave the
    # process-wide ``sys.stdout`` referring to a closed file once the block
    # exits, breaking every later ``print`` elsewhere in the program.
    with open("file.txt", "a") as log_file:
        print(whatever, file=log_file)


valid_opera_values = [
VALID_OPERA_VALUES = [
"american airlines",
"air canada",
"air france",
Expand All @@ -39,90 +33,110 @@ def print_to_file(whatever: any):
"lacsa",
]

valid_tipo_vuelo_values = [
VALID_TIPO_VUELO_VALUES = [
"I",
"N",
]

valid_mes_values = range(1, 13)


def valid_tipo_vuelo(tipo_vuelo: str) -> bool:
    """Tell whether ``tipo_vuelo`` appears in ``valid_tipo_vuelo_values``.

    The comparison is exact; callers are responsible for any case
    normalisation before calling.
    """
    is_accepted = tipo_vuelo in valid_tipo_vuelo_values
    return is_accepted
VALID_MES_VALUES = range(1, 13)


def valid_opera(opera: str) -> bool:
    """Tell whether ``opera`` appears in ``valid_opera_values``.

    The comparison is exact; callers are responsible for any case
    normalisation before calling.
    """
    is_accepted = opera in valid_opera_values
    return is_accepted


def valid_mes(mes_value: int) -> bool:
    """Tell whether ``mes_value`` is contained in ``valid_mes_values``."""
    is_accepted = mes_value in valid_mes_values
    return is_accepted
app = fastapi.FastAPI()
model = DelayModel()
model.load_model("models")


class Flight(BaseModel):
OPERA: str
TIPOVUELO: str
MES: int


class FlightData(BaseModel):
flights: list[Flight]


app = fastapi.FastAPI()
model = DelayModel()
model.load_model("models")


def flight_data_to_pandas(flight_data: FlightData) -> pd.DataFrame:
flight_data_dict = {"OPERA": [], "TIPOVUELO": [], "MES": []}
for elem in flight_data.flights:
if not valid_opera(elem.OPERA.lower()):
@validator("OPERA")
def valid_opera(cls, opera_value: str):
if opera_value.lower() not in VALID_OPERA_VALUES:
raise HTTPException(
status_code=400,
detail=(
f"Value for tipo vuelo not valid. Recieved {elem.OPERA},"
f" expected one from {[v for v in valid_opera_values]}"
f"Value for tipo vuelo not valid. Recieved {opera_value}, "
f"expected one from {VALID_OPERA_VALUES}"
),
)
if not valid_tipo_vuelo(elem.TIPOVUELO.capitalize()):
return opera_value

@validator("TIPOVUELO")
def valid_tipo_vuelo(cls, tipo_vuelo_value: str):
if tipo_vuelo_value.capitalize() not in VALID_TIPO_VUELO_VALUES:
raise HTTPException(
status_code=400,
detail=(
f"Value for tipo vuelo not valid. Recieved {elem.TIPOVUELO},"
f" expected one from {[v for v in valid_tipo_vuelo_values]}"
f"Value for tipo vuelo not valid. Recieved {tipo_vuelo_value}, "
f"expected one from {VALID_TIPO_VUELO_VALUES}"
),
)
if not valid_mes(elem.MES):
return tipo_vuelo_value

@validator("MES")
def valid_mes(cls, mes_value: int):
if mes_value not in VALID_MES_VALUES:
raise HTTPException(
status_code=400,
detail=(
f"Value for tipo vuelo not valid. Recieved {elem.MES},"
f" expected one from {valid_mes_values}"
f"Value for tipo vuelo not valid. Recieved {mes_value}, "
f"expected one from {VALID_MES_VALUES}"
),
)
return mes_value


class FlightData(BaseModel):
    """Container model holding the list of flights sent in one request."""

    # One ``Flight`` entry per flight to be scored.
    flights: list[Flight]


def flight_data_to_pandas(flight_data: FlightData) -> pd.DataFrame:
    """Turn a batch of flights into a column-oriented DataFrame.

    Args:
        flight_data: Object whose ``flights`` attribute is the list of
            flights; each flight exposes ``OPERA``, ``TIPOVUELO`` and ``MES``.

    Returns:
        A DataFrame with the columns ``OPERA``, ``TIPOVUELO`` and ``MES`` (in
        that order) and one row per flight, in input order.
    """
    entries = flight_data.flights
    return pd.DataFrame(
        {
            "OPERA": [entry.OPERA for entry in entries],
            "TIPOVUELO": [entry.TIPOVUELO for entry in entries],
            "MES": [entry.MES for entry in entries],
        }
    )


@app.get("/", status_code=200)
async def root() -> dict:
    """Serve the welcome message that points clients at the other endpoints."""
    welcome_text = (
        "welcome to the api for predicting flight delay. Use the /health "
        "endpoint to get server status, and the /predict endpoint to get your "
        "prediction from input data."
    )
    return {"message": welcome_text}


@app.get("/health", status_code=200)
async def get_health() -> dict:
    """Health-check endpoint: always answers with a fixed ``OK`` status."""
    payload = {"status": "OK"}
    return payload


@app.post("/predict", status_code=200)
async def post_predict(flight_data: FlightData) -> dict:
# get data and convert to pandas dataframe

flight_data_df = flight_data_to_pandas(flight_data)
preprocessed_data = model.preprocess(flight_data_df)

column_order = model._model.feature_names_in_
preprocessed_data = preprocessed_data[column_order]

pred = model.predict(preprocessed_data)

return {"predict": pred}
try:
# get data and convert to pandas dataframe
flight_data_df = flight_data_to_pandas(flight_data)
preprocessed_data = model.preprocess(flight_data_df)

# sorts column to feed the model
column_order = model._model.feature_names_in_
preprocessed_data = preprocessed_data[column_order]

pred = model.predict(preprocessed_data)

return {"predict": pred}
except Exception as e:
# there may be exceptions we don't want to send to the clients, so log them in
# an internal file for debugging. Just as a cheap solution.
with open("error_logs.txt", "a") as f:
f.write(f"{datetime.now(timezone.utc)}: encounter error {e}")
raise HTTPException(
status_code=500, detail="Internal server error during prediction"
)
22 changes: 22 additions & 0 deletions docs/challenge.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,28 @@ number of trees in xgboost). Also, it has the advantage that we can limit
ourselves to only one framework (scikit-learn), and have fewer incompatibility
issues when trying to move our model to production.

## Part II - API development

Developed an api to serve the model's predictions properly.

There is a welcome message at the root (`/`) entry-point, a health status check
at the `/health` entry-point, and the prediction service at the `/predict`
entry-point.

This API expects a directory named `models/` in the directory it is executed
from, where the model object will look for a `model.pkl` file, which stores a
trained instance of the selected model.

Notice that the API mostly manages the reception of information and does little
processing, i.e. it converts the input list of flights into a pandas dataframe.

Also, during the prediction stage, where an error may occur, I've decided not
to report the error directly to the client, but to log it in an internal file
and return a 500 error. This is not scalable; it's just an ad-hoc solution to
prevent unwanted information from leaking to the client side of the API. A lot
more information could be written to the log, and it could be done with a
proper logging library, but this is just to showcase the guardrail that needs
to be there.

## Part III - Deployment to Cloud

A first step for deploying to the cloud is to build a Dockerfile for our application
Expand Down
Loading