Skip to content

Commit

Permalink
feat: add lm_eval extension (#345)
Browse files Browse the repository at this point in the history
Signed-off-by: Radek Ježek <[email protected]>
  • Loading branch information
jezekra1 authored Apr 12, 2024
1 parent eb6bc17 commit 9a97422
Show file tree
Hide file tree
Showing 13 changed files with 1,977 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
- name: Install dependencies
run: |
if [[ $(python -c "import sys; print(sys.version_info[:2] >= (3, 12))") == "True" ]]; then
poetry install --no-interaction -E llama-index -E localserver -E langchain
poetry install --no-interaction -E llama-index -E localserver -E langchain -E lm-eval
else
poetry install --no-interaction --all-extras
fi
Expand Down
9 changes: 9 additions & 0 deletions examples/extensions/lm_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""
lm_eval
.. admonition:: Before you start
:class: important
To use the following extension, first install it by running
:bash:`pip install 'ibm-generative-ai[lm-eval]'`.
"""
33 changes: 33 additions & 0 deletions examples/extensions/lm_eval/lm_eval_cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
lm-evaluation-harness CLI usage
The recommended way to run benchmarks is through CLI.
In your python environment with 'ibm-generative-ai[lm-eval]' installed:
Example::
python -m genai.extensions.lm_eval \\
--model="ibm_genai" \\
--model_args="model_id=tiiuae/falcon-40b,temperature=0" \\
--task="hellaswag" \\
--num_fewshot=10 \\
--output_path="falcon-40b_hellaswag.json"
"""

import subprocess
import sys

# Run the lm-evaluation-harness CLI exposed by the SDK extension in a child process.
# sys.executable is used instead of a bare "python" so the child runs under the same
# interpreter (and therefore the same virtual environment) as this script; a bare
# "python" is looked up on PATH and may be absent or point at a different installation.
subprocess.run(
    [
        sys.executable,
        "-m",
        "genai.extensions.lm_eval",
        "--model=ibm_genai",
        "--model_args=model_id=tiiuae/falcon-40b,temperature=0",
        "--task=hellaswag",
        "--num_fewshot=10",
        "--limit=10",  # WARNING: only for debug purposes, remove for full testing dataset
    ],
    check=True,  # raise CalledProcessError when the child exits with a non-zero status
    text=True,
    capture_output=False,  # child output goes straight to this process's stdout/stderr
)
46 changes: 46 additions & 0 deletions examples/extensions/lm_eval/lm_eval_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""
lm-evaluation-harness advanced usage
Use lm-evaluation extension from code to have additional control over concurrency or execution options
Note:
This is for advanced usage only, use CLI in most cases (lm_eval_cli example)
"""

import logging
from pprint import pprint

from dotenv import load_dotenv
from lm_eval import simple_evaluate

from genai import Client, Credentials
from genai.extensions.lm_eval.model import IBMGenAILMEval
from genai.schema import TextGenerationParameters

# Load variables from a local .env file before credentials are read from the environment.
load_dotenv()

# Only show warnings and above from the HTTP client and the SDK loggers.
for _logger_name in ("httpx", "genai"):
    logging.getLogger(_logger_name).setLevel(logging.WARN)

# Benchmark settings
model_id = "tiiuae/falcon-40b"
task_name = "arc_challenge"
num_fewshot = 25
limit = 10  # WARNING: only for debug purposes, set None for full testing dataset

# Transport retries are raised to 999 (presumably so a long benchmark run survives
# transient API failures - confirm against the SDK's transport options).
_api_client_config = {"transport_options": {"retries": 999}}
client = Client(
    credentials=Credentials.from_env(),
    config={"api_client_config": _api_client_config},
)

# Adapter object handed to lm_eval; generation uses temperature 0.
model = IBMGenAILMEval(
    client=client,
    model_id=model_id,
    show_progressbar=True,
    parameters=TextGenerationParameters(temperature=0),
)

results = simple_evaluate(
    model,
    tasks=[task_name],
    num_fewshot=num_fewshot,
    log_samples=False,
    limit=limit,
)

# Set the "config" entry of the results to describe the model and the run limits.
results["config"] = {
    "model": model_id,
    "use_cache": False,
    "limit": limit,
    "model_kwargs": model.dump_parameters(),
}

pprint(results)
863 changes: 861 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ llama-index-core = { version = "^0.10.0", optional = true }
uvicorn = { version = "^0.22.0", optional = true }
fastapi = { version = "^0.100.0", optional = true }
deprecated = "^1.2.14"
lm-eval = { version = "^0.4.2", optional = true }
tqdm = { version = "^4.66.1", optional = true }

[tool.black] # left for IDE compatibility (pycharm)
line-length = 120
Expand Down Expand Up @@ -114,10 +116,12 @@ pytest-httpx = "^0.30.0"
langchain = ["langchain-core", "pyyaml"]
huggingface = ["datasets", "transformers"]
llama-index = ["llama-index-core"]
lm-eval = ["lm-eval", "tqdm"]
localserver = ["uvicorn", "fastapi"]

[tool.pytest.ini_options]
addopts = "--cov --cov-report term-missing --cov-fail-under 80 -v"
testpaths = ["tests"]
markers = [
"unit",
"integration",
Expand Down Expand Up @@ -151,6 +155,10 @@ warn_required_dynamic_aliases = true
help = "Install dependencies for the SDK core and its extensions"
cmd = "poetry install --all-extras --without dev"

[tool.poe.tasks.lm_eval]
help = "Run lm_eval with support for genai models"
cmd = "python -m genai.extensions.lm_eval"

[tool.poe.tasks.install-dev]
help = "Install dependencies and related tooling for development"
sequence = [{ cmd = "poetry install --all-extras" }, { cmd = "pre-commit install" }]
Expand Down
Empty file.
30 changes: 30 additions & 0 deletions src/genai/extensions/lm_eval/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import logging
import signal

from genai import handle_shutdown_event
from genai.extensions.lm_eval.model import initialize_model

try:
    # load dotenv if installed
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    # python-dotenv is optional: without it, nothing is loaded from a .env file
    pass


try:
    from lm_eval.__main__ import cli_evaluate
except ImportError as err:
    # Chain the original error so the traceback still shows which import actually failed.
    raise ImportError("Could not import lm_eval: Please install ibm-generative-ai[lm-eval] extension.") from err


# NOTE(review): presumably registers the "ibm_genai" model with lm_eval before the CLI
# parses its arguments - confirm in genai.extensions.lm_eval.model.
initialize_model()

# Route Ctrl+C and termination requests to the SDK's shutdown handler.
signal.signal(signal.SIGINT, handle_shutdown_event)
signal.signal(signal.SIGTERM, handle_shutdown_event)

# Only show warnings and above from the HTTP client and the SDK loggers.
logging.getLogger("httpx").setLevel(logging.WARN)
logging.getLogger("genai").setLevel(logging.WARN)

# Hand control to the upstream lm-evaluation-harness command-line entry point.
cli_evaluate()
Loading

0 comments on commit 9a97422

Please sign in to comment.