diff --git a/setup.py b/setup.py index 5fa82e5e..f9467af6 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,10 @@ 'jedi', 'tenacity', ], + 'test': [ + 'pytest', + 'pytest-cov', + ] }, include_package_data=True, ) \ No newline at end of file diff --git a/swebench/__init__.py b/swebench/__init__.py index 10b51570..e1216c6d 100644 --- a/swebench/__init__.py +++ b/swebench/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.1.7" +__version__ = "3.0.0" from swebench.collect.build_dataset import main as build_dataset from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline @@ -48,8 +48,7 @@ ) from swebench.harness.utils import ( - get_environment_yml, - get_requirements, + run_threadpool, ) from swebench.versioning.constants import ( @@ -59,9 +58,9 @@ from swebench.versioning.get_versions import ( get_version, - map_version_to_task_instances, get_versions_from_build, get_versions_from_web, + map_version_to_task_instances, ) from swebench.versioning.utils import ( diff --git a/swebench/harness/constants/__init__.py b/swebench/harness/constants/__init__.py new file mode 100644 index 00000000..ad4dcc8b --- /dev/null +++ b/swebench/harness/constants/__init__.py @@ -0,0 +1,21 @@ +from swebench.harness.constants.constants import * +from swebench.harness.constants.javascript import * +from swebench.harness.constants.python import * + +MAP_REPO_VERSION_TO_SPECS = { + **MAP_REPO_VERSION_TO_SPECS_JS, + **MAP_REPO_VERSION_TO_SPECS_PY, +} + +MAP_REPO_TO_INSTALL = { + **MAP_REPO_TO_INSTALL_JS, + **MAP_REPO_TO_INSTALL_PY, +} + +MAP_REPO_TO_EXT = { + **{k: "js" for k in MAP_REPO_VERSION_TO_SPECS_JS.keys()}, + **{k: "py" for k in MAP_REPO_VERSION_TO_SPECS_PY.keys()}, +} + +LATEST = "latest" +USE_X86 = USE_X86_PY diff --git a/swebench/harness/constants/constants.py b/swebench/harness/constants/constants.py new file mode 100644 index 00000000..39a7524a --- /dev/null +++ b/swebench/harness/constants/constants.py @@ -0,0 +1,113 @@ +from enum import Enum +from pathlib import Path +from typing import TypedDict + +# Constants - Evaluation Log Directories +BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base") +ENV_IMAGE_BUILD_DIR = Path("logs/build_images/env") +INSTANCE_IMAGE_BUILD_DIR = Path("logs/build_images/instances") +RUN_EVALUATION_LOG_DIR = Path("logs/run_evaluation") +RUN_VALIDATION_LOG_DIR = Path("logs/run_validation") + +# Constants - Task Instance Class +class SWEbenchInstance(TypedDict): + repo: str + instance_id: str + base_commit: str + patch: str + test_patch: str + problem_statement: str + hints_text: str + created_at: str + version: str + FAIL_TO_PASS: str + PASS_TO_PASS: str + environment_setup_commit: str + +# Constants - Test Types, Statuses, Commands +FAIL_TO_PASS = "FAIL_TO_PASS" +FAIL_TO_FAIL = "FAIL_TO_FAIL" +PASS_TO_PASS = "PASS_TO_PASS" +PASS_TO_FAIL = "PASS_TO_FAIL" + +class ResolvedStatus(Enum): + NO = "RESOLVED_NO" + PARTIAL = "RESOLVED_PARTIAL" + FULL = "RESOLVED_FULL" + +class TestStatus(Enum): + FAILED = "FAILED" + PASSED = "PASSED" + SKIPPED = "SKIPPED" + ERROR = "ERROR" + XFAIL = "XFAIL" + +class EvalType(Enum): + PASS_AND_FAIL = "pass_and_fail" + FAIL_ONLY = "fail_only" + +# Constants - Evaluation Keys +KEY_INSTANCE_ID = "instance_id" +KEY_MODEL = "model_name_or_path" +KEY_PREDICTION = "model_patch" + +# Constants - Harness +DOCKER_PATCH = "/tmp/patch.diff" +DOCKER_USER = "root" +DOCKER_WORKDIR = "/testbed" +LOG_REPORT = "report.json" +LOG_INSTANCE = "run_instance.log" +LOG_TEST_OUTPUT = "test_output.txt" +UTF8 = "utf-8" + +# Constants - Logging +APPLY_PATCH_FAIL = 
">>>>> Patch Apply Failed" +APPLY_PATCH_PASS = ">>>>> Applied Patch" +INSTALL_FAIL = ">>>>> Init Failed" +INSTALL_PASS = ">>>>> Init Succeeded" +INSTALL_TIMEOUT = ">>>>> Init Timed Out" +RESET_FAILED = ">>>>> Reset Failed" +TESTS_ERROR = ">>>>> Tests Errored" +TESTS_FAILED = ">>>>> Some Tests Failed" +TESTS_PASSED = ">>>>> All Tests Passed" +TESTS_TIMEOUT = ">>>>> Tests Timed Out" +START_TEST_OUTPUT = ">>>>> Start Test Output" +END_TEST_OUTPUT = ">>>>> End Test Output" + +# Constants - Patch Types +class PatchType(Enum): + PATCH_GOLD = "gold" + PATCH_PRED = "pred" + PATCH_PRED_TRY = "pred_try" + PATCH_PRED_MINIMAL = "pred_minimal" + PATCH_PRED_MINIMAL_TRY = "pred_minimal_try" + PATCH_TEST = "test" + + def __str__(self): + return self.value + +# Constants - Miscellaneous +NON_TEST_EXTS = [ + ".json", + ".png", + "csv", + ".txt", + ".md", + ".jpg", + ".jpeg", + ".pkl", + ".yml", + ".yaml", + ".toml", +] +SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" +DEFAULT_DOCKER_SPECS = { + "pnpm_version": "9.5.0", + "node_version": "21.6.2", + "python_version": "3.9", +} +FAIL_ONLY_REPOS = { + "chartjs/Chart.js", + "processing/p5.js", + "markedjs/marked", +} diff --git a/swebench/harness/constants/javascript.py b/swebench/harness/constants/javascript.py new file mode 100644 index 00000000..59b486a1 --- /dev/null +++ b/swebench/harness/constants/javascript.py @@ -0,0 +1,161 @@ +# Constants - Commonly Used Commands +TEST_XVFB_PREFIX = 'xvfb-run --server-args="-screen 0 1280x1024x24 -ac :99"' +XVFB_DEPS = [ + "python3", "python3-pip", "xvfb", "x11-xkb-utils", "xfonts-100dpi", + "xfonts-75dpi", "xfonts-scalable", "xfonts-cyrillic", "x11-apps", "firefox" +] +X11_DEPS = [ + "libx11-xcb1", "libxcomposite1", "libxcursor1", "libxdamage1", "libxi6", + "libxtst6", "libnss3", "libcups2", "libxss1", "libxrandr2", "libasound2", + "libatk1.0-0", "libgtk-3-0", "x11-utils", +] + +# Constants - Task Instance Installation Environment +SPECS_CALYPSO = { + **{k: { + "apt-pkgs": ["libsass-dev", "sassc"], + "install": ["npm install --unsafe-perm"], + "test_cmd": "npm run test-client", + "docker_specs": { + "node_version": k, + } + } for k in [ + '0.8', + '4.2.3', '4.3.0', + '5.10.1', '5.11.1', + '6.1.0', '6.7.0', '6.9.0', '6.9.1', '6.9.4', '6.10.0', '6.10.2', '6.10.3', '6.11.1', '6.11.2', '6.11.5', + '8.9.1', '8.9.3', '8.9.4', '8.11.0', '8.11.2', + '10.4.1', '10.5.0', '10.6.0', '10.9.0', '10.10.0', '10.12.0', '10.13.0', '10.14.0', '10.15.2', '10.16.3', + ]} +} + +TEST_CHART_JS_TEMPLATE = "./node_modules/.bin/cross-env NODE_ENV=test ./node_modules/.bin/karma start {} --single-run --coverage --grep --auto-watch false" +SPECS_CHART_JS = { + **{k: { + "install": [ + "pnpm install", + "pnpm run build", + ], + "test_cmd": [ + "pnpm install", + "pnpm run build", + f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.cjs')}\"" + ], + "docker_specs": { + "node_version": "21.6.2", + "pnpm_version": "7.9.0", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + }, + } for k in ['4.0', '4.1', '4.2', '4.3', '4.4']}, + **{k: { + "install": ["npm install"], + "test_cmd": [ + "npm install", + "npm run build", + f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.js')}\"" + ], + "docker_specs": { + "node_version": "21.6.2", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + } + } for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8']}, + **{k: { + "install": [ + "npm install", + "npm install -g gulp-cli" + ], + "test_cmd": [ + "npm install", + "gulp 
build", + TEST_XVFB_PREFIX + ' su chromeuser -c "gulp test"' + ], + "docker_specs": { + "node_version": "21.6.2", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + } + } for k in ['2.0', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9']} +} +for v in SPECS_CHART_JS.keys(): + SPECS_CHART_JS[v]["apt-pkgs"] = XVFB_DEPS + +SPECS_MARKED = { + **{k: { + "install": ["npm install"], + "test_cmd": "./node_modules/.bin/jasmine --no-color --config=jasmine.json", + "docker_specs": { + "node_version": "12.22.12", + } + } for k in [ + '0.3', '0.5', '0.6', '0.7', '1.0', '1.1', + '1.2', '2.0', '3.9', '4.0', '4.1', '5.0' + ]} +} +for v in ['4.0', '4.1', '5.0']: + SPECS_MARKED[v]["docker_specs"]["node_version"] = "20.16.0" + +SPECS_P5_JS = { + **{k: { + "apt-pkgs": X11_DEPS, + "install": [ + "npm install", + "PUPPETEER_SKIP_CHROMIUM_DOWNLOAD='' node node_modules/puppeteer/install.js", + "./node_modules/.bin/grunt yui", + ], + "test_cmd": ( + """sed -i 's/concurrency:[[:space:]]*[0-9][0-9]*/concurrency: 1/g' Gruntfile.js\n""" + "stdbuf -o 1M ./node_modules/.bin/grunt test --quiet --force" + ), + "docker_specs": { + "node_version": "14.17.3", + } + } for k in [ + "0.10", "0.2", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", + "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", + "1.8", "1.9", + ] + }, +} +for k in ['0.4', '0.5', '0.6',]: + SPECS_P5_JS[k]["install"] = [ + "npm install", + "./node_modules/.bin/grunt yui", + ] + +SPECS_REACT_PDF = { + **{k: { + "apt-pkgs": ["pkg-config", "build-essential", "libpixman-1-0", "libpixman-1-dev", "libcairo2-dev", "libpango1.0-dev", + "libjpeg-dev", "libgif-dev", "librsvg2-dev"] + X11_DEPS, + "install": [ + "npm i -g yarn", + "yarn install" + ], + "test_cmd": 'NODE_OPTIONS="--experimental-vm-modules" ./node_modules/.bin/jest --no-color', + "docker_specs": { + "node_version": "18.20.4" + } + } for k in ['1.0', '1.1', '1.2', '2.0']} +} +for v in ['1.0', '1.1', '1.2']: + SPECS_REACT_PDF[v]["docker_specs"]["node_version"] = "8.17.0" + SPECS_REACT_PDF[v]["install"] = [ + "npm install", + "npm install cheerio@1.0.0-rc.3" + ] + SPECS_REACT_PDF[v]["test_cmd"] = "./node_modules/.bin/jest --no-color" + +MAP_REPO_VERSION_TO_SPECS_JS = { + "Automattic/wp-calypso": SPECS_CALYPSO, + "chartjs/Chart.js": SPECS_CHART_JS, + "markedjs/marked": SPECS_MARKED, + "processing/p5.js": SPECS_P5_JS, + "diegomura/react-pdf": SPECS_REACT_PDF, +} + +# Constants - Repository Specific Installation Instructions +MAP_REPO_TO_INSTALL_JS = {} diff --git a/swebench/harness/constants.py b/swebench/harness/constants/python.py similarity index 94% rename from swebench/harness/constants.py rename to swebench/harness/constants/python.py index cc6e8252..c37d27f5 100644 --- a/swebench/harness/constants.py +++ b/swebench/harness/constants/python.py @@ -1,47 +1,4 @@ -from enum import Enum -from pathlib import Path -from typing import TypedDict - -# Constants - Evaluation Log Directories -BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base") -ENV_IMAGE_BUILD_DIR = Path("logs/build_images/env") -INSTANCE_IMAGE_BUILD_DIR = Path("logs/build_images/instances") -RUN_EVALUATION_LOG_DIR = Path("logs/run_evaluation") - -# Constants - Task Instance Class -class SWEbenchInstance(TypedDict): - repo: str - instance_id: str - base_commit: str - patch: str - test_patch: str - problem_statement: str - hints_text: str - created_at: str - version: str - FAIL_TO_PASS: str - PASS_TO_PASS: str - environment_setup_commit: str - - -# Constants - Test Types, Statuses, Commands -FAIL_TO_PASS = "FAIL_TO_PASS" -FAIL_TO_FAIL = 
"FAIL_TO_FAIL" -PASS_TO_PASS = "PASS_TO_PASS" -PASS_TO_FAIL = "PASS_TO_FAIL" - -class ResolvedStatus(Enum): - NO = "RESOLVED_NO" - PARTIAL = "RESOLVED_PARTIAL" - FULL = "RESOLVED_FULL" - -class TestStatus(Enum): - FAILED = "FAILED" - PASSED = "PASSED" - SKIPPED = "SKIPPED" - ERROR = "ERROR" - XFAIL = "XFAIL" - +# Constants - Testing Commands TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider" TEST_PYTEST_VERBOSE = "pytest -rA --tb=long -p no:cacheprovider" TEST_ASTROPY_PYTEST = "pytest -rA -vv -o console_output_style=classic --tb=no" @@ -912,7 +869,7 @@ class TestStatus(Enum): SPECS_HUMANEVAL = {k: {"python": "3.9", "test_cmd": "python"} for k in ["1.0"]} # Constants - Task Instance Instllation Environment -MAP_REPO_VERSION_TO_SPECS = { +MAP_REPO_VERSION_TO_SPECS_PY = { "astropy/astropy": SPECS_ASTROPY, "dbt-labs/dbt-core": SPECS_DBT_CORE, "django/django": SPECS_DJANGO, @@ -936,7 +893,7 @@ class TestStatus(Enum): } # Constants - Repository Specific Installation Instructions -MAP_REPO_TO_INSTALL = {} +MAP_REPO_TO_INSTALL_PY = {} # Constants - Task Instance Requirements File Paths @@ -961,65 +918,7 @@ class TestStatus(Enum): "pydata/xarray": ["ci/requirements/environment.yml", "environment.yml"], } - -# Constants - Evaluation Keys -KEY_INSTANCE_ID = "instance_id" -KEY_MODEL = "model_name_or_path" -KEY_PREDICTION = "model_patch" - - -# Constants - Harness -DOCKER_PATCH = "/tmp/patch.diff" -DOCKER_USER = "root" -DOCKER_WORKDIR = "/testbed" -LOG_REPORT = "report.json" -LOG_INSTANCE = "run_instance.log" -LOG_TEST_OUTPUT = "test_output.txt" -UTF8 = "utf-8" - - -# Constants - Logging -APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed" -APPLY_PATCH_PASS = ">>>>> Applied Patch" -INSTALL_FAIL = ">>>>> Init Failed" -INSTALL_PASS = ">>>>> Init Succeeded" -INSTALL_TIMEOUT = ">>>>> Init Timed Out" -RESET_FAILED = ">>>>> Reset Failed" -TESTS_ERROR = ">>>>> Tests Errored" -TESTS_FAILED = ">>>>> Some Tests Failed" -TESTS_PASSED = ">>>>> All Tests Passed" -TESTS_TIMEOUT = ">>>>> Tests Timed Out" - - -# Constants - Patch Types -class PatchType(Enum): - PATCH_GOLD = "gold" - PATCH_PRED = "pred" - PATCH_PRED_TRY = "pred_try" - PATCH_PRED_MINIMAL = "pred_minimal" - PATCH_PRED_MINIMAL_TRY = "pred_minimal_try" - PATCH_TEST = "test" - - def __str__(self): - return self.value - - -# Constants - Miscellaneous -NON_TEST_EXTS = [ - ".json", - ".png", - "csv", - ".txt", - ".md", - ".jpg", - ".jpeg", - ".pkl", - ".yml", - ".yaml", - ".toml", -] -SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" -USE_X86 = { +USE_X86_PY = { "astropy__astropy-7973", "django__django-10087", "django__django-10097", diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py index 58cadb4b..affc73de 100644 --- a/swebench/harness/docker_build.py +++ b/swebench/harness/docker_build.py @@ -5,28 +5,27 @@ import traceback import docker import docker.errors -from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor, as_completed + from pathlib import Path from swebench.harness.constants import ( - DOCKER_USER, BASE_IMAGE_BUILD_DIR, + DOCKER_USER, ENV_IMAGE_BUILD_DIR, INSTANCE_IMAGE_BUILD_DIR, - MAP_REPO_VERSION_TO_SPECS, UTF8, ) -from swebench.harness.test_spec import ( - get_test_specs_from_dataset, - make_test_spec, - TestSpec -) from swebench.harness.docker_utils import ( cleanup_container, remove_image, find_dependent_images ) +from swebench.harness.test_spec.test_spec import ( + get_test_specs_from_dataset, + make_test_spec, + TestSpec, +) +from swebench.harness.utils import run_threadpool 
ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") @@ -280,44 +279,18 @@ def build_env_images( return [], [] print(f"Total environment images to build: {len(configs_to_build)}") - # Build the environment images - successful, failed = list(), list() - with tqdm( - total=len(configs_to_build), smoothing=0, desc="Building environment images" - ) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for each image to build - futures = { - executor.submit( - build_image, - image_name, - {"setup_env.sh": config["setup_script"]}, - config["dockerfile"], - config["platform"], - client, - ENV_IMAGE_BUILD_DIR / image_name.replace(":", "__"), - ): image_name - for image_name, config in configs_to_build.items() - } - - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if image built successfully - future.result() - successful.append(futures[future]) - except BuildImageError as e: - print(f"BuildImageError {e.image_name}") - traceback.print_exc() - failed.append(futures[future]) - continue - except Exception: - print("Error building image") - traceback.print_exc() - failed.append(futures[future]) - continue - + args_list = list() + for image_name, config in configs_to_build.items(): + args_list.append(( + image_name, + {"setup_env.sh": config["setup_script"]}, + config["dockerfile"], + config["platform"], + client, + ENV_IMAGE_BUILD_DIR / image_name.replace(":", "__"), + )) + + successful, failed = run_threadpool(build_image, args_list, max_workers) # Show how many images failed to build if len(failed) == 0: print("All environment images built successfully.") @@ -332,7 +305,9 @@ def build_instance_images( client: docker.DockerClient, dataset: list, force_rebuild: bool = False, - max_workers: int = 4 + max_workers: int = 4, + namespace: str = None, + tag: str = None, ): """ Builds the instance images required for the dataset if they do not already exist. 
@@ -344,7 +319,7 @@ def build_instance_images( max_workers (int): Maximum number of workers to use for building images """ # Build environment images (and base images as needed) first - test_specs = list(map(make_test_spec, dataset)) + test_specs = list(map(lambda x: make_test_spec(x, namespace=namespace, instance_image_tag=tag), dataset)) if force_rebuild: for spec in test_specs: remove_image(client, spec.instance_image_key, "quiet") @@ -357,42 +332,11 @@ print(f"Skipping {len(dont_run_specs)} instances - due to failed env image builds") print(f"Building instance images for {len(test_specs)} instances") successful, failed = list(), list() - + + # `logger` is set to None b/c logger is created in build_instance_image + payloads = [(spec, client, None, False) for spec in test_specs] # Build the instance images - with tqdm( - total=len(test_specs), smoothing=0, desc="Building instance images" - ) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for each image to build - futures = { - executor.submit( - build_instance_image, - test_spec, - client, - None, # logger is created in build_instance_image, don't make loggers before you need them - False, - ): test_spec - for test_spec in test_specs - } - - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if image built successfully - future.result() - successful.append(futures[future]) - except BuildImageError as e: - print(f"BuildImageError {e.image_name}") - traceback.print_exc() - failed.append(futures[future]) - continue - except Exception: - print("Error building image") - traceback.print_exc() - failed.append(futures[future]) - continue - + successful, failed = run_threadpool(build_instance_image, payloads, max_workers) # Show how many images failed to build if len(failed) == 0: print("All instance images built successfully.") @@ -432,7 +376,7 @@ def build_instance_image( # Check that the env. 
image the instance image is based on exists try: - client.images.get(env_image_name) + env_image = client.images.get(env_image_name) except docker.errors.ImageNotFound as e: raise BuildImageError( test_spec.instance_id, @@ -494,25 +438,34 @@ def build_container( # Build corresponding instance image if force_rebuild: remove_image(client, test_spec.instance_image_key, "quiet") - build_instance_image(test_spec, client, logger, nocache) + if not test_spec.is_remote_image: + build_instance_image(test_spec, client, logger, nocache) + else: + try: + client.images.get(test_spec.instance_image_key) + except docker.errors.ImageNotFound: + try: + client.images.pull(test_spec.instance_image_key) + except docker.errors.NotFound as e: + raise BuildImageError(test_spec.instance_id, str(e), logger) from e container = None try: - # Get configurations for how container should be created - config = MAP_REPO_VERSION_TO_SPECS[test_spec.repo][test_spec.version] - user = DOCKER_USER if not config.get("execute_test_as_nonroot", False) else "nonroot" - nano_cpus = config.get("nano_cpus") - # Create the container logger.info(f"Creating container for {test_spec.instance_id}...") + + # Define arguments for running the container + run_args = test_spec.docker_specs.get("run_args", {}) + cap_add = run_args.get("cap_add", []) + container = client.containers.create( image=test_spec.instance_image_key, name=test_spec.get_instance_container_name(run_id), - user=user, + user=DOCKER_USER, detach=True, command="tail -f /dev/null", - nano_cpus=nano_cpus, platform=test_spec.platform, + cap_add=cap_add, ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") return container diff --git a/swebench/harness/docker_utils.py b/swebench/harness/docker_utils.py index 2a1dddf2..c8ebd758 100644 --- a/swebench/harness/docker_utils.py +++ b/swebench/harness/docker_utils.py @@ -307,6 +307,8 @@ def should_remove( Determine if an image should be removed based on cache level and clean flag. 
""" existed_before = image_name in prior_images + if '/' in image_name: + image_name = image_name.split('/', 1)[-1] if image_name.startswith("sweb.base"): if cache_level in {"none"} and (clean or not existed_before): return True diff --git a/swebench/harness/dockerfiles/__init__.py b/swebench/harness/dockerfiles/__init__.py new file mode 100644 index 00000000..a0583521 --- /dev/null +++ b/swebench/harness/dockerfiles/__init__.py @@ -0,0 +1,51 @@ +from swebench.harness.dockerfiles.javascript import ( + _DOCKERFILE_BASE_JS, + _DOCKERFILE_ENV_JS, + _DOCKERFILE_INSTANCE_JS, +) + +from swebench.harness.dockerfiles.python import ( + _DOCKERFILE_BASE_PY, + _DOCKERFILE_ENV_PY, + _DOCKERFILE_INSTANCE_PY, +) + +_DOCKERFILE_BASE = { + "py": _DOCKERFILE_BASE_PY, + "js": _DOCKERFILE_BASE_JS, +} + +_DOCKERFILE_ENV = { + "py": _DOCKERFILE_ENV_PY, + "js": _DOCKERFILE_ENV_JS, +} + +_DOCKERFILE_INSTANCE = { + "py": _DOCKERFILE_INSTANCE_PY, + "js": _DOCKERFILE_INSTANCE_JS, +} + +def get_dockerfile_base(platform, arch, language): + if arch == "arm64": + conda_arch = "aarch64" + else: + conda_arch = arch + return _DOCKERFILE_BASE[language].format( + platform=platform, + conda_arch=conda_arch + ) + + +def get_dockerfile_env(platform, arch, language, **kwargs): + return _DOCKERFILE_ENV[language].format( + platform=platform, + arch=arch, + **kwargs + ) + + +def get_dockerfile_instance(platform, language, env_image_name): + return _DOCKERFILE_INSTANCE[language].format( + platform=platform, + env_image_name=env_image_name + ) diff --git a/swebench/harness/dockerfiles/javascript.py b/swebench/harness/dockerfiles/javascript.py new file mode 100644 index 00000000..fa2235a4 --- /dev/null +++ b/swebench/harness/dockerfiles/javascript.py @@ -0,0 +1,135 @@ +_DOCKERFILE_BASE_JS = r""" +FROM --platform={platform} ubuntu:22.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +RUN rm /bin/sh && ln -s /bin/bash /bin/sh + +# Install necessary packages +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + libssl-dev \ + software-properties-common \ + wget \ + gnupg \ + jq \ + ca-certificates \ + dbus \ + ffmpeg \ + imagemagick \ + && apt-get -y autoclean \ + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update \ + && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg \ + fonts-khmeros fonts-kacst fonts-freefont-ttf libxss1 dbus dbus-x11 \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +# Install NVM +ENV NVM_DIR /usr/local/nvm + +RUN mkdir -p $NVM_DIR +RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.39.3/install.sh | bash + +# Install necessary libraries for Chrome +RUN apt-get update && apt-get install -y \ + procps \ + libasound2 libatk-bridge2.0-0 libatk1.0-0 libcups2 libdrm2 \ + libgbm1 libgconf-2-4 libgdk-pixbuf2.0-0 libgtk-3-0 libnspr4 \ + libnss3 libpango-1.0-0 libpangocairo-1.0-0 libxcomposite1 \ + libxdamage1 libxfixes3 libxkbcommon0 libxrandr2 libxss1 libxshmfence1 libglu1 \ + && apt-get -y autoclean \ + && rm -rf /var/lib/apt/lists/* + +# Set up Chrome for running in a container +ENV CHROME_BIN /usr/bin/google-chrome +RUN echo "CHROME_BIN=$CHROME_BIN" >> /etc/environment + +# Set DBUS for Chrome +RUN mkdir -p /run/dbus +ENV 
DBUS_SESSION_BUS_ADDRESS="unix:path=/run/dbus/system_bus_socket" +RUN dbus-daemon --system --fork + +# If puppeteer is used, make it use the installed Chrome, not download its own +ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true + +# Fix for PhantomJS runs (used by older task instances) +ENV OPENSSL_CONF /etc/ssl + +# Add a non-root user to run Chrome +RUN useradd -m chromeuser +USER chromeuser +WORKDIR /home/chromeuser + +# Switch back to root for any further commands +USER root +""" + +_DOCKERFILE_ENV_JS = r"""FROM --platform={platform} sweb.base.js.{arch}:latest + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC + +COPY ./setup_env.sh /root/ +RUN sed -i -e 's/\r$//' /root/setup_env.sh +RUN chmod +x /root/setup_env.sh + +# Install Node +ENV NODE_VERSION {node_version} +RUN source $NVM_DIR/nvm.sh \ + && nvm install $NODE_VERSION \ + && nvm alias default $NODE_VERSION \ + && nvm use default + +# Install Python +RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && apt-get install -y python{python_version} +RUN ln -s /usr/bin/python{python_version} /usr/bin/python + +# Install Python2 +RUN apt-get install -y python2 + +# Set up environment variables for Node +ENV NODE_PATH $NVM_DIR/v$NODE_VERSION/lib/node_modules +ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH +RUN echo "PATH=$PATH:/usr/local/nvm/versions/node/$NODE_VERSION/bin/node" >> /etc/environment + +# Install pnpm +ENV PNPM_VERSION {pnpm_version} +ENV PNPM_HOME /usr/local/pnpm +ENV PATH $PNPM_HOME:$PATH + +RUN mkdir -p $PNPM_HOME && \ + wget -qO $PNPM_HOME/pnpm "https://github.com/pnpm/pnpm/releases/download/v$PNPM_VERSION/pnpm-linux-x64" && \ + chmod +x $PNPM_HOME/pnpm && \ + ln -s $PNPM_HOME/pnpm /usr/local/bin/pnpm + +RUN echo "export PNPM_HOME=$PNPM_HOME" >> /etc/profile && \ + echo "export PATH=\$PNPM_HOME:\$PATH" >> /etc/profile + +# Run the setup script +RUN /bin/bash -c "source ~/.bashrc && /root/setup_env.sh" +RUN node -v +RUN npm -v +RUN pnpm -v +RUN python -V +RUN python2 -V + +WORKDIR /testbed/ +""" + +_DOCKERFILE_INSTANCE_JS = r"""FROM --platform={platform} {env_image_name} + +COPY ./setup_repo.sh /root/ +RUN sed -i -e 's/\r$//' /root/setup_repo.sh +RUN node -v +RUN npm -v +RUN /bin/bash /root/setup_repo.sh + +WORKDIR /testbed/ +""" diff --git a/swebench/harness/dockerfiles.py b/swebench/harness/dockerfiles/python.py similarity index 51% rename from swebench/harness/dockerfiles.py rename to swebench/harness/dockerfiles/python.py index b192d0d2..274b123c 100644 --- a/swebench/harness/dockerfiles.py +++ b/swebench/harness/dockerfiles/python.py @@ -1,5 +1,4 @@ -# IF you change the base image, you need to rebuild all images (run with --force_rebuild) -_DOCKERFILE_BASE = r""" +_DOCKERFILE_BASE_PY = r""" FROM --platform={platform} ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive @@ -22,7 +21,7 @@ && rm -rf /var/lib/apt/lists/* # Download and install conda -RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-{conda_version}-Linux-{conda_arch}.sh' -O miniconda.sh \ +RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-{conda_arch}.sh' -O miniconda.sh \ && bash miniconda.sh -b -p /opt/miniconda3 # Add conda to PATH ENV PATH=/opt/miniconda3/bin:$PATH @@ -33,7 +32,7 @@ RUN adduser --disabled-password --gecos 'dog' nonroot """ -_DOCKERFILE_ENV = r"""FROM --platform={platform} sweb.base.{arch}:latest +_DOCKERFILE_ENV_PY = r"""FROM --platform={platform} sweb.base.py.{arch}:latest COPY ./setup_env.sh /root/ RUN sed -i -e 's/\r$//' /root/setup_env.sh @@ -46,7 +45,7 @@ RUN echo "source 
/opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed" > /root/.bashrc """ -_DOCKERFILE_INSTANCE = r"""FROM --platform={platform} {env_image_name} +_DOCKERFILE_INSTANCE_PY = r"""FROM --platform={platform} {env_image_name} COPY ./setup_repo.sh /root/ RUN sed -i -e 's/\r$//' /root/setup_repo.sh @@ -54,22 +53,3 @@ WORKDIR /testbed/ """ - - -def get_dockerfile_base(platform, arch, conda_version=None): - if arch == "arm64": - conda_arch = "aarch64" - else: - conda_arch = arch - if conda_version == None: - # Default conda version (from initial SWE-bench release) - conda_version = "py311_23.11.0-2" - return _DOCKERFILE_BASE.format(platform=platform, conda_arch=conda_arch, conda_version=conda_version) - - -def get_dockerfile_env(platform, arch): - return _DOCKERFILE_ENV.format(platform=platform, arch=arch) - - -def get_dockerfile_instance(platform, env_image_name): - return _DOCKERFILE_INSTANCE.format(platform=platform, env_image_name=env_image_name) diff --git a/swebench/harness/grading.py b/swebench/harness/grading.py index 742a2e69..2ee02a95 100644 --- a/swebench/harness/grading.py +++ b/swebench/harness/grading.py @@ -3,21 +3,24 @@ from swebench.harness.constants import ( APPLY_PATCH_FAIL, - APPLY_PATCH_PASS, + END_TEST_OUTPUT, FAIL_TO_FAIL, FAIL_TO_PASS, KEY_INSTANCE_ID, KEY_PREDICTION, + MAP_REPO_VERSION_TO_SPECS, PASS_TO_FAIL, PASS_TO_PASS, RESET_FAILED, + START_TEST_OUTPUT, TESTS_ERROR, TESTS_TIMEOUT, + EvalType, ResolvedStatus, TestStatus, ) -from swebench.harness.test_spec import TestSpec -from swebench.harness.log_parsers import MAP_REPO_TO_PARSER +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers import MAP_REPO_TO_PARSER, get_eval_type # MARK: Utility functions @@ -32,7 +35,7 @@ def test_failed(case: str, sm: dict[str, str]) -> bool: # MARK: Evaluation report functions -def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: +def get_logs_eval(test_spec: TestSpec, log_fp: str) -> tuple[dict[str, str], bool]: """ Retrieve evaluation results for a task instance from its corresponding log file @@ -44,41 +47,35 @@ def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: TODO(john-b-yang): Check this is working properly... """ - # Convert e.g. "logs/scikit-learn__scikit-learn-12421/test_output.txt" to "scikit-learn/scikit-learn" - sample_id = str(Path(log_fp).parent.stem) # e.g. scikit-learn__scikit-learn-12421 - repo = "-".join(sample_id.replace("__", "/").split("-")[:-1]) # e.g. 
scikit-learn/scikit-learn + repo = test_spec.repo + version = test_spec.version log_parser = MAP_REPO_TO_PARSER[repo] + test_cmd = MAP_REPO_VERSION_TO_SPECS[repo][version]["test_cmd"] + if isinstance(test_cmd, list): + test_cmd = test_cmd[-1] with open(log_fp) as f: content = f.read() # TODO fix constant here - if ( - any( - [ - x in content - for x in [ - APPLY_PATCH_FAIL, - RESET_FAILED, - TESTS_ERROR, - TESTS_TIMEOUT, - "Failed to reset task environment", - ] - ] - ) - or "applied patch" not in content.lower() - ): - # Eval patch was not applied successfully + bad_codes = list(filter(lambda x: x in content, [ + APPLY_PATCH_FAIL, RESET_FAILED, TESTS_ERROR, TESTS_TIMEOUT, + ])) + if bad_codes: + return {}, False + elif not (START_TEST_OUTPUT in content and END_TEST_OUTPUT in content): + # Test patch did not apply (should not happen at all) return {}, False # Get status map of evaluation results - content = content.split(f"{APPLY_PATCH_PASS} (pred)")[-1] - return log_parser(content), True + content = content.split(test_cmd)[-1] + return log_parser(content, test_spec), True def get_eval_tests_report( - eval_sm: dict[str, str], + eval_status_map: dict[str, str], gold_results: dict[str, str], calculate_to_fail: bool = False, + eval_type: EvalType = EvalType.PASS_AND_FAIL, ) -> dict[str, dict[str, list[str]]]: """ Create a report based on failure/pass change from gold results to eval results. @@ -102,24 +99,32 @@ def get_eval_tests_report( - Fail-Fail (F2F) + P: Success (Extra Credit) - Pass-Fail (P2F) + P: Not considered """ + def check_pass_and_fail(test_case, eval_status_map, success, failed): + if test_passed(test_case, eval_status_map): + # Assume silent success for now (test case not in eval_sm) + success.append(test_case) + elif test_failed(test_case, eval_status_map): + failed.append(test_case) + + def check_fail_only(test_case, eval_status_map, success, failed): + if test_case in eval_status_map and eval_status_map[test_case] == TestStatus.FAILED.value: + failed.append(test_case) + else: + success.append(test_case) + + check_test_case = check_pass_and_fail if eval_type == EvalType.PASS_AND_FAIL else check_fail_only + # Calculate resolution metrics f2p_success = [] f2p_failure = [] for test_case in gold_results[FAIL_TO_PASS]: - if test_passed(test_case, eval_sm): - # Assume silent success for now (test case not in eval_sm) - f2p_success.append(test_case) - elif test_failed(test_case, eval_sm): - f2p_failure.append(test_case) + check_test_case(test_case, eval_status_map, f2p_success, f2p_failure) # Calculate maintenance metrics p2p_success = [] p2p_failure = [] for test_case in gold_results[PASS_TO_PASS]: - if test_passed(test_case, eval_sm): - p2p_success.append(test_case) - elif test_failed(test_case, eval_sm): - p2p_failure.append(test_case) + check_test_case(test_case, eval_status_map, p2p_success, p2p_failure) results = { FAIL_TO_PASS: { @@ -139,17 +144,11 @@ def get_eval_tests_report( if calculate_to_fail: # Calculate "extra credit" metrics for test_case in gold_results[FAIL_TO_FAIL]: - if test_passed(test_case, eval_sm): - f2f_success.append(test_case) - elif test_failed(test_case, eval_sm): - f2f_failure.append(test_case) + check_test_case(test_case, eval_status_map, f2f_success, f2f_failure) # Calculate not considered metrics for test_case in gold_results[PASS_TO_FAIL]: - if test_passed(test_case, eval_sm): - p2f_success.append(test_case) - elif test_failed(test_case, eval_sm): - p2f_failure.append(test_case) + check_test_case(test_case, eval_status_map, p2f_success, 
p2f_failure) results.update( { @@ -210,7 +209,7 @@ def get_resolution_status(report: dict[str, dict[str, Any]]) -> str: def get_eval_report( test_spec: TestSpec, prediction: dict[str, str], - log_path: str, + test_log_path: str, include_tests_status: bool, ) -> dict[str, Any]: """ @@ -242,7 +241,7 @@ def get_eval_report( report_map[instance_id]["patch_exists"] = True # Get evaluation logs - eval_sm, found = get_logs_eval(log_path) + eval_status_map, found = get_logs_eval(test_spec, test_log_path) if not found: return report_map @@ -254,7 +253,7 @@ def get_eval_report( PASS_TO_PASS: test_spec.PASS_TO_PASS, } - report = get_eval_tests_report(eval_sm, eval_ref) + report = get_eval_tests_report(eval_status_map, eval_ref, eval_type=get_eval_type(test_spec)) if get_resolution_status(report) == ResolvedStatus.FULL.value: report_map[instance_id]["resolved"] = True diff --git a/swebench/harness/log_parsers/__init__.py b/swebench/harness/log_parsers/__init__.py new file mode 100644 index 00000000..295ef213 --- /dev/null +++ b/swebench/harness/log_parsers/__init__.py @@ -0,0 +1,8 @@ +from swebench.harness.log_parsers.javascript import MAP_REPO_TO_PARSER_JS +from swebench.harness.log_parsers.python import MAP_REPO_TO_PARSER_PY +from swebench.harness.log_parsers.utils import get_eval_type + +MAP_REPO_TO_PARSER = { + **MAP_REPO_TO_PARSER_JS, + **MAP_REPO_TO_PARSER_PY, +} \ No newline at end of file diff --git a/swebench/harness/log_parsers/javascript.py b/swebench/harness/log_parsers/javascript.py new file mode 100644 index 00000000..c861692a --- /dev/null +++ b/swebench/harness/log_parsers/javascript.py @@ -0,0 +1,186 @@ +import json +import re + +from swebench.harness.constants import ( + MAP_REPO_VERSION_TO_SPECS, + TestStatus, +) +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers.utils import ansi_escape + + +def parse_log_calypso(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by Calypso test suite + """ + test_status_map = {} + suite = [] + + get_test_name = lambda suite, match_pattern, line : " - ".join([ + " - ".join([x[0] for x in suite]), + re.match(match_pattern, line).group(1) + ]).strip() + + for log in log.split(" ./node_modules/.bin/jest ")[1:]: + for line in log.split("\n"): + if any([line.startswith(x) for x in [ + "Test Suites", + " ● " + ]]): + break + elif line.strip().startswith("✓"): + # Test passed + match_pattern = r"^\s+✓\s(.*)\(\d+ms\)$" \ + if re.search(r"\(\d+ms\)", line) is not None \ + else r"^\s+✓\s(.*)" + test_status_map[ + get_test_name(suite, match_pattern, line) + ] = TestStatus.PASSED.value + elif line.strip().startswith("✕"): + # Test failed + match_pattern = r"^\s+✕\s(.*)\(\d+ms\)$" \ + if re.search(r"\(\d+ms\)", line) is not None \ + else r"^\s+✕\s(.*)" + test_status_map[ + get_test_name(suite, match_pattern, line) + ] = TestStatus.FAILED.value + elif len(line) - len(line.lstrip()) > 0: + # Adjust suite name + indent = len(line) - len(line.lstrip()) + if len(suite) == 0: + # If suite is empty, initialize it + suite = [(line.strip(), indent)] + else: + while len(suite) > 0 and suite[-1][-1] >= indent: + # Pop until the last element with indent less than current indent + suite.pop() + suite.append([line.strip(), indent]) + + return test_status_map + + +def parse_log_chart_js(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by ChartJS test suite + """ + test_status_map = {} + failure_case_patterns = [ + (r"Chrome\s[\d\.]+\s\(.*?\)\s(.*)FAILED$", 
re.MULTILINE), + ] + for failure_case_pattern, flags in failure_case_patterns: + failures = re.findall(failure_case_pattern, log, flags) + if len(failures) == 0: + continue + for failure in failures: + test_status_map[failure] = TestStatus.FAILED.value + return test_status_map + + +def parse_log_marked(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by Marked test suite + """ + test_status_map = {} + for line in log.split("\n"): + if re.search(r"^\d+\)\s(.*)", line): + test = re.search(r"^\d+\)\s(.*)", line).group(1) + test_status_map[test.strip()] = TestStatus.FAILED.value + return test_status_map + + +def parse_log_p5js(log_content: str, test_spec: TestSpec) -> dict[str, str]: + def remove_json_blocks(log): + filtered_lines = [] + in_json_block = False + in_json_list_block = False + for line in log.split('\n'): + stripped_line = line.rstrip() # Remove trailing whitespace + if stripped_line.endswith('{'): + in_json_block = True + continue + if stripped_line.endswith('['): + in_json_list_block = True + continue + if stripped_line == '}' and in_json_block: + in_json_block = False + continue + if stripped_line == ']' and in_json_list_block: + in_json_list_block = False + continue + if in_json_block or in_json_list_block: + continue + if stripped_line.startswith('{') and stripped_line.endswith('}'): + continue + if stripped_line.startswith('[') and stripped_line.endswith(']'): + continue + filtered_lines.append(line) + return '\n'.join(filtered_lines) + + def remove_xml_blocks(log): + xml_pat = re.compile(r'<(\w+)>[\s\S]*?<\/\1>', re.MULTILINE) + match = xml_pat.search(log) + while match: + # count the number of opening tags in the match + opening_tags = match.group().count(rf'<{match.group(1)}>') - 1 + opening_tags = max(opening_tags, 0) + start = match.start() + end = match.end() + log = log[:start] + f'<{match.group(1)}>' * opening_tags + log[end:] + match = xml_pat.search(log) + return log + def is_valid_fail(match): + last_line_indent = 0 + for line in match.group(2).split('\n'): + line_indent = len(line) - len(line.lstrip()) + if line_indent <= last_line_indent: + return False + last_line_indent = line_indent + return True + + test_name_pat = re.compile(r'^(.*?)(?:\s*\(\d+(?:[A-Za-z]+)\))?$') + log_content = ansi_escape(log_content) + log_content = remove_json_blocks(log_content) + log_content = remove_xml_blocks(log_content) + test_results = {} + + # Parse failing tests + fail_pattern = re.compile(r'^\s*(\d+)\)(.{0,1000}?):', re.MULTILINE | re.DOTALL) + for match in fail_pattern.finditer(log_content): + if is_valid_fail(match): + test_names = list(map(str.strip, match.group(2).split('\n'))) + full_name = ":".join(test_names) + test_results[full_name] = TestStatus.FAILED.value + + return test_results + + +def parse_log_react_pdf(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by react-pdf test suite + """ + test_status_map = {} + for line in log.split("\n"): + for pattern in [ + (r"^PASS\s(.*)\s\([\d\.]+ms\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)\s\([\d\.]+\ss\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)\s\([\d\.]+s\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)", TestStatus.PASSED.value), + (r"^FAIL\s(.*)\s\([\d\.]+ms\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)\s\([\d\.]+\ss\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)\s\([\d\.]+s\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)", TestStatus.FAILED.value), + ]: + if re.search(pattern[0], line): + test_name = re.match(pattern[0], line).group(1) + 
test_status_map[test_name] = pattern[1] + break + return test_status_map + + +MAP_REPO_TO_PARSER_JS = { + "Automattic/wp-calypso": parse_log_calypso, + "chartjs/Chart.js": parse_log_chart_js, + "markedjs/marked": parse_log_marked, + "processing/p5.js": parse_log_p5js, + "diegomura/react-pdf": parse_log_react_pdf, +} diff --git a/swebench/harness/log_parsers.py b/swebench/harness/log_parsers/python.py similarity index 93% rename from swebench/harness/log_parsers.py rename to swebench/harness/log_parsers/python.py index 1d99aec2..a8c5a937 100644 --- a/swebench/harness/log_parsers.py +++ b/swebench/harness/log_parsers/python.py @@ -1,9 +1,11 @@ import re -from enum import Enum + from swebench.harness.constants import TestStatus +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers.utils import ansi_escape -def parse_log_pytest(log: str) -> dict[str, str]: +def parse_log_pytest(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework @@ -25,7 +27,7 @@ def parse_log_pytest(log: str) -> dict[str, str]: return test_status_map -def parse_log_pytest_options(log: str) -> dict[str, str]: +def parse_log_pytest_options(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework with options @@ -56,7 +58,7 @@ def parse_log_pytest_options(log: str) -> dict[str, str]: return test_status_map -def parse_log_django(log: str) -> dict[str, str]: +def parse_log_django(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with Django tester framework @@ -132,7 +134,7 @@ def parse_log_django(log: str) -> dict[str, str]: return test_status_map -def parse_log_pytest_v2(log: str) -> dict[str, str]: +def parse_log_pytest_v2(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework (Later Version) @@ -157,11 +159,11 @@ def parse_log_pytest_v2(log: str) -> dict[str, str]: elif any([line.endswith(x.value) for x in TestStatus]): test_case = line.split() if len(test_case) >= 2: - test_status_map[test_case[0]] = test_case[1] + test_status_map[test_case[1]] = test_case[0] return test_status_map -def parse_log_seaborn(log: str) -> dict[str, str]: +def parse_log_seaborn(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with seaborn testing framework @@ -187,7 +189,7 @@ def parse_log_seaborn(log: str) -> dict[str, str]: return test_status_map -def parse_log_sympy(log: str) -> dict[str, str]: +def parse_log_sympy(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with Sympy framework @@ -205,9 +207,6 @@ def parse_log_sympy(log: str) -> dict[str, str]: for line in log.split("\n"): line = line.strip() if line.startswith("test_"): - if line.endswith("[FAIL]") or line.endswith("[OK]"): - line = line[: line.rfind("[")] - line = line.strip() if line.endswith(" E"): test = line.split()[0] test_status_map[test] = TestStatus.ERROR.value @@ -220,7 +219,7 @@ def parse_log_sympy(log: str) -> dict[str, str]: return test_status_map -def parse_log_matplotlib(log: str) -> dict[str, str]: +def parse_log_matplotlib(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework @@ -261,7 +260,7 @@ def parse_log_matplotlib(log: str) -> dict[str, str]: parse_log_sphinx = parse_log_pytest_v2 -MAP_REPO_TO_PARSER = { +MAP_REPO_TO_PARSER_PY = { "astropy/astropy": parse_log_astropy, "django/django": parse_log_django, 
"marshmallow-code/marshmallow": parse_log_marshmallow, diff --git a/swebench/harness/log_parsers/utils.py b/swebench/harness/log_parsers/utils.py new file mode 100644 index 00000000..393210b2 --- /dev/null +++ b/swebench/harness/log_parsers/utils.py @@ -0,0 +1,20 @@ +import re +from swebench.harness.constants.constants import EvalType, FAIL_ONLY_REPOS +from swebench.harness.test_spec.test_spec import TestSpec + + +def ansi_escape(text: str) -> str: + """ + Remove ANSI escape sequences from text + """ + pattern = re.compile( + r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', + re.VERBOSE, + ) + return pattern.sub('', text) + + +def get_eval_type(test_spec: TestSpec) -> str: + if test_spec.repo in FAIL_ONLY_REPOS: + return EvalType.FAIL_ONLY + return EvalType.PASS_AND_FAIL diff --git a/swebench/harness/prepare_images.py b/swebench/harness/prepare_images.py index 3d14fe81..5d798aee 100644 --- a/swebench/harness/prepare_images.py +++ b/swebench/harness/prepare_images.py @@ -6,7 +6,7 @@ from swebench.harness.constants import KEY_INSTANCE_ID from swebench.harness.docker_build import build_instance_images from swebench.harness.docker_utils import list_images -from swebench.harness.test_spec import make_test_spec +from swebench.harness.test_spec.test_spec import make_test_spec from swebench.harness.utils import load_swebench_dataset, str2bool @@ -14,7 +14,9 @@ def filter_dataset_to_build( dataset: list, instance_ids: list | None, client: docker.DockerClient, - force_rebuild: bool + force_rebuild: bool, + namespace: str = None, + tag: str = None, ): """ Filter the dataset to only include instances that need to be built. @@ -43,7 +45,7 @@ def filter_dataset_to_build( continue # Check if the instance needs to be built (based on force_rebuild flag and existing images) - spec = make_test_spec(instance) + spec = make_test_spec(instance, namespace=namespace, instance_image_tag=tag) if force_rebuild: data_to_build.append(instance) elif spec.instance_image_key not in existing_images: @@ -59,6 +61,8 @@ def main( max_workers, force_rebuild, open_file_limit, + namespace, + tag, ): """ Build Docker images for the specified instances. 
@@ -75,7 +79,7 @@ def main( # Filter out instances that were not specified dataset = load_swebench_dataset(dataset_name, split) - dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild) + dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild, namespace, tag) # Build images for remaining instances successful, failed = build_instance_images( @@ -83,6 +87,8 @@ def main( dataset=dataset, force_rebuild=force_rebuild, max_workers=max_workers, + namespace=namespace, + tag=tag, ) print(f"Successfully built {len(successful)} images") print(f"Failed to build {len(failed)} images") @@ -96,5 +102,7 @@ def main( parser.add_argument("--max_workers", type=int, default=4, help="Max workers for parallel processing") parser.add_argument("--force_rebuild", type=str2bool, default=False, help="Force rebuild images") parser.add_argument("--open_file_limit", type=int, default=8192, help="Open file limit") + parser.add_argument("--namespace", type=str, default=None, help="Namespace to use for the images") + parser.add_argument("--tag", type=str, default=None, help="Tag to use for the images") args = parser.parse_args() main(**vars(args)) diff --git a/swebench/harness/remove_containers.py b/swebench/harness/remove_containers.py index fc538767..addc9bea 100644 --- a/swebench/harness/remove_containers.py +++ b/swebench/harness/remove_containers.py @@ -1,7 +1,7 @@ +import docker import json -from argparse import ArgumentParser -import docker +from argparse import ArgumentParser """ Script for removing containers associated with specified instance IDs. diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py index 1dd51ffd..8da3311c 100644 --- a/swebench/harness/run_evaluation.py +++ b/swebench/harness/run_evaluation.py @@ -8,10 +8,8 @@ if platform.system() == 'Linux': import resource -from argparse import ArgumentParser -from concurrent.futures import ThreadPoolExecutor, as_completed +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from pathlib import Path, PurePosixPath -from tqdm import tqdm from swebench.harness.constants import ( APPLY_PATCH_FAIL, @@ -30,13 +28,13 @@ UTF8, ) from swebench.harness.docker_utils import ( - remove_image, + clean_images, + cleanup_container, copy_to_container, exec_run_with_timeout, - cleanup_container, list_images, + remove_image, should_remove, - clean_images, ) from swebench.harness.docker_build import ( BuildImageError, @@ -46,8 +44,15 @@ setup_logger, ) from swebench.harness.grading import get_eval_report -from swebench.harness.test_spec import make_test_spec, TestSpec -from swebench.harness.utils import load_swebench_dataset, str2bool +from swebench.harness.test_spec.test_spec import make_test_spec, TestSpec +from swebench.harness.utils import load_swebench_dataset, str2bool, run_threadpool + +GIT_APPLY_CMDS = [ + "git apply --allow-empty -v", + "git apply --verbose", + "git apply --verbose --reject", + "patch --batch --fuzz=5 -p1 -i", +] class EvaluationError(Exception): @@ -73,6 +78,7 @@ def run_instance( client: docker.DockerClient, run_id: str, timeout: int | None = None, + rewrite_reports: bool = False, ): """ Run a single instance with the given prediction. 
@@ -85,29 +91,47 @@ def run_instance( client (docker.DockerClient): Docker client run_id (str): Run ID timeout (int): Timeout for running tests + rewrite_reports (bool): True if eval run is just to reformat existing report """ # Set up logging directory instance_id = test_spec.instance_id model_name_or_path = pred.get(KEY_MODEL, "None").replace("/", "__") log_dir = RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id - log_dir.mkdir(parents=True, exist_ok=True) - - # Link the image build dir in the log dir - build_dir = INSTANCE_IMAGE_BUILD_DIR / test_spec.instance_image_key.replace(":", "__") - image_build_link = log_dir / "image_build_dir" - if not image_build_link.exists(): - try: - # link the image build dir in the log dir - image_build_link.symlink_to(build_dir.absolute(), target_is_directory=True) - except: - # some error, idk why - pass - log_file = log_dir / LOG_INSTANCE - # Set up report file + logger + # Set up report file report_path = log_dir / LOG_REPORT + if rewrite_reports: + test_output_path = log_dir / LOG_TEST_OUTPUT + if not test_output_path.exists(): + raise ValueError(f"Test output file {test_output_path} does not exist") + report = get_eval_report( + test_spec=test_spec, + prediction=pred, + test_log_path=test_output_path, + include_tests_status=True, + ) + # Write report to report.json + with open(report_path, "w") as f: + f.write(json.dumps(report, indent=4)) + return instance_id, report if report_path.exists(): return instance_id, json.loads(report_path.read_text()) + + if not test_spec.is_remote_image: + # Link the image build dir in the log dir + build_dir = INSTANCE_IMAGE_BUILD_DIR / test_spec.instance_image_key.replace(":", "__") + image_build_link = log_dir / "image_build_dir" + if not image_build_link.exists(): + try: + # link the image build dir in the log dir + image_build_link.symlink_to(build_dir.absolute(), target_is_directory=True) + except: + # some error, idk why + pass + + # Set up logger + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / LOG_INSTANCE logger = setup_logger(instance_id, log_file) # Run the instance @@ -126,36 +150,27 @@ def run_instance( ) copy_to_container(container, patch_file, PurePosixPath(DOCKER_PATCH)) - # Attempt to apply patch to container - val = container.exec_run( - f"git apply --allow-empty -v {DOCKER_PATCH}", - workdir=DOCKER_WORKDIR, - user=DOCKER_USER, - ) - if val.exit_code != 0: - logger.info(f"Failed to apply patch to container, trying again...") - - # try "patch --batch --fuzz=5 -p1 -i {patch_path}" to try again - val = container.exec_run( - f"patch --batch --fuzz=5 -p1 -i {DOCKER_PATCH}", - workdir=DOCKER_WORKDIR, - user=DOCKER_USER, - ) - if val.exit_code != 0: - logger.info(f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}") - raise EvaluationError( - instance_id, - f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}", - logger, - ) - else: + # Attempt to apply patch to container (TODO: FIX THIS) + applied_patch = False + for git_apply_cmd in GIT_APPLY_CMDS: + val = container.exec_run(f"{git_apply_cmd} {DOCKER_PATCH}", workdir=DOCKER_WORKDIR, user=DOCKER_USER) + if val.exit_code == 0: logger.info(f"{APPLY_PATCH_PASS}:\n{val.output.decode(UTF8)}") - else: - logger.info(f"{APPLY_PATCH_PASS}:\n{val.output.decode(UTF8)}") + applied_patch = True + break + else: + logger.info(f"Failed to apply patch to container: {git_apply_cmd}") + if not applied_patch: + logger.info(f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}") + raise EvaluationError( + instance_id, + 
f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}", + logger, + ) # Get git diff before running eval script git_diff_output_before = ( - container.exec_run("git diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() + container.exec_run("git -c core.fileMode=false diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() ) logger.info(f"Git diff before:\n{git_diff_output_before}") @@ -164,7 +179,7 @@ def run_instance( logger.info( f"Eval script for {instance_id} written to {eval_file}; copying to container..." ) - copy_to_container(container, eval_file, Path("/eval.sh")) + copy_to_container(container, eval_file, PurePosixPath("/eval.sh")) # Run eval script, write output to logs test_output, timed_out, total_runtime = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout) @@ -181,9 +196,9 @@ def run_instance( logger, ) - # Get git diff after running eval script + # Get git diff after running eval script (ignore permission changes) git_diff_output_after = ( - container.exec_run("git diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() + container.exec_run("git -c core.fileMode=false diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() ) # Check if git diff changed after running eval script @@ -196,7 +211,7 @@ def run_instance( report = get_eval_report( test_spec=test_spec, prediction=pred, - log_path=test_output_path, + test_log_path=test_output_path, include_tests_status=True, ) logger.info( @@ -239,6 +254,9 @@ def run_instances( max_workers: int, run_id: str, timeout: int, + namespace: str = None, + instance_image_tag: str = 'latest', + rewrite_reports: bool = False, ): """ Run all instances for the given predictions in parallel. @@ -254,7 +272,10 @@ def run_instances( timeout (int): Timeout for running tests """ client = docker.from_env() - test_specs = list(map(make_test_spec, instances)) + test_specs = list(map( + lambda instance: make_test_spec(instance, namespace=namespace, instance_image_tag=instance_image_tag), + instances + )) # print number of existing instance images instance_image_ids = {x.instance_image_key for x in test_specs} @@ -265,38 +286,28 @@ def run_instances( if not force_rebuild and len(existing_images): print(f"Found {len(existing_images)} existing instance images. 
Will reuse them.") + # run instances in parallel + payloads = [] + for test_spec in test_specs: + payloads.append(( + test_spec, + predictions[test_spec.instance_id], + should_remove( + test_spec.instance_image_key, + cache_level, + clean, + existing_images, + ), + force_rebuild, + client, + run_id, + timeout, + rewrite_reports, + )) + # run instances in parallel print(f"Running {len(instances)} instances...") - with tqdm(total=len(instances), smoothing=0) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for running each instance - futures = { - executor.submit( - run_instance, - test_spec, - predictions[test_spec.instance_id], - should_remove( - test_spec.instance_image_key, - cache_level, - clean, - existing_images, - ), - force_rebuild, - client, - run_id, - timeout, - ): None - for test_spec in test_specs - } - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if instance ran successfully - future.result() - except Exception as e: - traceback.print_exc() - continue + run_threadpool(run_instance, payloads, max_workers) print("All instances run.") @@ -306,7 +317,8 @@ def get_dataset_from_preds( instance_ids: list, predictions: dict, run_id: str, - exclude_completed: bool = True + rewrite_reports: bool, + exclude_completed: bool = True, ): """ Return only instances that have predictions and are in the dataset. @@ -335,6 +347,25 @@ def get_dataset_from_preds( if instance_ids: dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids] + if rewrite_reports: + # we only return instances that have existing test outputs + test_output_ids = set() + for instance in dataset: + if instance[KEY_INSTANCE_ID] not in predictions: + continue + prediction = predictions[instance[KEY_INSTANCE_ID]] + test_output_file = ( + RUN_EVALUATION_LOG_DIR + / run_id + / prediction["model_name_or_path"].replace("/", "__") + / prediction[KEY_INSTANCE_ID] + / "test_output.txt" + ) + if test_output_file.exists(): + test_output_ids.add(instance[KEY_INSTANCE_ID]) + dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in prediction_ids and i[KEY_INSTANCE_ID] in test_output_ids] + return dataset + # check which instance IDs have already been run completed_ids = set() for instance in dataset: @@ -368,7 +399,10 @@ def make_run_report( predictions: dict, full_dataset: list, client: docker.DockerClient, - run_id: str + run_id: str, + namespace: str | None = None, + instance_image_tag: str = 'latest', + report_dir: str = '.', ) -> Path: """ Make a final evaluation and run report of the instances that have been run. @@ -427,7 +461,10 @@ def make_run_report( # get remaining images and containers images = list_images(client) - test_specs = list(map(make_test_spec, full_dataset)) + test_specs = list(map( + lambda x: make_test_spec(x, namespace=namespace, instance_image_tag=instance_image_tag), + full_dataset + )) for spec in test_specs: image_name = spec.instance_image_key if image_name in images: @@ -476,6 +513,8 @@ def make_run_report( + f".{run_id}" + ".json" ) + if report_dir is not None: + report_file = Path(report_dir) / report_file with open(report_file, "w") as f: print(json.dumps(report, indent=4), file=f) print(f"Report written to {report_file}") @@ -508,16 +547,28 @@ def main( open_file_limit: int, run_id: str, timeout: int, + namespace: str | None, + rewrite_reports: bool, + instance_image_tag: str = 'latest', + report_dir: str = '.' 
): """ Run evaluation harness for the given dataset and predictions. """ # set open file limit assert len(run_id) > 0, "Run ID must be provided" + if report_dir is not None: + report_dir = Path(report_dir) + if not report_dir.exists(): + report_dir.mkdir(parents=True) + if platform.system() == 'Linux': resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit)) client = docker.from_env() + if force_rebuild and namespace is not None: + raise ValueError("Cannot force rebuild and use a namespace at the same time.") + # load predictions as map of instance_id to prediction if predictions_path == 'gold': print("Using gold predictions - ignoring predictions_path") @@ -534,7 +585,7 @@ def main( predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions} # get dataset from predictions - dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id) + dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id, rewrite_reports) full_dataset = load_swebench_dataset(dataset_name, split, instance_ids) existing_images = list_images(client) print(f"Running {len(dataset)} unevaluated instances...") @@ -542,8 +593,21 @@ def main( print("No instances to run.") else: # build environment images + run instances - build_env_images(client, dataset, force_rebuild, max_workers) - run_instances(predictions, dataset, cache_level, clean, force_rebuild, max_workers, run_id, timeout) + if namespace is None and not rewrite_reports: + build_env_images(client, dataset, force_rebuild, max_workers) + run_instances( + predictions, + dataset, + cache_level, + clean, + force_rebuild, + max_workers, + run_id, + timeout, + namespace=namespace, + instance_image_tag=instance_image_tag, + rewrite_reports=rewrite_reports, + ) # clean images + make final report clean_images(client, existing_images, cache_level, clean) @@ -551,18 +615,57 @@ def main( if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("--dataset_name", default="princeton-nlp/SWE-bench_Lite", type=str, help="Name of dataset or path to JSON file.") - parser.add_argument("--split", type=str, default="test", help="Split of the dataset") - parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)") - parser.add_argument("--predictions_path", type=str, help="Path to predictions file - if 'gold', uses gold predictions", required=True) - parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of workers (should be <= 75%% of CPU cores)") - parser.add_argument("--open_file_limit", type=int, default=4096, help="Open file limit") + parser = ArgumentParser( + description="Run evaluation harness for the given dataset and predictions.", + formatter_class=ArgumentDefaultsHelpFormatter, + ) parser.add_argument( - "--timeout", type=int, default=1_800, help="Timeout (in seconds) for running tests for each instance" - ) + "--dataset_name", + default="princeton-nlp/SWE-bench_Lite", + type=str, + help="Name of dataset or path to JSON file." 
+ ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset", + ) + parser.add_argument( + "--instance_ids", + nargs="+", + type=str, + help="Instance IDs to run (space separated)", + ) + parser.add_argument( + "--predictions_path", + type=str, + help="Path to predictions file - if 'gold', uses gold predictions", + required=True, + ) + parser.add_argument( + "--max_workers", + type=int, + default=4, + help="Maximum number of workers (should be <= 75%% of CPU cores)", + ) parser.add_argument( - "--force_rebuild", type=str2bool, default=False, help="Force rebuild of all images" + "--open_file_limit", + type=int, + default=4096, + help="Open file limit", + ) + parser.add_argument( + "--timeout", + type=int, + default=1_800, + help="Timeout (in seconds) for running tests for each instance", + ) + parser.add_argument( + "--force_rebuild", + type=str2bool, + default=False, + help="Force rebuild of all images", ) parser.add_argument( "--cache_level", @@ -574,9 +677,41 @@ def main( # if clean is true then we remove all images that are above the cache level # if clean is false, we only remove images above the cache level if they don't already exist parser.add_argument( - "--clean", type=str2bool, default=False, help="Clean images above cache level" + "--clean", + type=str2bool, + default=False, + help="Clean images above cache level", + ) + parser.add_argument( + "--run_id", + type=str, + required=True, + help="Run ID - identifies the run", + ) + parser.add_argument( + "--namespace", + type=str, + default=None, + help="Namespace for images", + ) + parser.add_argument( + "--instance_image_tag", + type=str, + default='latest', + help="Instance image tag", + ) + parser.add_argument( + "--rewrite_reports", + type=str2bool, + default=False, + help="Doesn't run new instances, only writes reports for instances with existing test outputs", + ) + parser.add_argument( + "--report_dir", + type=str, + default=".", + help="Directory to write reports to", ) - parser.add_argument("--run_id", type=str, required=True, help="Run ID - identifies the run") args = parser.parse_args() main(**vars(args)) diff --git a/swebench/harness/test_spec.py b/swebench/harness/test_spec.py deleted file mode 100644 index bc5d2f23..00000000 --- a/swebench/harness/test_spec.py +++ /dev/null @@ -1,330 +0,0 @@ -from __future__ import annotations - -import hashlib -import json -import platform -import re - -from dataclasses import dataclass -from typing import Any, Union, cast - -from swebench.harness.constants import ( - SWEbenchInstance, - KEY_INSTANCE_ID, - FAIL_TO_PASS, - PASS_TO_PASS, - MAP_REPO_TO_INSTALL, - MAP_REPO_VERSION_TO_SPECS, - USE_X86, - UTF8, -) -from swebench.harness.dockerfiles import ( - get_dockerfile_base, - get_dockerfile_env, - get_dockerfile_instance, -) -from swebench.harness.utils import ( - get_requirements, - get_environment_yml, - get_test_directives, -) - -DIFF_MODIFIED_FILE_REGEX = r"--- a/(.*)" - - -@dataclass -class TestSpec: - """ - A dataclass that represents a test specification for a single instance of SWE-bench. 
- """ - instance_id: str - repo: str - version: str - repo_script_list: list[str] - eval_script_list: list[str] - env_script_list: list[str] - arch: str - FAIL_TO_PASS: list[str] - PASS_TO_PASS: list[str] - - @property - def setup_env_script(self): - return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.env_script_list) + "\n" - - @property - def eval_script(self): - return "\n".join(["#!/bin/bash", "set -uxo pipefail"] + self.eval_script_list) + "\n" - # Don't exit early because we need to revert tests at the end - - @property - def install_repo_script(self): - return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.repo_script_list) + "\n" - - @property - def base_image_key(self): - return f"sweb.base.{self.arch}:latest" - - @property - def env_image_key(self): - """ - The key for the environment image is based on the hash of the environment script list. - If the environment script list changes, the image will be rebuilt automatically. - - Note that old images are not automatically deleted, so consider cleaning up old images periodically. - """ - hash_object = hashlib.sha256() - hash_object.update(str(self.env_script_list).encode(UTF8)) - hash_value = hash_object.hexdigest() - val = hash_value[:22] # 22 characters is still very likely to be unique - return f"sweb.env.{self.arch}.{val}:latest" - - @property - def instance_image_key(self): - return f"sweb.eval.{self.arch}.{self.instance_id}:latest" - - def get_instance_container_name(self, run_id=None): - if not run_id: - return f"sweb.eval.{self.instance_id}" - return f"sweb.eval.{self.instance_id}.{run_id}" - - @property - def base_dockerfile(self): - return get_dockerfile_base(self.platform, self.arch) - - @property - def env_dockerfile(self): - return get_dockerfile_env(self.platform, self.arch) - - @property - def instance_dockerfile(self): - return get_dockerfile_instance(self.platform, self.env_image_key) - - @property - def platform(self): - if self.arch == "x86_64": - return "linux/x86_64" - elif self.arch == "arm64": - return "linux/arm64/v8" - else: - raise ValueError(f"Invalid architecture: {self.arch}") - - -def get_test_specs_from_dataset(dataset: Union[list[SWEbenchInstance], list[TestSpec]]) -> list[TestSpec]: - """ - Idempotent function that converts a list of SWEbenchInstance objects to a list of TestSpec objects. - """ - if isinstance(dataset[0], TestSpec): - return cast(list[TestSpec], dataset) - return list(map(make_test_spec, cast(list[SWEbenchInstance], dataset))) - - -def make_repo_script_list(specs, repo, repo_directory, base_commit, env_name): - """ - Create a list of bash commands to set up the repository for testing. - This is the setup script for the instance image. - """ - setup_commands = [ - f"git clone -o origin https://github.com/{repo} {repo_directory}", - f"chmod -R 777 {repo_directory}", # So nonroot user can run tests - f"cd {repo_directory}", - f"git reset --hard {base_commit}", - # Remove the remote so the agent won't see newer commits. 
- "git remote remove origin", - # Make sure conda is available for later use - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - 'echo "Current environment: $CONDA_DEFAULT_ENV"', - ] - if repo in MAP_REPO_TO_INSTALL: - setup_commands.append(MAP_REPO_TO_INSTALL[repo]) - - # Run pre-install set up if provided - if "pre_install" in specs: - for pre_install in specs["pre_install"]: - setup_commands.append(pre_install) - - if "install" in specs: - setup_commands.append(specs["install"]) - return setup_commands - - -def replace_uninstallable_packages_requirements_txt(requirement_str: str) -> str: - """Replaces certain packages in a requirements.txt-like string. - For example, some packages have been yanked and we need to replace them with compatible alternatives. - """ - replacements = { - # See https://github.com/princeton-nlp/SWE-bench/issues/199 - # This package was sinced yanked, so we need to force pip - # to install it. - "types-pkg_resources": "types-pkg-resources==0.1.3", - } - requirements = [req.strip() for req in requirement_str.split("\n") if req.strip()] - requirements_replaced = [] - for requirement in requirements: - if requirement in replacements: - print(f"Replaced {requirement!r} with {replacements[requirement]!r} (replace_uninstallable_packages)") - requirements_replaced.append(replacements[requirement]) - else: - requirements_replaced.append(requirement) - return "\n".join(requirements_replaced) + "\n" - - -def make_env_script_list(instance: SWEbenchInstance, specs: dict, env_name: str) -> list[str]: - """ - Creates the list of commands to set up the conda environment for testing. - This is the setup script for the environment image. - - Returns: - list[str]: List of commands to set up the conda environment - """ - HEREDOC_DELIMITER = "EOF_59812759871" - reqs_commands = [ - "source /opt/miniconda3/bin/activate", - ] - # Create conda environment according to install instructinos - pkgs = specs.get("packages", "") - if pkgs == "requirements.txt": - # Create environment - cmd = f"conda create -n {env_name} python={specs['python']} -y" - reqs_commands.append(cmd) - - # Install dependencies - reqs = replace_uninstallable_packages_requirements_txt(get_requirements(instance)) - path_to_reqs = "$HOME/requirements.txt" - reqs_commands.append( - f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" - ) - cmd = f"conda activate {env_name} && python -m pip install -r {path_to_reqs}" - reqs_commands.append(cmd) - reqs_commands.append(f"rm {path_to_reqs}") - elif pkgs == "environment.yml": - # Create environment from yml - reqs = get_environment_yml(instance, env_name) - path_to_reqs = "environment.yml" - reqs_commands.append( - f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" - ) - if "no_use_env" in specs and specs["no_use_env"]: - # `conda create` based installation - cmd = f"conda create -c conda-forge -n {env_name} python={specs['python']} -y" - reqs_commands.append(cmd) - - # Install dependencies - cmd = f"conda env update -f {path_to_reqs}" - reqs_commands.append(cmd) - else: - # `conda env create` based installation - cmd = f"conda env create --file {path_to_reqs}" - reqs_commands.append(cmd) - - cmd = f"conda activate {env_name} && conda install python={specs['python']} -y" - reqs_commands.append(cmd) - - # Remove environment.yml - reqs_commands.append(f"rm {path_to_reqs}") - else: - # Create environment + install dependencies - cmd = f"conda create -n {env_name} python={specs['python']} {pkgs} -y" - 
reqs_commands.append(cmd) - - reqs_commands.append(f"conda activate {env_name}") - - # Install additional packages if specified - if "pip_packages" in specs: - pip_packages = " ".join(specs["pip_packages"]) - cmd = f"python -m pip install {pip_packages}" - reqs_commands.append(cmd) - return reqs_commands - - -def make_eval_script_list(instance, specs, env_name, repo_directory, base_commit, test_patch): - """ - Applies the test patch and runs the tests. - """ - HEREDOC_DELIMITER = "EOF_114329324912" - test_files = re.findall(DIFF_MODIFIED_FILE_REGEX, test_patch) - # Reset test files to the state they should be in before the patch. - reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" - apply_test_patch_command = ( - f"git apply -v - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" - ) - test_command = " ".join( - [ - MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"], - *get_test_directives(instance), - ] - ) - eval_commands = [ - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - f"cd {repo_directory}", - ] - if "eval_commands" in specs: - eval_commands += specs["eval_commands"] - eval_commands += [ - f"git config --global --add safe.directory {repo_directory}", # for nonroot user - f"cd {repo_directory}", - # This is just informational, so we have a record - "git status", - "git show", - f"git diff {base_commit}", - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - ] - if "install" in specs: - eval_commands.append(specs["install"]) - eval_commands += [ - reset_tests_command, - apply_test_patch_command, - test_command, - reset_tests_command, # Revert tests after done, leave the repo in the same state as before - ] - return eval_commands - - -def make_test_spec(instance: SWEbenchInstance) -> TestSpec: - if isinstance(instance, TestSpec): - return instance - instance_id = instance[KEY_INSTANCE_ID] - repo = instance["repo"] - version = instance["version"] - base_commit = instance["base_commit"] - problem_statement = instance["problem_statement"] - hints_text = instance["hints_text"] # Unused - test_patch = instance["test_patch"] - - def _from_json_or_obj(key: str) -> Any: - """If key points to string, load with json""" - if isinstance(instance[key], str): - return json.loads(instance[key]) - return instance[key] - - pass_to_pass = _from_json_or_obj(PASS_TO_PASS) - fail_to_pass = _from_json_or_obj(FAIL_TO_PASS) - - env_name = "testbed" - repo_directory = f"/{env_name}" - specs = MAP_REPO_VERSION_TO_SPECS[repo][version] - - repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name) - env_script_list = make_env_script_list(instance, specs, env_name) - eval_script_list = make_eval_script_list( - instance, specs, env_name, repo_directory, base_commit, test_patch - ) - if platform.machine() in {"aarch64", "arm64"}: - # use arm64 unless explicitly specified - arch = "arm64" if instance_id not in USE_X86 else "x86_64" - else: - arch = "x86_64" - - return TestSpec( - instance_id=instance_id, - repo=repo, - env_script_list=env_script_list, - repo_script_list=repo_script_list, - eval_script_list=eval_script_list, - version=version, - arch=arch, - FAIL_TO_PASS=fail_to_pass, - PASS_TO_PASS=pass_to_pass, - ) diff --git a/swebench/harness/test_spec/__init__.py b/swebench/harness/test_spec/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/harness/test_spec/create_scripts.py b/swebench/harness/test_spec/create_scripts.py new file mode 100644 
index 00000000..7587b90f --- /dev/null +++ b/swebench/harness/test_spec/create_scripts.py @@ -0,0 +1,51 @@ +from swebench.harness.test_spec.javascript import ( + make_repo_script_list_js, + make_env_script_list_js, + make_eval_script_list_js, +) +from swebench.harness.test_spec.python import ( + make_repo_script_list_py, + make_env_script_list_py, + make_eval_script_list_py, +) +from swebench.harness.constants import MAP_REPO_TO_EXT + + +def make_repo_script_list( + specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + ext = MAP_REPO_TO_EXT[repo] + func = { + "js": make_repo_script_list_js, + "py": make_repo_script_list_py, + }[ext] + return func(specs, repo, repo_directory, base_commit, env_name) + + +def make_env_script_list(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the environment for testing. + This is the setup script for the environment image. + """ + ext = MAP_REPO_TO_EXT[instance["repo"]] + func = { + "js": make_env_script_list_js, + "py": make_env_script_list_py, + }[ext] + return func(instance, specs, env_name) + + +def make_eval_script_list( + instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + ext = MAP_REPO_TO_EXT[instance["repo"]] + func = { + "js": make_eval_script_list_js, + "py": make_eval_script_list_py, + }[ext] + return func(instance, specs, env_name, repo_directory, base_commit, test_patch) diff --git a/swebench/harness/test_spec/javascript.py b/swebench/harness/test_spec/javascript.py new file mode 100644 index 00000000..acee9b7f --- /dev/null +++ b/swebench/harness/test_spec/javascript.py @@ -0,0 +1,138 @@ +import re + +from pathlib import Path +from swebench.harness.constants import ( + END_TEST_OUTPUT, + MAP_REPO_VERSION_TO_SPECS, + START_TEST_OUTPUT, + TEST_XVFB_PREFIX, +) +from swebench.harness.utils import get_modified_files +from unidiff import PatchSet + + +# MARK: Test Command Creation Functions +def get_test_cmds_calypso(instance) -> list: + test_paths = [x.path for x in PatchSet(instance['test_patch'])] + test_cmds = [] + for test_path in test_paths: + if re.search(r"__snapshots__/(.*).js.snap$", test_path): + # Jest snapshots are not run directly + test_path = "/".join(test_path.split("/")[:-2]) + + # Determine which testing script to use + if any([test_path.startswith(x) for x in ["client", "packages"]]): + pkg = test_path.split("/")[0] + if instance['version'] in [ + '10.10.0', '10.12.0', '10.13.0', + '10.14.0', '10.15.2', '10.16.3' + ]: + test_cmds.append(f"./node_modules/.bin/jest --verbose -c=test/{pkg}/jest.config.js '{test_path}'") + elif instance['version'] in [ + '6.11.5', '8.9.1', '8.9.3', '8.9.4', '8.11.0', '8.11.2', + '10.4.1', '10.5.0', '10.6.0', '10.9.0', + ]: + test_cmds.append(f"./node_modules/.bin/jest --verbose -c=test/{pkg}/jest.config.json '{test_path}'") + else: + test_cmds.append(f"npm run test-{pkg} --verbose '{test_path}'") + elif any([test_path.startswith(x) for x in ["test/e2e"]]): + test_cmds.extend([ + "cd test/e2e", + f"NODE_CONFIG_ENV=test npm run test {test_path}", + "cd ../..", + ]) + + return test_cmds + + +MAP_REPO_TO_TEST_CMDS = { + "Automattic/wp-calypso": get_test_cmds_calypso, +} + + +def get_test_cmds(instance) -> list: + if instance["repo"] in MAP_REPO_TO_TEST_CMDS: + return MAP_REPO_TO_TEST_CMDS[instance["repo"]](instance) + test_cmd = 
MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"] + return [test_cmd] if isinstance(test_cmd, str) else test_cmd + + +# MARK: Utility Functions + +def get_download_img_commands(instance) -> list: + cmds = [] + for i in instance.get("image_assets", {}).get("test_patch", []): + folder = Path(i["path"]).parent + cmds.append(f"mkdir -p {folder}") + cmds.append(f"curl -o {i['path']} {i['url']}") + cmds.append(f"chmod 777 {i['path']}") + return cmds + + +# MARK: Script Creation Functions + +def make_repo_script_list_js(specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + setup_commands = [ + f"git clone -o origin https://github.com/{repo} {repo_directory}", + f"cd {repo_directory}", + f"git reset --hard {base_commit}", + f"chmod -R 777 {repo_directory}", # So nonroot user can run tests + # Remove the remote so the agent won't see newer commits. + f"git remote remove origin", + ] + if "install" in specs: + setup_commands.extend(specs["install"]) + return setup_commands + + +def make_env_script_list_js(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the environment for testing. + This is the setup script for the environment image. + """ + reqs_commands = [] + if "apt-pkgs" in specs: + reqs_commands += [ + "apt-get update", + f"apt-get install -y {' '.join(specs['apt-pkgs'])}" + ] + return reqs_commands + + +def make_eval_script_list_js(instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + HEREDOC_DELIMITER = "EOF_114329324912" + test_files = get_modified_files(test_patch) + # Reset test files to the state they should be in before the patch. 
+ if test_files: + reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + else: + reset_tests_command = f'echo "No test files to reset"' + + apply_test_patch_command = ( + f"git apply --verbose --reject - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" + ) + test_commands = get_test_cmds(instance) + eval_commands = [ + f"cd {repo_directory}", + f"git config --global --add safe.directory {repo_directory}", # for nonroot user + f"cd {repo_directory}", + # This is just informational, so we have a record + # f"git status", + # f"git show", + # f"git -c core.fileMode=false diff {base_commit}", + reset_tests_command, + *get_download_img_commands(instance), + apply_test_patch_command, + f": '{START_TEST_OUTPUT}'", + *test_commands, + f": '{END_TEST_OUTPUT}'", + reset_tests_command, + ] + return eval_commands diff --git a/swebench/harness/test_spec/python.py b/swebench/harness/test_spec/python.py new file mode 100644 index 00000000..235751db --- /dev/null +++ b/swebench/harness/test_spec/python.py @@ -0,0 +1,306 @@ +import os +import posixpath +import re +import requests + +from swebench.harness.constants import ( + SWEbenchInstance, + MAP_REPO_TO_ENV_YML_PATHS, + MAP_REPO_TO_INSTALL, + MAP_REPO_TO_REQS_PATHS, + MAP_REPO_VERSION_TO_SPECS, + NON_TEST_EXTS, + SWE_BENCH_URL_RAW, + START_TEST_OUTPUT, + END_TEST_OUTPUT, +) +from swebench.harness.utils import get_modified_files +from functools import cache + +HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + + +@cache +def get_environment_yml_by_commit(repo: str, commit: str, env_name: str) -> str: + for req_path in MAP_REPO_TO_ENV_YML_PATHS[repo]: + reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + break + else: + raise ValueError( + f"Could not find environment.yml at paths {MAP_REPO_TO_ENV_YML_PATHS[repo]} for repo {repo} at commit {commit}" + ) + + lines = reqs.text.split("\n") + cleaned = [] + for line in lines: + # Rename environment to given name + if line.startswith("name:"): + cleaned.append(f"name: {env_name}") + continue + cleaned.append(line) + + return "\n".join(cleaned) + + +def get_environment_yml(instance: SWEbenchInstance, env_name: str) -> str: + """ + Get environment.yml for given task instance + + Args: + instance (dict): SWE Bench Task instance + env_name (str): Rename retrieved environment.yml to this name + Returns: + environment.yml (str): Returns environment.yml as string + """ + # Attempt to find environment.yml at each path based on task instance's repo + commit = ( + instance["environment_setup_commit"] + if "environment_setup_commit" in instance + else instance["base_commit"] + ) + + return get_environment_yml_by_commit(instance["repo"], commit, env_name) + + +@cache +def get_requirements_by_commit(repo: str, commit: str) -> str: + for req_path in MAP_REPO_TO_REQS_PATHS[repo]: + reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + break + else: + raise ValueError( + f"Could not find requirements.txt at paths {MAP_REPO_TO_REQS_PATHS[repo]} for repo {repo} at commit {commit}" + ) + + lines = reqs.text + original_req = [] + additional_reqs = [] + req_dir = "/".join(req_path.split("/")[:-1]) + exclude_line = lambda line: any( + [line.strip().startswith(x) for x in ["-e .", "#", ".[test"]] + ) + + for 
line in lines.split("\n"): + if line.strip().startswith("-r"): + # Handle recursive requirements + file_name = line[len("-r") :].strip() + reqs_url = os.path.join( + SWE_BENCH_URL_RAW, + repo, + commit, + req_dir, + file_name, + ) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + for line_extra in reqs.text.split("\n"): + if not exclude_line(line_extra): + additional_reqs.append(line_extra) + else: + if not exclude_line(line): + original_req.append(line) + + # Combine all requirements into single text body + additional_reqs.append("\n".join(original_req)) + all_reqs = "\n".join(additional_reqs) + + return all_reqs + + +def get_requirements(instance: SWEbenchInstance) -> str: + """ + Get requirements.txt for given task instance + + Args: + instance (dict): task instance + Returns: + requirements.txt (str): Returns requirements.txt as string + """ + # Attempt to find requirements.txt at each path based on task instance's repo + commit = ( + instance["environment_setup_commit"] + if "environment_setup_commit" in instance + else instance["base_commit"] + ) + + return get_requirements_by_commit(instance["repo"], commit) + + +def get_test_directives(instance: SWEbenchInstance) -> list: + """ + Get test directives from the test_patch of a task instance + + Args: + instance (dict): task instance + Returns: + directives (list): List of test directives + """ + # For seq2seq code repos, testing command is fixed + if instance["repo"] == "swe-bench/humaneval": + return ["test.py"] + + # Get test directives from test patch and remove non-test files + diff_pat = r"diff --git a/.* b/(.*)" + test_patch = instance["test_patch"] + directives = re.findall(diff_pat, test_patch) + directives = [ + d for d in directives if not any(d.endswith(ext) for ext in NON_TEST_EXTS) + ] + + # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing) + if instance["repo"] == "django/django": + directives_transformed = [] + for d in directives: + d = d[: -len(".py")] if d.endswith(".py") else d + d = d[len("tests/") :] if d.startswith("tests/") else d + d = d.replace("/", ".") + directives_transformed.append(d) + directives = directives_transformed + + return directives + + +def make_repo_script_list_py(specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + setup_commands = [ + f"git clone -o origin https://github.com/{repo} {repo_directory}", + f"chmod -R 777 {repo_directory}", # So nonroot user can run tests + f"cd {repo_directory}", + f"git reset --hard {base_commit}", + # Remove the remote so the agent won't see newer commits. + f"git remote remove origin", + # Make sure conda is available for later use + "source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + f'echo "Current environment: $CONDA_DEFAULT_ENV"', + ] + if repo in MAP_REPO_TO_INSTALL: + setup_commands.append(MAP_REPO_TO_INSTALL[repo]) + + # Run pre-install set up if provided + if "pre_install" in specs: + for pre_install in specs["pre_install"]: + setup_commands.append(pre_install) + + if "install" in specs: + setup_commands.append(specs["install"]) + return setup_commands + + +def make_env_script_list_py(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the conda environment for testing. + This is the setup script for the environment image. 
+ """ + HEREDOC_DELIMITER = "EOF_59812759871" + reqs_commands = [ + "source /opt/miniconda3/bin/activate", + ] + # Create conda environment according to install instructinos + pkgs = specs.get("packages", "") + if pkgs == "requirements.txt": + # Create environment + cmd = f"conda create -n {env_name} python={specs['python']} -y" + reqs_commands.append(cmd) + + # Install dependencies + reqs = get_requirements(instance) + path_to_reqs = "$HOME/requirements.txt" + reqs_commands.append( + f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" + ) + cmd = f"conda activate {env_name} && python -m pip install -r {path_to_reqs}" + reqs_commands.append(cmd) + reqs_commands.append(f"rm {path_to_reqs}") + elif pkgs == "environment.yml": + # Create environment from yml + reqs = get_environment_yml(instance, env_name) + path_to_reqs = "environment.yml" + reqs_commands.append( + f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" + ) + if "no_use_env" in specs and specs["no_use_env"]: + # `conda create` based installation + cmd = f"conda create -c conda-forge -n {env_name} python={specs['python']} -y" + reqs_commands.append(cmd) + + # Install dependencies + cmd = f"conda env update -f {path_to_reqs}" + reqs_commands.append(cmd) + else: + # `conda env create` based installation + cmd = f"conda env create --file {path_to_reqs}" + reqs_commands.append(cmd) + + cmd = f"conda activate {env_name} && conda install python={specs['python']} -y" + reqs_commands.append(cmd) + + # Remove environment.yml + reqs_commands.append(f"rm {path_to_reqs}") + else: + # Create environment + install dependencies + cmd = f"conda create -n {env_name} python={specs['python']} {pkgs} -y" + reqs_commands.append(cmd) + + reqs_commands.append(f"conda activate {env_name}") + + # Install additional packages if specified + if "pip_packages" in specs: + pip_packages = " ".join(specs["pip_packages"]) + cmd = f"python -m pip install {pip_packages}" + reqs_commands.append(cmd) + return reqs_commands + + +def make_eval_script_list_py(instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + HEREDOC_DELIMITER = "EOF_114329324912" + test_files = get_modified_files(test_patch) + # Reset test files to the state they should be in before the patch. 
+ reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + apply_test_patch_command = ( + f"git apply -v - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" + ) + test_command = " ".join( + [ + MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"], + *get_test_directives(instance), + ] + ) + eval_commands = [ + f"source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + f"cd {repo_directory}", + ] + if "eval_commands" in specs: + eval_commands += specs["eval_commands"] + eval_commands += [ + f"git config --global --add safe.directory {repo_directory}", # for nonroot user + f"cd {repo_directory}", + # This is just informational, so we have a record + f"git status", + f"git show", + f"git -c core.fileMode=false diff {base_commit}", + "source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + ] + if "install" in specs: + eval_commands.append(specs["install"]) + eval_commands += [ + reset_tests_command, + apply_test_patch_command, + f": '{START_TEST_OUTPUT}'", + test_command, + f": '{END_TEST_OUTPUT}'", + reset_tests_command, # Revert tests after done, leave the repo in the same state as before + ] + return eval_commands diff --git a/swebench/harness/test_spec/test_spec.py b/swebench/harness/test_spec/test_spec.py new file mode 100644 index 00000000..83abe995 --- /dev/null +++ b/swebench/harness/test_spec/test_spec.py @@ -0,0 +1,202 @@ +import hashlib +import json +import platform + +from dataclasses import dataclass +from typing import Any, Union, cast + +from swebench.harness.constants import ( + DEFAULT_DOCKER_SPECS, + KEY_INSTANCE_ID, + LATEST, + MAP_REPO_TO_EXT, + MAP_REPO_VERSION_TO_SPECS, + USE_X86, +) +from swebench.harness.constants.constants import SWEbenchInstance +from swebench.harness.dockerfiles import ( + get_dockerfile_base, + get_dockerfile_env, + get_dockerfile_instance, +) +from swebench.harness.test_spec.create_scripts import ( + make_repo_script_list, + make_env_script_list, + make_eval_script_list, +) + + +@dataclass +class TestSpec: + """ + A dataclass that represents a test specification for a single instance of SWE-bench. + """ + instance_id: str + repo: str + version: str + repo_script_list: list[str] + eval_script_list: list[str] + env_script_list: list[str] + arch: str + FAIL_TO_PASS: list[str] + PASS_TO_PASS: list[str] + language: str + docker_specs: dict + namespace: str + base_image_tag: str = LATEST + env_image_tag: str = LATEST + instance_image_tag: str = LATEST + + @property + def setup_env_script(self): + return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.env_script_list) + "\n" + + @property + def eval_script(self): + return "\n".join(["#!/bin/bash", "set -uxo pipefail"] + self.eval_script_list) + "\n" + # Don't exit early because we need to revert tests at the end + + @property + def install_repo_script(self): + return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.repo_script_list) + "\n" + + @property + def base_image_key(self): + return f"sweb.base.{MAP_REPO_TO_EXT[self.repo]}.{self.arch}:{self.base_image_tag}" + + @property + def env_image_key(self): + """ + The key for the environment image is based on the hash of the environment script list. + If the environment script list changes, the image will be rebuilt automatically. + + Note that old images are not automatically deleted, so consider cleaning up old images periodically. 
+ """ + hash_key = str(self.env_script_list) + if self.docker_specs != {}: + hash_key += str(self.docker_specs) + hash_object = hashlib.sha256() + hash_object.update(hash_key.encode("utf-8")) + hash_value = hash_object.hexdigest() + val = hash_value[:22] # 22 characters is still very likely to be unique + return f"sweb.env.{MAP_REPO_TO_EXT[self.repo]}.{self.arch}.{val}:{self.env_image_tag}" + + @property + def instance_image_key(self): + key = f"sweb.eval.{self.arch}.{self.instance_id.lower()}:{self.instance_image_tag}" + if self.is_remote_image: + key = f"{self.namespace}/{key}".replace("__", "_1776_") + return key + + @property + def is_remote_image(self): + return self.namespace is not None + + def get_instance_container_name(self, run_id=None): + if not run_id: + return f"sweb.eval.{self.instance_id}" + return f"sweb.eval.{self.instance_id.lower()}.{run_id}" + + @property + def base_dockerfile(self): + return get_dockerfile_base(self.platform, self.arch, self.language) + + @property + def env_dockerfile(self): + return get_dockerfile_env(self.platform, self.arch, self.language, **{ + **DEFAULT_DOCKER_SPECS, + **self.docker_specs + }) + + @property + def instance_dockerfile(self): + return get_dockerfile_instance(self.platform, self.language, self.env_image_key) + + @property + def platform(self): + if self.arch == "x86_64": + return "linux/x86_64" + elif self.arch == "arm64": + return "linux/arm64/v8" + else: + raise ValueError(f"Invalid architecture: {self.arch}") + + +def get_test_specs_from_dataset( + dataset: Union[list[SWEbenchInstance], list[TestSpec]], + namespace: str=None, + instance_image_tag: str=LATEST, +) -> list[TestSpec]: + """ + Idempotent function that converts a list of SWEbenchInstance objects to a list of TestSpec objects. + """ + if isinstance(dataset[0], TestSpec): + return cast(list[TestSpec], dataset) + return list(map(lambda x: make_test_spec(x, namespace, instance_image_tag), cast(list[SWEbenchInstance], dataset))) + + +def make_test_spec( + instance: SWEbenchInstance, + namespace: str=None, + base_image_tag: str=LATEST, + env_image_tag: str=LATEST, + instance_image_tag: str=LATEST, + ) -> TestSpec: + if isinstance(instance, TestSpec): + return instance + assert base_image_tag is not None, "base_image_tag cannot be None" + assert env_image_tag is not None, "env_image_tag cannot be None" + assert instance_image_tag is not None, "instance_image_tag cannot be None" + instance_id = instance[KEY_INSTANCE_ID] + repo = instance["repo"] + version = instance.get("version") + base_commit = instance["base_commit"] + problem_statement = instance.get("problem_statement") + hints_text = instance.get("hints_text") # Unused + test_patch = instance["test_patch"] + + def _from_json_or_obj(key: str) -> Any: + """If key points to string, load with json""" + if key not in instance: + # If P2P, F2P keys not found, it's a validation instance + return [] + if isinstance(instance[key], str): + return json.loads(instance[key]) + return instance[key] + + pass_to_pass = _from_json_or_obj("PASS_TO_PASS") + fail_to_pass = _from_json_or_obj("FAIL_TO_PASS") + + env_name = "testbed" + repo_directory = f"/{env_name}" + specs = MAP_REPO_VERSION_TO_SPECS[repo][version] + docker_specs = specs.get("docker_specs", {}) + + repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name) + env_script_list = make_env_script_list(instance, specs, env_name) + eval_script_list = make_eval_script_list( + instance, specs, env_name, repo_directory, base_commit, test_patch + ) + 
if platform.machine() in {"aarch64", "arm64"}: + # use arm64 unless explicitly specified + arch = "arm64" if instance_id not in USE_X86 else "x86_64" + else: + arch = "x86_64" + + return TestSpec( + instance_id=instance_id, + repo=repo, + env_script_list=env_script_list, + repo_script_list=repo_script_list, + eval_script_list=eval_script_list, + version=version, + arch=arch, + FAIL_TO_PASS=fail_to_pass, + PASS_TO_PASS=pass_to_pass, + language=MAP_REPO_TO_EXT[repo], + docker_specs=docker_specs, + namespace=namespace, + base_image_tag=base_image_tag, + env_image_tag=env_image_tag, + instance_image_tag=instance_image_tag, + ) diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py index 9a5b7af9..55524071 100644 --- a/swebench/harness/utils.py +++ b/swebench/harness/utils.py @@ -1,28 +1,68 @@ import json -import os -import posixpath -from pathlib import Path import re import requests +import traceback from argparse import ArgumentTypeError -from datasets import Dataset, load_dataset +from concurrent.futures import ThreadPoolExecutor, as_completed +from datasets import Dataset, load_dataset, load_from_disk from dotenv import load_dotenv -from functools import cache +from pathlib import Path +from tqdm import tqdm from typing import cast - -from swebench.harness.constants import ( - SWEbenchInstance, - MAP_REPO_TO_ENV_YML_PATHS, - MAP_REPO_TO_REQS_PATHS, - NON_TEST_EXTS, - SWE_BENCH_URL_RAW, - KEY_INSTANCE_ID, -) +from swebench.harness.constants import SWEbenchInstance, KEY_INSTANCE_ID +from unidiff import PatchSet load_dotenv() -HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + +def run_threadpool(func, payloads, max_workers): + if max_workers <= 0: + return run_sequential(func, payloads) + succeeded, failed = [], [] + with tqdm(total=len(payloads), smoothing=0) as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Create a future for running each instance + futures = { + executor.submit( + func, + *payload + ): payload + for payload in payloads + } + # Wait for each future to complete + for future in as_completed(futures): + try: + # Update progress bar, check if instance ran successfully + future.result() + succeeded.append(futures[future]) + except Exception as e: + print(f"{type(e)}: {e}") + traceback.print_exc() + failed.append(futures[future]) + continue + pbar.update(1) + pbar.set_description(f"{len(succeeded)} ran successfully, {len(failed)} failed") + return succeeded, failed + + +def run_sequential(func, args_list): + """ + Run a function with a list of arguments sequentially + """ + succeeded, failed = [], [] + pbar = tqdm(total=len(args_list), smoothing=0) + for args in args_list: + try: + func(*args) + succeeded.append(args) + except Exception as e: + traceback.print_exc() + failed.append(args) + pbar.update(1) + pbar.set_description(f"{len(succeeded)} ran successfully, {len(failed)} failed") + pbar.close() + return succeeded, failed def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]: @@ -42,7 +82,10 @@ def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance name = "princeton-nlp/SWE-bench" elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}: name = "princeton-nlp/SWE-bench_Lite" - dataset = cast(Dataset, load_dataset(name, split=split)) + if (Path(name) / split / 'dataset_info.json').exists(): + dataset = 
cast(Dataset, load_from_disk(Path(name) / split)) + else: + dataset = cast(Dataset, load_dataset(name, split=split)) dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset} if instance_ids: if instance_ids - dataset_ids: @@ -88,6 +131,8 @@ def strip_content(hunk): first_idx = get_first_idx(first_chars) last_idx = get_last_idx(first_chars) new_lines = list(map(lambda x: x.rstrip(), hunk.split("\n")[first_idx:last_idx])) + # should leave one space for empty context lines + new_lines = [line if line.strip() else " " for line in new_lines] new_hunk = "\n" + "\n".join(new_lines) + "\n" return new_hunk, first_idx - 1 @@ -174,151 +219,6 @@ def get_lines_with_word(text, target_word): return False -@cache -def get_environment_yml_by_commit(repo: str, commit: str, env_name: str) -> str: - for req_path in MAP_REPO_TO_ENV_YML_PATHS[repo]: - reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - break - else: - raise ValueError( - f"Could not find environment.yml at paths {MAP_REPO_TO_ENV_YML_PATHS[repo]} for repo {repo} at commit {commit}" - ) - - lines = reqs.text.split("\n") - cleaned = [] - for line in lines: - # Rename environment to given name - if line.startswith("name:"): - cleaned.append(f"name: {env_name}") - continue - cleaned.append(line) - - return "\n".join(cleaned) - - -def get_environment_yml(instance: SWEbenchInstance, env_name: str) -> str: - """ - Get environment.yml for given task instance - - Args: - instance (dict): SWE Bench Task instance - env_name (str): Rename retrieved environment.yml to this name - Returns: - environment.yml (str): Returns environment.yml as string - """ - # Attempt to find environment.yml at each path based on task instance's repo - - commit = ( - instance["environment_setup_commit"] - if "environment_setup_commit" in instance - else instance["base_commit"] - ) - - return get_environment_yml_by_commit(instance["repo"], commit, env_name) - - -@cache -def get_requirements_by_commit(repo: str, commit: str) -> str: - for req_path in MAP_REPO_TO_REQS_PATHS[repo]: - reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - break - else: - raise ValueError( - f"Could not find requirements.txt at paths {MAP_REPO_TO_REQS_PATHS[repo]} for repo {repo} at commit {commit}" - ) - - lines = reqs.text - original_req = [] - additional_reqs = [] - req_dir = "/".join(req_path.split("/")[:-1]) - exclude_line = lambda line: any( - [line.strip().startswith(x) for x in ["-e .", "#", ".[test"]] - ) - - for line in lines.split("\n"): - if line.strip().startswith("-r"): - # Handle recursive requirements - file_name = line[len("-r") :].strip() - reqs_url = os.path.join( - SWE_BENCH_URL_RAW, - repo, - commit, - req_dir, - file_name, - ) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - for line_extra in reqs.text.split("\n"): - if not exclude_line(line_extra): - additional_reqs.append(line_extra) - else: - if not exclude_line(line): - original_req.append(line) - - # Combine all requirements into single text body - additional_reqs.append("\n".join(original_req)) - all_reqs = "\n".join(additional_reqs) - - return all_reqs - - -def get_requirements(instance: SWEbenchInstance) -> str: - """ - Get requirements.txt for given task instance - - Args: - instance (dict): task instance - Returns: - requirements.txt (str): Returns requirements.txt as string - """ - # 
Attempt to find requirements.txt at each path based on task instance's repo - commit = ( - instance["environment_setup_commit"] - if "environment_setup_commit" in instance - else instance["base_commit"] - ) - - return get_requirements_by_commit(instance["repo"], commit) - - -def get_test_directives(instance: SWEbenchInstance) -> list: - """ - Get test directives from the test_patch of a task instance - - Args: - instance (dict): task instance - Returns: - directives (list): List of test directives - """ - # For seq2seq code repos, testing command is fixed - if instance["repo"] == "swe-bench/humaneval": - return ["test.py"] - - # Get test directives from test patch and remove non-test files - diff_pat = r"diff --git a/.* b/(.*)" - test_patch = instance["test_patch"] - directives = re.findall(diff_pat, test_patch) - directives = [ - d for d in directives if not any(d.endswith(ext) for ext in NON_TEST_EXTS) - ] - - # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing) - if instance["repo"] == "django/django": - directives_transformed = [] - for d in directives: - d = d[: -len(".py")] if d.endswith(".py") else d - d = d[len("tests/") :] if d.startswith("tests/") else d - d = d.replace("/", ".") - directives_transformed.append(d) - directives = directives_transformed - - return directives - - def str2bool(v): """ Minor helper function to convert string to boolean @@ -331,3 +231,26 @@ def str2bool(v): return False else: raise ArgumentTypeError("Boolean value expected.") + + +def get_repo_file(repo, commit, filepath): + url = f'https://raw.githubusercontent.com/{repo}/{commit}/{filepath}' + try: + response = requests.get(url) + if response.status_code == 200: + return response.text + return None + except: + return None + + +def get_modified_files(patch: str) -> list[str]: + """ + Get the list of modified files in a patch + """ + source_files = [] + for file in PatchSet(patch): + if file.source_file != '/dev/null': + source_files.append(file.source_file) + source_files = [x[2:] for x in source_files if x.startswith('a/')] + return source_files
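
For orientation, here is a minimal, self-contained sketch of how the two helpers this patch adds to swebench/harness/utils.py (get_modified_files and run_threadpool) can be exercised on their own. The sample patch text and the process_file worker below are invented for illustration; in the harness itself, run_instances builds the payload tuples and passes run_instance as the worker.

    # Usage sketch (assumes the patched swebench package is importable).
    from swebench.harness.utils import get_modified_files, run_threadpool

    SAMPLE_PATCH = """\
    --- a/src/example.py
    +++ b/src/example.py
    @@ -1,2 +1,2 @@
    -x = 1
    +x = 2
    """

    def process_file(path: str, patch_text: str) -> None:
        # Stand-in worker: the real harness submits run_instance with its payload tuple.
        print(f"{path}: {len(patch_text.splitlines())} patch lines")

    modified = get_modified_files(SAMPLE_PATCH)            # -> ["src/example.py"]
    payloads = [(path, SAMPLE_PATCH) for path in modified]

    # run_threadpool fans func(*payload) out over a thread pool, tracks progress with
    # tqdm, and returns (succeeded, failed); max_workers <= 0 falls back to run_sequential.
    succeeded, failed = run_threadpool(process_file, payloads, max_workers=2)
    print(f"{len(succeeded)} succeeded, {len(failed)} failed")

Note that the sequential fallback mirrors the behavior of the threaded path (same return value and progress reporting), so callers can switch between the two simply by choosing max_workers.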