Skip to content

Commit

Permalink
SWE-bench Multimodal Dev (#285)
Browse files Browse the repository at this point in the history
* Add SWE-bench multimodal refactoring

* Minor cleanup

* Misc changes

* Update test_spec
  • Loading branch information
john-b-yang authored Jan 13, 2025
1 parent 5f5a7df commit d83e100
Show file tree
Hide file tree
Showing 26 changed files with 1,846 additions and 883 deletions.
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
'jedi',
'tenacity',
],
'test': [
'pytest',
'pytest-cov',
]
},
include_package_data=True,
)
7 changes: 3 additions & 4 deletions swebench/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "2.1.7"
__version__ = "3.0.0"

from swebench.collect.build_dataset import main as build_dataset
from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline
Expand Down Expand Up @@ -48,8 +48,7 @@
)

from swebench.harness.utils import (
get_environment_yml,
get_requirements,
run_threadpool,
)

from swebench.versioning.constants import (
Expand All @@ -59,9 +58,9 @@

from swebench.versioning.get_versions import (
get_version,
map_version_to_task_instances,
get_versions_from_build,
get_versions_from_web,
map_version_to_task_instances,
)

from swebench.versioning.utils import (
Expand Down
21 changes: 21 additions & 0 deletions swebench/harness/constants/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from swebench.harness.constants.constants import *
from swebench.harness.constants.javascript import *
from swebench.harness.constants.python import *

# Merge the per-language tables into single lookups. In every merge the
# Python table is unpacked last, so its entry wins if a key is ever present
# in both language tables.
MAP_REPO_VERSION_TO_SPECS = {
    **MAP_REPO_VERSION_TO_SPECS_JS,
    **MAP_REPO_VERSION_TO_SPECS_PY,
}

MAP_REPO_TO_INSTALL = {
    **MAP_REPO_TO_INSTALL_JS,
    **MAP_REPO_TO_INSTALL_PY,
}

# Tag each repository with "js" or "py" according to the spec table that
# lists it.
MAP_REPO_TO_EXT = {
    **dict.fromkeys(MAP_REPO_VERSION_TO_SPECS_JS, "js"),
    **dict.fromkeys(MAP_REPO_VERSION_TO_SPECS_PY, "py"),
}

LATEST = "latest"
# Re-exported under a language-neutral name; only the Python value is used.
USE_X86 = USE_X86_PY
113 changes: 113 additions & 0 deletions swebench/harness/constants/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
from enum import Enum
from pathlib import Path
from typing import TypedDict

# Constants - Evaluation Log Directories
# Every path is relative, so it resolves against the current working
# directory of the process that uses it.
_LOG_ROOT = Path("logs")
_IMAGE_BUILD_LOG_ROOT = _LOG_ROOT / "build_images"

BASE_IMAGE_BUILD_DIR = _IMAGE_BUILD_LOG_ROOT / "base"
ENV_IMAGE_BUILD_DIR = _IMAGE_BUILD_LOG_ROOT / "env"
INSTANCE_IMAGE_BUILD_DIR = _IMAGE_BUILD_LOG_ROOT / "instances"
RUN_EVALUATION_LOG_DIR = _LOG_ROOT / "run_evaluation"
RUN_VALIDATION_LOG_DIR = _LOG_ROOT / "run_validation"

# Constants - Task Instance Class
class SWEbenchInstance(TypedDict):
    """Typed dictionary listing the fields of a single task instance.

    Every field is declared as ``str``. ``FAIL_TO_PASS`` and ``PASS_TO_PASS``
    are therefore strings here as well.
    NOTE(review): presumably those two hold serialized lists of test names;
    confirm against the dataset loader before relying on their format.
    """

    repo: str
    instance_id: str
    base_commit: str
    patch: str
    test_patch: str
    problem_statement: str
    hints_text: str
    created_at: str
    version: str
    FAIL_TO_PASS: str
    PASS_TO_PASS: str
    environment_setup_commit: str

# Constants - Test Types, Statuses, Commands
# Category labels; the value of each constant is identical to its own name.
# FAIL_TO_PASS and PASS_TO_PASS are also field names on SWEbenchInstance.
FAIL_TO_PASS = "FAIL_TO_PASS"
FAIL_TO_FAIL = "FAIL_TO_FAIL"
PASS_TO_PASS = "PASS_TO_PASS"
PASS_TO_FAIL = "PASS_TO_FAIL"

class ResolvedStatus(Enum):
    """String-valued enumeration of resolution outcomes (none, partial, full)."""

    NO = "RESOLVED_NO"
    PARTIAL = "RESOLVED_PARTIAL"
    FULL = "RESOLVED_FULL"

class TestStatus(Enum):
    """String-valued enumeration of per-test statuses.

    Each member's value is the same text as its name.
    """

    FAILED = "FAILED"
    PASSED = "PASSED"
    SKIPPED = "SKIPPED"
    ERROR = "ERROR"
    XFAIL = "XFAIL"

class EvalType(Enum):
    """String-valued enumeration of evaluation modes.

    NOTE(review): presumably FAIL_ONLY applies to the repositories listed in
    FAIL_ONLY_REPOS; confirm where the mode is selected.
    """

    PASS_AND_FAIL = "pass_and_fail"
    FAIL_ONLY = "fail_only"

# Constants - Evaluation Keys
# NOTE(review): presumably the field names of a prediction record; confirm
# against the code that loads predictions.
KEY_INSTANCE_ID = "instance_id"
KEY_MODEL = "model_name_or_path"
KEY_PREDICTION = "model_patch"

# Constants - Harness
# NOTE(review): the DOCKER_* values look like in-container settings (patch
# file location, user, working directory); confirm against the runner.
DOCKER_PATCH = "/tmp/patch.diff"
DOCKER_USER = "root"
DOCKER_WORKDIR = "/testbed"
# File names (no directory component) used for log artifacts.
LOG_REPORT = "report.json"
LOG_INSTANCE = "run_instance.log"
LOG_TEST_OUTPUT = "test_output.txt"
UTF8 = "utf-8"

# Constants - Logging
# Marker strings; every one starts with the same ">>>>> " prefix.
APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed"
APPLY_PATCH_PASS = ">>>>> Applied Patch"
INSTALL_FAIL = ">>>>> Init Failed"
INSTALL_PASS = ">>>>> Init Succeeded"
INSTALL_TIMEOUT = ">>>>> Init Timed Out"
RESET_FAILED = ">>>>> Reset Failed"
TESTS_ERROR = ">>>>> Tests Errored"
TESTS_FAILED = ">>>>> Some Tests Failed"
TESTS_PASSED = ">>>>> All Tests Passed"
TESTS_TIMEOUT = ">>>>> Tests Timed Out"
START_TEST_OUTPUT = ">>>>> Start Test Output"
END_TEST_OUTPUT = ">>>>> End Test Output"

# Constants - Patch Types
class PatchType(Enum):
    """String-valued enumeration of patch kinds.

    ``str()`` on a member yields its bare value (e.g. ``"gold"``) instead of
    the default ``"PatchType.PATCH_GOLD"`` form, so members can be dropped
    straight into text such as file names or log messages.
    """

    PATCH_GOLD = "gold"
    PATCH_PRED = "pred"
    PATCH_PRED_TRY = "pred_try"
    PATCH_PRED_MINIMAL = "pred_minimal"
    PATCH_PRED_MINIMAL_TRY = "pred_minimal_try"
    PATCH_TEST = "test"

    def __str__(self) -> str:
        """Return the member's value rather than its qualified name."""
        return self.value

# Constants - Miscellaneous
# File suffixes treated as non-test files. Every entry carries its leading
# dot so that a suffix comparison cannot match a name that merely ends in the
# same letters (the CSV entry used to be the bare string "csv").
# NOTE(review): the consumer is not visible here; presumably these are
# compared with str.endswith or Path.suffix. Confirm at the call site.
NON_TEST_EXTS = [
    ".json",
    ".png",
    ".csv",
    ".txt",
    ".md",
    ".jpg",
    ".jpeg",
    ".pkl",
    ".yml",
    ".yaml",
    ".toml",
]
SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/"

# Default tool versions. The key names match those used inside the
# per-version "docker_specs" entries of the spec tables.
DEFAULT_DOCKER_SPECS = dict(
    pnpm_version="9.5.0",
    node_version="21.6.2",
    python_version="3.9",
)

# Repositories singled out by name.
# NOTE(review): presumably these are evaluated with EvalType.FAIL_ONLY;
# confirm where this set is consulted.
FAIL_ONLY_REPOS = {"chartjs/Chart.js", "processing/p5.js", "markedjs/marked"}
161 changes: 161 additions & 0 deletions swebench/harness/constants/javascript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Constants - Commonly Used Commands
# Command prefix that runs whatever follows under xvfb-run with a
# 1280x1024x24 screen 0 on display :99.
TEST_XVFB_PREFIX = 'xvfb-run --server-args="-screen 0 1280x1024x24 -ac :99"'

# Package names needed for xvfb-based test runs (assigned as "apt-pkgs").
XVFB_DEPS = [
    "python3",
    "python3-pip",
    "xvfb",
    "x11-xkb-utils",
    "xfonts-100dpi",
    "xfonts-75dpi",
    "xfonts-scalable",
    "xfonts-cyrillic",
    "x11-apps",
    "firefox",
]

# X11 / GTK shared-library package names (used as "apt-pkgs").
X11_DEPS = [
    "libx11-xcb1",
    "libxcomposite1",
    "libxcursor1",
    "libxdamage1",
    "libxi6",
    "libxtst6",
    "libnss3",
    "libcups2",
    "libxss1",
    "libxrandr2",
    "libasound2",
    "libatk1.0-0",
    "libgtk-3-0",
    "x11-utils",
]

# Constants - Task Instance Installation Environment
# One identical spec per key; the key itself is reused as the Node version
# string for that entry. Each entry gets its own freshly built dict and lists.
SPECS_CALYPSO = {
    node_version: {
        "apt-pkgs": ["libsass-dev", "sassc"],
        "install": ["npm install --unsafe-perm"],
        "test_cmd": "npm run test-client",
        "docker_specs": {
            "node_version": node_version,
        },
    }
    for node_version in (
        "0.8",
        "4.2.3", "4.3.0",
        "5.10.1", "5.11.1",
        "6.1.0", "6.7.0", "6.9.0", "6.9.1", "6.9.4", "6.10.0",
        "6.10.2", "6.10.3", "6.11.1", "6.11.2", "6.11.5",
        "8.9.1", "8.9.3", "8.9.4", "8.11.0", "8.11.2",
        "10.4.1", "10.5.0", "10.6.0", "10.9.0", "10.10.0",
        "10.12.0", "10.13.0", "10.14.0", "10.15.2", "10.16.3",
    )
}

# Karma invocation; "{}" is replaced with the path of the karma config file.
TEST_CHART_JS_TEMPLATE = "./node_modules/.bin/cross-env NODE_ENV=test ./node_modules/.bin/karma start {} --single-run --coverage --grep --auto-watch false"

# Chart.js specs, in three groups with different tooling:
#   4.x  pnpm install/build, karma with ./karma.conf.cjs, pinned pnpm 7.9.0
#   3.x  npm install/build, karma with ./karma.conf.js
#   2.x  npm + gulp-cli, tests run through "gulp test"
# In every group "test_cmd" is a list that repeats the install and build steps
# before the test command, and the test command itself runs under xvfb as the
# user "chromeuser". All groups request the SYS_ADMIN capability via run_args.
SPECS_CHART_JS = {
    **{
        k: {
            "install": [
                "pnpm install",
                "pnpm run build",
            ],
            "test_cmd": [
                "pnpm install",
                "pnpm run build",
                f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.cjs')}\"",
            ],
            "docker_specs": {
                "node_version": "21.6.2",
                "pnpm_version": "7.9.0",
                "run_args": {
                    "cap_add": ["SYS_ADMIN"],
                },
            },
        }
        for k in ['4.0', '4.1', '4.2', '4.3', '4.4']
    },
    **{
        k: {
            "install": ["npm install"],
            "test_cmd": [
                "npm install",
                "npm run build",
                f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.js')}\"",
            ],
            "docker_specs": {
                "node_version": "21.6.2",
                "run_args": {
                    "cap_add": ["SYS_ADMIN"],
                },
            },
        }
        for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8']
    },
    **{
        k: {
            "install": [
                "npm install",
                "npm install -g gulp-cli",
            ],
            "test_cmd": [
                "npm install",
                "gulp build",
                TEST_XVFB_PREFIX + ' su chromeuser -c "gulp test"',
            ],
            "docker_specs": {
                "node_version": "21.6.2",
                "run_args": {
                    "cap_add": ["SYS_ADMIN"],
                },
            },
        }
        for k in ['2.0', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9']
    },
}
# Every version gets the xvfb package list. All entries reference the same
# XVFB_DEPS list object, so it must be treated as read-only.
for v in SPECS_CHART_JS:
    SPECS_CHART_JS[v]["apt-pkgs"] = XVFB_DEPS

# marked specs: every version shares the same install and test commands and
# starts out on Node 12.22.12. Each version gets its own dict objects, so the
# per-version override below does not leak into other entries.
SPECS_MARKED = {
    **{
        k: {
            "install": ["npm install"],
            "test_cmd": "./node_modules/.bin/jasmine --no-color --config=jasmine.json",
            "docker_specs": {
                "node_version": "12.22.12",
            },
        }
        for k in [
            '0.3', '0.5', '0.6', '0.7', '1.0', '1.1',
            '1.2', '2.0', '3.9', '4.0', '4.1', '5.0',
        ]
    }
}
# Versions 4.0 and later are switched to Node 20.16.0.
for v in ['4.0', '4.1', '5.0']:
    SPECS_MARKED[v]["docker_specs"]["node_version"] = "20.16.0"

# p5.js specs: all versions run on Node 14.17.3 with the X11 package list.
# The default install runs the puppeteer install script (with
# PUPPETEER_SKIP_CHROMIUM_DOWNLOAD set to an empty string) and then the grunt
# "yui" task. The test command is a two-line script: the first line rewrites
# any "concurrency: <n>" setting in Gruntfile.js to 1, the second runs
# "grunt test" with a 1M stdout buffer.
SPECS_P5_JS = {
    **{
        k: {
            "apt-pkgs": X11_DEPS,
            "install": [
                "npm install",
                "PUPPETEER_SKIP_CHROMIUM_DOWNLOAD='' node node_modules/puppeteer/install.js",
                "./node_modules/.bin/grunt yui",
            ],
            "test_cmd": (
                """sed -i 's/concurrency:[[:space:]]*[0-9][0-9]*/concurrency: 1/g' Gruntfile.js\n"""
                "stdbuf -o 1M ./node_modules/.bin/grunt test --quiet --force"
            ),
            "docker_specs": {
                "node_version": "14.17.3",
            },
        }
        for k in [
            "0.10", "0.2", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9",
            "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
            "1.8", "1.9",
        ]
    },
}
# These versions install without the puppeteer step.
for k in ['0.4', '0.5', '0.6']:
    SPECS_P5_JS[k]["install"] = [
        "npm install",
        "./node_modules/.bin/grunt yui",
    ]

# react-pdf specs: the default entry (kept as-is for 2.0) installs with yarn
# on Node 18.20.4 and runs jest with experimental VM modules enabled. The
# apt package list is the native build dependencies followed by X11_DEPS.
SPECS_REACT_PDF = {
    **{
        k: {
            "apt-pkgs": [
                "pkg-config",
                "build-essential",
                "libpixman-1-0",
                "libpixman-1-dev",
                "libcairo2-dev",
                "libpango1.0-dev",
                "libjpeg-dev",
                "libgif-dev",
                "librsvg2-dev",
            ] + X11_DEPS,
            "install": [
                "npm i -g yarn",
                "yarn install",
            ],
            "test_cmd": 'NODE_OPTIONS="--experimental-vm-modules" ./node_modules/.bin/jest --no-color',
            "docker_specs": {
                "node_version": "18.20.4",
            },
        }
        for k in ['1.0', '1.1', '1.2', '2.0']
    }
}
# The 1.x versions use Node 8.17.0, an npm-based install and plain jest.
for v in ['1.0', '1.1', '1.2']:
    SPECS_REACT_PDF[v]["docker_specs"]["node_version"] = "8.17.0"
    SPECS_REACT_PDF[v]["install"] = [
        "npm install",
        # NOTE(review): "[email protected]" is not a valid npm package spec.
        # It looks like a "<package>@<version>" argument that was replaced by
        # an e-mail-obfuscation placeholder when this text was captured
        # (possibly a pinned cheerio release). Restore the real value from the
        # upstream repository before using this command.
        "npm install [email protected]",
    ]
    SPECS_REACT_PDF[v]["test_cmd"] = "./node_modules/.bin/jest --no-color"

# Maps "owner/repo" to that repository's {version: spec} table defined in
# this module.
MAP_REPO_VERSION_TO_SPECS_JS = {
    "Automattic/wp-calypso": SPECS_CALYPSO,
    "chartjs/Chart.js": SPECS_CHART_JS,
    "markedjs/marked": SPECS_MARKED,
    "processing/p5.js": SPECS_P5_JS,
    "diegomura/react-pdf": SPECS_REACT_PDF,
}

# Constants - Repository Specific Installation Instructions
# Intentionally empty: no JavaScript repository has an entry here.
MAP_REPO_TO_INSTALL_JS = {}
Loading

0 comments on commit d83e100

Please sign in to comment.