diff --git a/setup.py b/setup.py index 5fa82e5e..f9467af6 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,10 @@ 'jedi', 'tenacity', ], + 'test': [ + 'pytest', + 'pytest-cov', + ] }, include_package_data=True, ) \ No newline at end of file diff --git a/swebench/__init__.py b/swebench/__init__.py index 10b51570..e1216c6d 100644 --- a/swebench/__init__.py +++ b/swebench/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.1.7" +__version__ = "3.0.0" from swebench.collect.build_dataset import main as build_dataset from swebench.collect.get_tasks_pipeline import main as get_tasks_pipeline @@ -48,8 +48,7 @@ ) from swebench.harness.utils import ( - get_environment_yml, - get_requirements, + run_threadpool, ) from swebench.versioning.constants import ( @@ -59,9 +58,9 @@ from swebench.versioning.get_versions import ( get_version, - map_version_to_task_instances, get_versions_from_build, get_versions_from_web, + map_version_to_task_instances, ) from swebench.versioning.utils import ( diff --git a/swebench/harness/constants/__init__.py b/swebench/harness/constants/__init__.py new file mode 100644 index 00000000..ad4dcc8b --- /dev/null +++ b/swebench/harness/constants/__init__.py @@ -0,0 +1,21 @@ +from swebench.harness.constants.constants import * +from swebench.harness.constants.javascript import * +from swebench.harness.constants.python import * + +MAP_REPO_VERSION_TO_SPECS = { + **MAP_REPO_VERSION_TO_SPECS_JS, + **MAP_REPO_VERSION_TO_SPECS_PY, +} + +MAP_REPO_TO_INSTALL = { + **MAP_REPO_TO_INSTALL_JS, + **MAP_REPO_TO_INSTALL_PY, +} + +MAP_REPO_TO_EXT = { + **{k: "js" for k in MAP_REPO_VERSION_TO_SPECS_JS.keys()}, + **{k: "py" for k in MAP_REPO_VERSION_TO_SPECS_PY.keys()}, +} + +LATEST = "latest" +USE_X86 = USE_X86_PY diff --git a/swebench/harness/constants/constants.py b/swebench/harness/constants/constants.py new file mode 100644 index 00000000..39a7524a --- /dev/null +++ b/swebench/harness/constants/constants.py @@ -0,0 +1,113 @@ +from enum import Enum +from pathlib import Path +from typing import TypedDict + +# Constants - Evaluation Log Directories +BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base") +ENV_IMAGE_BUILD_DIR = Path("logs/build_images/env") +INSTANCE_IMAGE_BUILD_DIR = Path("logs/build_images/instances") +RUN_EVALUATION_LOG_DIR = Path("logs/run_evaluation") +RUN_VALIDATION_LOG_DIR = Path("logs/run_validation") + +# Constants - Task Instance Class +class SWEbenchInstance(TypedDict): + repo: str + instance_id: str + base_commit: str + patch: str + test_patch: str + problem_statement: str + hints_text: str + created_at: str + version: str + FAIL_TO_PASS: str + PASS_TO_PASS: str + environment_setup_commit: str + +# Constants - Test Types, Statuses, Commands +FAIL_TO_PASS = "FAIL_TO_PASS" +FAIL_TO_FAIL = "FAIL_TO_FAIL" +PASS_TO_PASS = "PASS_TO_PASS" +PASS_TO_FAIL = "PASS_TO_FAIL" + +class ResolvedStatus(Enum): + NO = "RESOLVED_NO" + PARTIAL = "RESOLVED_PARTIAL" + FULL = "RESOLVED_FULL" + +class TestStatus(Enum): + FAILED = "FAILED" + PASSED = "PASSED" + SKIPPED = "SKIPPED" + ERROR = "ERROR" + XFAIL = "XFAIL" + +class EvalType(Enum): + PASS_AND_FAIL = "pass_and_fail" + FAIL_ONLY = "fail_only" + +# Constants - Evaluation Keys +KEY_INSTANCE_ID = "instance_id" +KEY_MODEL = "model_name_or_path" +KEY_PREDICTION = "model_patch" + +# Constants - Harness +DOCKER_PATCH = "/tmp/patch.diff" +DOCKER_USER = "root" +DOCKER_WORKDIR = "/testbed" +LOG_REPORT = "report.json" +LOG_INSTANCE = "run_instance.log" +LOG_TEST_OUTPUT = "test_output.txt" +UTF8 = "utf-8" + +# Constants - Logging +APPLY_PATCH_FAIL = 
">>>>> Patch Apply Failed" +APPLY_PATCH_PASS = ">>>>> Applied Patch" +INSTALL_FAIL = ">>>>> Init Failed" +INSTALL_PASS = ">>>>> Init Succeeded" +INSTALL_TIMEOUT = ">>>>> Init Timed Out" +RESET_FAILED = ">>>>> Reset Failed" +TESTS_ERROR = ">>>>> Tests Errored" +TESTS_FAILED = ">>>>> Some Tests Failed" +TESTS_PASSED = ">>>>> All Tests Passed" +TESTS_TIMEOUT = ">>>>> Tests Timed Out" +START_TEST_OUTPUT = ">>>>> Start Test Output" +END_TEST_OUTPUT = ">>>>> End Test Output" + +# Constants - Patch Types +class PatchType(Enum): + PATCH_GOLD = "gold" + PATCH_PRED = "pred" + PATCH_PRED_TRY = "pred_try" + PATCH_PRED_MINIMAL = "pred_minimal" + PATCH_PRED_MINIMAL_TRY = "pred_minimal_try" + PATCH_TEST = "test" + + def __str__(self): + return self.value + +# Constants - Miscellaneous +NON_TEST_EXTS = [ + ".json", + ".png", + "csv", + ".txt", + ".md", + ".jpg", + ".jpeg", + ".pkl", + ".yml", + ".yaml", + ".toml", +] +SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" +DEFAULT_DOCKER_SPECS = { + "pnpm_version": "9.5.0", + "node_version": "21.6.2", + "python_version": "3.9", +} +FAIL_ONLY_REPOS = { + "chartjs/Chart.js", + "processing/p5.js", + "markedjs/marked", +} diff --git a/swebench/harness/constants/javascript.py b/swebench/harness/constants/javascript.py new file mode 100644 index 00000000..59b486a1 --- /dev/null +++ b/swebench/harness/constants/javascript.py @@ -0,0 +1,161 @@ +# Constants - Commonly Used Commands +TEST_XVFB_PREFIX = 'xvfb-run --server-args="-screen 0 1280x1024x24 -ac :99"' +XVFB_DEPS = [ + "python3", "python3-pip", "xvfb", "x11-xkb-utils", "xfonts-100dpi", + "xfonts-75dpi", "xfonts-scalable", "xfonts-cyrillic", "x11-apps", "firefox" +] +X11_DEPS = [ + "libx11-xcb1", "libxcomposite1", "libxcursor1", "libxdamage1", "libxi6", + "libxtst6", "libnss3", "libcups2", "libxss1", "libxrandr2", "libasound2", + "libatk1.0-0", "libgtk-3-0", "x11-utils", +] + +# Constants - Task Instance Installation Environment +SPECS_CALYPSO = { + **{k: { + "apt-pkgs": ["libsass-dev", "sassc"], + "install": ["npm install --unsafe-perm"], + "test_cmd": "npm run test-client", + "docker_specs": { + "node_version": k, + } + } for k in [ + '0.8', + '4.2.3', '4.3.0', + '5.10.1', '5.11.1', + '6.1.0', '6.7.0', '6.9.0', '6.9.1', '6.9.4', '6.10.0', '6.10.2', '6.10.3', '6.11.1', '6.11.2', '6.11.5', + '8.9.1', '8.9.3', '8.9.4', '8.11.0', '8.11.2', + '10.4.1', '10.5.0', '10.6.0', '10.9.0', '10.10.0', '10.12.0', '10.13.0', '10.14.0', '10.15.2', '10.16.3', + ]} +} + +TEST_CHART_JS_TEMPLATE = "./node_modules/.bin/cross-env NODE_ENV=test ./node_modules/.bin/karma start {} --single-run --coverage --grep --auto-watch false" +SPECS_CHART_JS = { + **{k: { + "install": [ + "pnpm install", + "pnpm run build", + ], + "test_cmd": [ + "pnpm install", + "pnpm run build", + f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.cjs')}\"" + ], + "docker_specs": { + "node_version": "21.6.2", + "pnpm_version": "7.9.0", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + }, + } for k in ['4.0', '4.1', '4.2', '4.3', '4.4']}, + **{k: { + "install": ["npm install"], + "test_cmd": [ + "npm install", + "npm run build", + f"{TEST_XVFB_PREFIX} su chromeuser -c \"{TEST_CHART_JS_TEMPLATE.format('./karma.conf.js')}\"" + ], + "docker_specs": { + "node_version": "21.6.2", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + } + } for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8']}, + **{k: { + "install": [ + "npm install", + "npm install -g gulp-cli" + ], + "test_cmd": [ + "npm install", + "gulp 
build", + TEST_XVFB_PREFIX + ' su chromeuser -c "gulp test"' + ], + "docker_specs": { + "node_version": "21.6.2", + "run_args": { + "cap_add": ["SYS_ADMIN"], + } + } + } for k in ['2.0', '2.1', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9']} +} +for v in SPECS_CHART_JS.keys(): + SPECS_CHART_JS[v]["apt-pkgs"] = XVFB_DEPS + +SPECS_MARKED = { + **{k: { + "install": ["npm install"], + "test_cmd": "./node_modules/.bin/jasmine --no-color --config=jasmine.json", + "docker_specs": { + "node_version": "12.22.12", + } + } for k in [ + '0.3', '0.5', '0.6', '0.7', '1.0', '1.1', + '1.2', '2.0', '3.9', '4.0', '4.1', '5.0' + ]} +} +for v in ['4.0', '4.1', '5.0']: + SPECS_MARKED[v]["docker_specs"]["node_version"] = "20.16.0" + +SPECS_P5_JS = { + **{k: { + "apt-pkgs": X11_DEPS, + "install": [ + "npm install", + "PUPPETEER_SKIP_CHROMIUM_DOWNLOAD='' node node_modules/puppeteer/install.js", + "./node_modules/.bin/grunt yui", + ], + "test_cmd": ( + """sed -i 's/concurrency:[[:space:]]*[0-9][0-9]*/concurrency: 1/g' Gruntfile.js\n""" + "stdbuf -o 1M ./node_modules/.bin/grunt test --quiet --force" + ), + "docker_specs": { + "node_version": "14.17.3", + } + } for k in [ + "0.10", "0.2", "0.4", "0.5", "0.6", "0.7", "0.8", "0.9", + "1.0", "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7", + "1.8", "1.9", + ] + }, +} +for k in ['0.4', '0.5', '0.6',]: + SPECS_P5_JS[k]["install"] = [ + "npm install", + "./node_modules/.bin/grunt yui", + ] + +SPECS_REACT_PDF = { + **{k: { + "apt-pkgs": ["pkg-config", "build-essential", "libpixman-1-0", "libpixman-1-dev", "libcairo2-dev", "libpango1.0-dev", + "libjpeg-dev", "libgif-dev", "librsvg2-dev"] + X11_DEPS, + "install": [ + "npm i -g yarn", + "yarn install" + ], + "test_cmd": 'NODE_OPTIONS="--experimental-vm-modules" ./node_modules/.bin/jest --no-color', + "docker_specs": { + "node_version": "18.20.4" + } + } for k in ['1.0', '1.1', '1.2', '2.0']} +} +for v in ['1.0', '1.1', '1.2']: + SPECS_REACT_PDF[v]["docker_specs"]["node_version"] = "8.17.0" + SPECS_REACT_PDF[v]["install"] = [ + "npm install", + "npm install cheerio@1.0.0-rc.3" + ] + SPECS_REACT_PDF[v]["test_cmd"] = "./node_modules/.bin/jest --no-color" + +MAP_REPO_VERSION_TO_SPECS_JS = { + "Automattic/wp-calypso": SPECS_CALYPSO, + "chartjs/Chart.js": SPECS_CHART_JS, + "markedjs/marked": SPECS_MARKED, + "processing/p5.js": SPECS_P5_JS, + "diegomura/react-pdf": SPECS_REACT_PDF, +} + +# Constants - Repository Specific Installation Instructions +MAP_REPO_TO_INSTALL_JS = {} diff --git a/swebench/harness/constants.py b/swebench/harness/constants/python.py similarity index 94% rename from swebench/harness/constants.py rename to swebench/harness/constants/python.py index cc6e8252..c37d27f5 100644 --- a/swebench/harness/constants.py +++ b/swebench/harness/constants/python.py @@ -1,47 +1,4 @@ -from enum import Enum -from pathlib import Path -from typing import TypedDict - -# Constants - Evaluation Log Directories -BASE_IMAGE_BUILD_DIR = Path("logs/build_images/base") -ENV_IMAGE_BUILD_DIR = Path("logs/build_images/env") -INSTANCE_IMAGE_BUILD_DIR = Path("logs/build_images/instances") -RUN_EVALUATION_LOG_DIR = Path("logs/run_evaluation") - -# Constants - Task Instance Class -class SWEbenchInstance(TypedDict): - repo: str - instance_id: str - base_commit: str - patch: str - test_patch: str - problem_statement: str - hints_text: str - created_at: str - version: str - FAIL_TO_PASS: str - PASS_TO_PASS: str - environment_setup_commit: str - - -# Constants - Test Types, Statuses, Commands -FAIL_TO_PASS = "FAIL_TO_PASS" -FAIL_TO_FAIL = 
"FAIL_TO_FAIL" -PASS_TO_PASS = "PASS_TO_PASS" -PASS_TO_FAIL = "PASS_TO_FAIL" - -class ResolvedStatus(Enum): - NO = "RESOLVED_NO" - PARTIAL = "RESOLVED_PARTIAL" - FULL = "RESOLVED_FULL" - -class TestStatus(Enum): - FAILED = "FAILED" - PASSED = "PASSED" - SKIPPED = "SKIPPED" - ERROR = "ERROR" - XFAIL = "XFAIL" - +# Constants - Testing Commands TEST_PYTEST = "pytest --no-header -rA --tb=no -p no:cacheprovider" TEST_PYTEST_VERBOSE = "pytest -rA --tb=long -p no:cacheprovider" TEST_ASTROPY_PYTEST = "pytest -rA -vv -o console_output_style=classic --tb=no" @@ -912,7 +869,7 @@ class TestStatus(Enum): SPECS_HUMANEVAL = {k: {"python": "3.9", "test_cmd": "python"} for k in ["1.0"]} # Constants - Task Instance Instllation Environment -MAP_REPO_VERSION_TO_SPECS = { +MAP_REPO_VERSION_TO_SPECS_PY = { "astropy/astropy": SPECS_ASTROPY, "dbt-labs/dbt-core": SPECS_DBT_CORE, "django/django": SPECS_DJANGO, @@ -936,7 +893,7 @@ class TestStatus(Enum): } # Constants - Repository Specific Installation Instructions -MAP_REPO_TO_INSTALL = {} +MAP_REPO_TO_INSTALL_PY = {} # Constants - Task Instance Requirements File Paths @@ -961,65 +918,7 @@ class TestStatus(Enum): "pydata/xarray": ["ci/requirements/environment.yml", "environment.yml"], } - -# Constants - Evaluation Keys -KEY_INSTANCE_ID = "instance_id" -KEY_MODEL = "model_name_or_path" -KEY_PREDICTION = "model_patch" - - -# Constants - Harness -DOCKER_PATCH = "/tmp/patch.diff" -DOCKER_USER = "root" -DOCKER_WORKDIR = "/testbed" -LOG_REPORT = "report.json" -LOG_INSTANCE = "run_instance.log" -LOG_TEST_OUTPUT = "test_output.txt" -UTF8 = "utf-8" - - -# Constants - Logging -APPLY_PATCH_FAIL = ">>>>> Patch Apply Failed" -APPLY_PATCH_PASS = ">>>>> Applied Patch" -INSTALL_FAIL = ">>>>> Init Failed" -INSTALL_PASS = ">>>>> Init Succeeded" -INSTALL_TIMEOUT = ">>>>> Init Timed Out" -RESET_FAILED = ">>>>> Reset Failed" -TESTS_ERROR = ">>>>> Tests Errored" -TESTS_FAILED = ">>>>> Some Tests Failed" -TESTS_PASSED = ">>>>> All Tests Passed" -TESTS_TIMEOUT = ">>>>> Tests Timed Out" - - -# Constants - Patch Types -class PatchType(Enum): - PATCH_GOLD = "gold" - PATCH_PRED = "pred" - PATCH_PRED_TRY = "pred_try" - PATCH_PRED_MINIMAL = "pred_minimal" - PATCH_PRED_MINIMAL_TRY = "pred_minimal_try" - PATCH_TEST = "test" - - def __str__(self): - return self.value - - -# Constants - Miscellaneous -NON_TEST_EXTS = [ - ".json", - ".png", - "csv", - ".txt", - ".md", - ".jpg", - ".jpeg", - ".pkl", - ".yml", - ".yaml", - ".toml", -] -SWE_BENCH_URL_RAW = "https://raw.githubusercontent.com/" -USE_X86 = { +USE_X86_PY = { "astropy__astropy-7973", "django__django-10087", "django__django-10097", diff --git a/swebench/harness/docker_build.py b/swebench/harness/docker_build.py index 58cadb4b..affc73de 100644 --- a/swebench/harness/docker_build.py +++ b/swebench/harness/docker_build.py @@ -5,28 +5,27 @@ import traceback import docker import docker.errors -from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor, as_completed + from pathlib import Path from swebench.harness.constants import ( - DOCKER_USER, BASE_IMAGE_BUILD_DIR, + DOCKER_USER, ENV_IMAGE_BUILD_DIR, INSTANCE_IMAGE_BUILD_DIR, - MAP_REPO_VERSION_TO_SPECS, UTF8, ) -from swebench.harness.test_spec import ( - get_test_specs_from_dataset, - make_test_spec, - TestSpec -) from swebench.harness.docker_utils import ( cleanup_container, remove_image, find_dependent_images ) +from swebench.harness.test_spec.test_spec import ( + get_test_specs_from_dataset, + make_test_spec, + TestSpec, +) +from swebench.harness.utils import run_threadpool 
ansi_escape = re.compile(r"\x1B\[[0-?]*[ -/]*[@-~]") @@ -280,44 +279,18 @@ def build_env_images( return [], [] print(f"Total environment images to build: {len(configs_to_build)}") - # Build the environment images - successful, failed = list(), list() - with tqdm( - total=len(configs_to_build), smoothing=0, desc="Building environment images" - ) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for each image to build - futures = { - executor.submit( - build_image, - image_name, - {"setup_env.sh": config["setup_script"]}, - config["dockerfile"], - config["platform"], - client, - ENV_IMAGE_BUILD_DIR / image_name.replace(":", "__"), - ): image_name - for image_name, config in configs_to_build.items() - } - - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if image built successfully - future.result() - successful.append(futures[future]) - except BuildImageError as e: - print(f"BuildImageError {e.image_name}") - traceback.print_exc() - failed.append(futures[future]) - continue - except Exception: - print("Error building image") - traceback.print_exc() - failed.append(futures[future]) - continue - + args_list = list() + for image_name, config in configs_to_build.items(): + args_list.append(( + image_name, + {"setup_env.sh": config["setup_script"]}, + config["dockerfile"], + config["platform"], + client, + ENV_IMAGE_BUILD_DIR / image_name.replace(":", "__"), + )) + + successful, failed = run_threadpool(build_image, args_list, max_workers) # Show how many images failed to build if len(failed) == 0: print("All environment images built successfully.") @@ -332,7 +305,9 @@ def build_instance_images( client: docker.DockerClient, dataset: list, force_rebuild: bool = False, - max_workers: int = 4 + max_workers: int = 4, + namespace: str = None, + tag: str = None, ): """ Builds the instance images required for the dataset if they do not already exist. 
@@ -344,7 +319,7 @@ def build_instance_images( max_workers (int): Maximum number of workers to use for building images """ # Build environment images (and base images as needed) first - test_specs = list(map(make_test_spec, dataset)) + test_specs = list(map(lambda x: make_test_spec(x, namespace=namespace, instance_image_tag=tag), dataset)) if force_rebuild: for spec in test_specs: remove_image(client, spec.instance_image_key, "quiet") @@ -357,42 +332,11 @@ print(f"Skipping {len(dont_run_specs)} instances - due to failed env image builds") print(f"Building instance images for {len(test_specs)} instances") successful, failed = list(), list() - + + # `logger` is set to None b/c logger is created in build_instance_image + payloads = [(spec, client, None, False) for spec in test_specs] # Build the instance images - with tqdm( - total=len(test_specs), smoothing=0, desc="Building instance images" - ) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for each image to build - futures = { - executor.submit( - build_instance_image, - test_spec, - client, - None, # logger is created in build_instance_image, don't make loggers before you need them - False, - ): test_spec - for test_spec in test_specs - } - - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if image built successfully - future.result() - successful.append(futures[future]) - except BuildImageError as e: - print(f"BuildImageError {e.image_name}") - traceback.print_exc() - failed.append(futures[future]) - continue - except Exception: - print("Error building image") - traceback.print_exc() - failed.append(futures[future]) - continue - + successful, failed = run_threadpool(build_instance_image, payloads, max_workers) # Show how many images failed to build if len(failed) == 0: print("All instance images built successfully.") @@ -432,7 +376,7 @@ def build_instance_image( # Check that the env. 
image the instance image is based on exists try: - client.images.get(env_image_name) + env_image = client.images.get(env_image_name) except docker.errors.ImageNotFound as e: raise BuildImageError( test_spec.instance_id, @@ -494,25 +438,34 @@ def build_container( # Build corresponding instance image if force_rebuild: remove_image(client, test_spec.instance_image_key, "quiet") - build_instance_image(test_spec, client, logger, nocache) + if not test_spec.is_remote_image: + build_instance_image(test_spec, client, logger, nocache) + else: + try: + client.images.get(test_spec.instance_image_key) + except docker.errors.ImageNotFound: + try: + client.images.pull(test_spec.instance_image_key) + except docker.errors.NotFound as e: + raise BuildImageError(test_spec.instance_id, str(e), logger) from e container = None try: - # Get configurations for how container should be created - config = MAP_REPO_VERSION_TO_SPECS[test_spec.repo][test_spec.version] - user = DOCKER_USER if not config.get("execute_test_as_nonroot", False) else "nonroot" - nano_cpus = config.get("nano_cpus") - # Create the container logger.info(f"Creating container for {test_spec.instance_id}...") + + # Define arguments for running the container + run_args = test_spec.docker_specs.get("run_args", {}) + cap_add = run_args.get("cap_add", []) + container = client.containers.create( image=test_spec.instance_image_key, name=test_spec.get_instance_container_name(run_id), - user=user, + user=DOCKER_USER, detach=True, command="tail -f /dev/null", - nano_cpus=nano_cpus, platform=test_spec.platform, + cap_add=cap_add, ) logger.info(f"Container for {test_spec.instance_id} created: {container.id}") return container diff --git a/swebench/harness/docker_utils.py b/swebench/harness/docker_utils.py index 2a1dddf2..c8ebd758 100644 --- a/swebench/harness/docker_utils.py +++ b/swebench/harness/docker_utils.py @@ -307,6 +307,8 @@ def should_remove( Determine if an image should be removed based on cache level and clean flag. 
""" existed_before = image_name in prior_images + if '/' in image_name: + image_name = image_name.split('/', 1)[-1] if image_name.startswith("sweb.base"): if cache_level in {"none"} and (clean or not existed_before): return True diff --git a/swebench/harness/dockerfiles/__init__.py b/swebench/harness/dockerfiles/__init__.py new file mode 100644 index 00000000..a0583521 --- /dev/null +++ b/swebench/harness/dockerfiles/__init__.py @@ -0,0 +1,51 @@ +from swebench.harness.dockerfiles.javascript import ( + _DOCKERFILE_BASE_JS, + _DOCKERFILE_ENV_JS, + _DOCKERFILE_INSTANCE_JS, +) + +from swebench.harness.dockerfiles.python import ( + _DOCKERFILE_BASE_PY, + _DOCKERFILE_ENV_PY, + _DOCKERFILE_INSTANCE_PY, +) + +_DOCKERFILE_BASE = { + "py": _DOCKERFILE_BASE_PY, + "js": _DOCKERFILE_BASE_JS, +} + +_DOCKERFILE_ENV = { + "py": _DOCKERFILE_ENV_PY, + "js": _DOCKERFILE_ENV_JS, +} + +_DOCKERFILE_INSTANCE = { + "py": _DOCKERFILE_INSTANCE_PY, + "js": _DOCKERFILE_INSTANCE_JS, +} + +def get_dockerfile_base(platform, arch, language): + if arch == "arm64": + conda_arch = "aarch64" + else: + conda_arch = arch + return _DOCKERFILE_BASE[language].format( + platform=platform, + conda_arch=conda_arch + ) + + +def get_dockerfile_env(platform, arch, language, **kwargs): + return _DOCKERFILE_ENV[language].format( + platform=platform, + arch=arch, + **kwargs + ) + + +def get_dockerfile_instance(platform, language, env_image_name): + return _DOCKERFILE_INSTANCE[language].format( + platform=platform, + env_image_name=env_image_name + ) diff --git a/swebench/harness/dockerfiles/javascript.py b/swebench/harness/dockerfiles/javascript.py new file mode 100644 index 00000000..fa2235a4 --- /dev/null +++ b/swebench/harness/dockerfiles/javascript.py @@ -0,0 +1,135 @@ +_DOCKERFILE_BASE_JS = r""" +FROM --platform={platform} ubuntu:22.04 + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC +RUN rm /bin/sh && ln -s /bin/bash /bin/sh + +# Install necessary packages +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + git \ + libssl-dev \ + software-properties-common \ + wget \ + gnupg \ + jq \ + ca-certificates \ + dbus \ + ffmpeg \ + imagemagick \ + && apt-get -y autoclean \ + && rm -rf /var/lib/apt/lists/* + +# Install Chrome +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ + && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update \ + && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg \ + fonts-khmeros fonts-kacst fonts-freefont-ttf libxss1 dbus dbus-x11 \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +# Install NVM +ENV NVM_DIR /usr/local/nvm + +RUN mkdir -p $NVM_DIR +RUN curl --silent -o- https://raw.githubusercontent.com/creationix/nvm/v0.39.3/install.sh | bash + +# Install necessary libraries for Chrome +RUN apt-get update && apt-get install -y \ + procps \ + libasound2 libatk-bridge2.0-0 libatk1.0-0 libcups2 libdrm2 \ + libgbm1 libgconf-2-4 libgdk-pixbuf2.0-0 libgtk-3-0 libnspr4 \ + libnss3 libpango-1.0-0 libpangocairo-1.0-0 libxcomposite1 \ + libxdamage1 libxfixes3 libxkbcommon0 libxrandr2 libxss1 libxshmfence1 libglu1 \ + && apt-get -y autoclean \ + && rm -rf /var/lib/apt/lists/* + +# Set up Chrome for running in a container +ENV CHROME_BIN /usr/bin/google-chrome +RUN echo "CHROME_BIN=$CHROME_BIN" >> /etc/environment + +# Set DBUS for Chrome +RUN mkdir -p /run/dbus +ENV 
DBUS_SESSION_BUS_ADDRESS="unix:path=/run/dbus/system_bus_socket" +RUN dbus-daemon --system --fork + +# If puppeteer is used, make it use the installed Chrome, not download its own +ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true + +# Fix for PhantomJS runs (used by older task instances) +ENV OPENSSL_CONF /etc/ssl + +# Add a non-root user to run Chrome +RUN useradd -m chromeuser +USER chromeuser +WORKDIR /home/chromeuser + +# Switch back to root for any further commands +USER root +""" + +_DOCKERFILE_ENV_JS = r"""FROM --platform={platform} sweb.base.js.{arch}:latest + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Etc/UTC + +COPY ./setup_env.sh /root/ +RUN sed -i -e 's/\r$//' /root/setup_env.sh +RUN chmod +x /root/setup_env.sh + +# Install Node +ENV NODE_VERSION {node_version} +RUN source $NVM_DIR/nvm.sh \ + && nvm install $NODE_VERSION \ + && nvm alias default $NODE_VERSION \ + && nvm use default + +# Install Python +RUN add-apt-repository ppa:deadsnakes/ppa && apt-get update && apt-get install -y python{python_version} +RUN ln -s /usr/bin/python{python_version} /usr/bin/python + +# Install Python2 +RUN apt-get install -y python2 + +# Set up environment variables for Node +ENV NODE_PATH $NVM_DIR/v$NODE_VERSION/lib/node_modules +ENV PATH $NVM_DIR/versions/node/v$NODE_VERSION/bin:$PATH +RUN echo "PATH=$PATH:/usr/local/nvm/versions/node/$NODE_VERSION/bin/node" >> /etc/environment + +# Install pnpm +ENV PNPM_VERSION {pnpm_version} +ENV PNPM_HOME /usr/local/pnpm +ENV PATH $PNPM_HOME:$PATH + +RUN mkdir -p $PNPM_HOME && \ + wget -qO $PNPM_HOME/pnpm "https://github.com/pnpm/pnpm/releases/download/v$PNPM_VERSION/pnpm-linux-x64" && \ + chmod +x $PNPM_HOME/pnpm && \ + ln -s $PNPM_HOME/pnpm /usr/local/bin/pnpm + +RUN echo "export PNPM_HOME=$PNPM_HOME" >> /etc/profile && \ + echo "export PATH=\$PNPM_HOME:\$PATH" >> /etc/profile + +# Run the setup script +RUN /bin/bash -c "source ~/.bashrc && /root/setup_env.sh" +RUN node -v +RUN npm -v +RUN pnpm -v +RUN python -V +RUN python2 -V + +WORKDIR /testbed/ +""" + +_DOCKERFILE_INSTANCE_JS = r"""FROM --platform={platform} {env_image_name} + +COPY ./setup_repo.sh /root/ +RUN sed -i -e 's/\r$//' /root/setup_repo.sh +RUN node -v +RUN npm -v +RUN /bin/bash /root/setup_repo.sh + +WORKDIR /testbed/ +""" diff --git a/swebench/harness/dockerfiles.py b/swebench/harness/dockerfiles/python.py similarity index 51% rename from swebench/harness/dockerfiles.py rename to swebench/harness/dockerfiles/python.py index b192d0d2..274b123c 100644 --- a/swebench/harness/dockerfiles.py +++ b/swebench/harness/dockerfiles/python.py @@ -1,5 +1,4 @@ -# IF you change the base image, you need to rebuild all images (run with --force_rebuild) -_DOCKERFILE_BASE = r""" +_DOCKERFILE_BASE_PY = r""" FROM --platform={platform} ubuntu:22.04 ARG DEBIAN_FRONTEND=noninteractive @@ -22,7 +21,7 @@ && rm -rf /var/lib/apt/lists/* # Download and install conda -RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-{conda_version}-Linux-{conda_arch}.sh' -O miniconda.sh \ +RUN wget 'https://repo.anaconda.com/miniconda/Miniconda3-py311_23.11.0-2-Linux-{conda_arch}.sh' -O miniconda.sh \ && bash miniconda.sh -b -p /opt/miniconda3 # Add conda to PATH ENV PATH=/opt/miniconda3/bin:$PATH @@ -33,7 +32,7 @@ RUN adduser --disabled-password --gecos 'dog' nonroot """ -_DOCKERFILE_ENV = r"""FROM --platform={platform} sweb.base.{arch}:latest +_DOCKERFILE_ENV_PY = r"""FROM --platform={platform} sweb.base.py.{arch}:latest COPY ./setup_env.sh /root/ RUN sed -i -e 's/\r$//' /root/setup_env.sh @@ -46,7 +45,7 @@ RUN echo "source 
/opt/miniconda3/etc/profile.d/conda.sh && conda activate testbed" > /root/.bashrc """ -_DOCKERFILE_INSTANCE = r"""FROM --platform={platform} {env_image_name} +_DOCKERFILE_INSTANCE_PY = r"""FROM --platform={platform} {env_image_name} COPY ./setup_repo.sh /root/ RUN sed -i -e 's/\r$//' /root/setup_repo.sh @@ -54,22 +53,3 @@ WORKDIR /testbed/ """ - - -def get_dockerfile_base(platform, arch, conda_version=None): - if arch == "arm64": - conda_arch = "aarch64" - else: - conda_arch = arch - if conda_version == None: - # Default conda version (from initial SWE-bench release) - conda_version = "py311_23.11.0-2" - return _DOCKERFILE_BASE.format(platform=platform, conda_arch=conda_arch, conda_version=conda_version) - - -def get_dockerfile_env(platform, arch): - return _DOCKERFILE_ENV.format(platform=platform, arch=arch) - - -def get_dockerfile_instance(platform, env_image_name): - return _DOCKERFILE_INSTANCE.format(platform=platform, env_image_name=env_image_name) diff --git a/swebench/harness/grading.py b/swebench/harness/grading.py index 742a2e69..2ee02a95 100644 --- a/swebench/harness/grading.py +++ b/swebench/harness/grading.py @@ -3,21 +3,24 @@ from swebench.harness.constants import ( APPLY_PATCH_FAIL, - APPLY_PATCH_PASS, + END_TEST_OUTPUT, FAIL_TO_FAIL, FAIL_TO_PASS, KEY_INSTANCE_ID, KEY_PREDICTION, + MAP_REPO_VERSION_TO_SPECS, PASS_TO_FAIL, PASS_TO_PASS, RESET_FAILED, + START_TEST_OUTPUT, TESTS_ERROR, TESTS_TIMEOUT, + EvalType, ResolvedStatus, TestStatus, ) -from swebench.harness.test_spec import TestSpec -from swebench.harness.log_parsers import MAP_REPO_TO_PARSER +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers import MAP_REPO_TO_PARSER, get_eval_type # MARK: Utility functions @@ -32,7 +35,7 @@ def test_failed(case: str, sm: dict[str, str]) -> bool: # MARK: Evaluation report functions -def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: +def get_logs_eval(test_spec: TestSpec, log_fp: str) -> tuple[dict[str, str], bool]: """ Retrieve evaluation results for a task instance from its corresponding log file @@ -44,41 +47,35 @@ def get_logs_eval(log_fp: str) -> tuple[dict[str, str], bool]: TODO(john-b-yang): Check this is working properly... """ - # Convert e.g. "logs/scikit-learn__scikit-learn-12421/test_output.txt" to "scikit-learn/scikit-learn" - sample_id = str(Path(log_fp).parent.stem) # e.g. scikit-learn__scikit-learn-12421 - repo = "-".join(sample_id.replace("__", "/").split("-")[:-1]) # e.g. 
scikit-learn/scikit-learn + repo = test_spec.repo + version = test_spec.version log_parser = MAP_REPO_TO_PARSER[repo] + test_cmd = MAP_REPO_VERSION_TO_SPECS[repo][version]["test_cmd"] + if isinstance(test_cmd, list): + test_cmd = test_cmd[-1] with open(log_fp) as f: content = f.read() # TODO fix constant here - if ( - any( - [ - x in content - for x in [ - APPLY_PATCH_FAIL, - RESET_FAILED, - TESTS_ERROR, - TESTS_TIMEOUT, - "Failed to reset task environment", - ] - ] - ) - or "applied patch" not in content.lower() - ): - # Eval patch was not applied successfully + bad_codes = list(filter(lambda x: x in content, [ + APPLY_PATCH_FAIL, RESET_FAILED, TESTS_ERROR, TESTS_TIMEOUT, + ])) + if bad_codes: + return {}, False + elif not (START_TEST_OUTPUT in content and END_TEST_OUTPUT in content): + # Test patch did not apply (should not happen at all) return {}, False # Get status map of evaluation results - content = content.split(f"{APPLY_PATCH_PASS} (pred)")[-1] - return log_parser(content), True + content = content.split(test_cmd)[-1] + return log_parser(content, test_spec), True def get_eval_tests_report( - eval_sm: dict[str, str], + eval_status_map: dict[str, str], gold_results: dict[str, str], calculate_to_fail: bool = False, + eval_type: EvalType = EvalType.PASS_AND_FAIL, ) -> dict[str, dict[str, list[str]]]: """ Create a report based on failure/pass change from gold results to eval results. @@ -102,24 +99,32 @@ def get_eval_tests_report( - Fail-Fail (F2F) + P: Success (Extra Credit) - Pass-Fail (P2F) + P: Not considered """ + def check_pass_and_fail(test_case, eval_status_map, success, failed): + if test_passed(test_case, eval_status_map): + # Assume silent success for now (test case not in eval_sm) + success.append(test_case) + elif test_failed(test_case, eval_status_map): + failed.append(test_case) + + def check_fail_only(test_case, eval_status_map, success, failed): + if test_case in eval_status_map and eval_status_map[test_case] == TestStatus.FAILED.value: + failed.append(test_case) + else: + success.append(test_case) + + check_test_case = check_pass_and_fail if eval_type == EvalType.PASS_AND_FAIL else check_fail_only + # Calculate resolution metrics f2p_success = [] f2p_failure = [] for test_case in gold_results[FAIL_TO_PASS]: - if test_passed(test_case, eval_sm): - # Assume silent success for now (test case not in eval_sm) - f2p_success.append(test_case) - elif test_failed(test_case, eval_sm): - f2p_failure.append(test_case) + check_test_case(test_case, eval_status_map, f2p_success, f2p_failure) # Calculate maintenance metrics p2p_success = [] p2p_failure = [] for test_case in gold_results[PASS_TO_PASS]: - if test_passed(test_case, eval_sm): - p2p_success.append(test_case) - elif test_failed(test_case, eval_sm): - p2p_failure.append(test_case) + check_test_case(test_case, eval_status_map, p2p_success, p2p_failure) results = { FAIL_TO_PASS: { @@ -139,17 +144,11 @@ def get_eval_tests_report( if calculate_to_fail: # Calculate "extra credit" metrics for test_case in gold_results[FAIL_TO_FAIL]: - if test_passed(test_case, eval_sm): - f2f_success.append(test_case) - elif test_failed(test_case, eval_sm): - f2f_failure.append(test_case) + check_test_case(test_case, eval_status_map, f2f_success, f2f_failure) # Calculate not considered metrics for test_case in gold_results[PASS_TO_FAIL]: - if test_passed(test_case, eval_sm): - p2f_success.append(test_case) - elif test_failed(test_case, eval_sm): - p2f_failure.append(test_case) + check_test_case(test_case, eval_status_map, p2f_success, 
p2f_failure) results.update( { @@ -210,7 +209,7 @@ def get_resolution_status(report: dict[str, dict[str, Any]]) -> str: def get_eval_report( test_spec: TestSpec, prediction: dict[str, str], - log_path: str, + test_log_path: str, include_tests_status: bool, ) -> dict[str, Any]: """ @@ -242,7 +241,7 @@ def get_eval_report( report_map[instance_id]["patch_exists"] = True # Get evaluation logs - eval_sm, found = get_logs_eval(log_path) + eval_status_map, found = get_logs_eval(test_spec, test_log_path) if not found: return report_map @@ -254,7 +253,7 @@ def get_eval_report( PASS_TO_PASS: test_spec.PASS_TO_PASS, } - report = get_eval_tests_report(eval_sm, eval_ref) + report = get_eval_tests_report(eval_status_map, eval_ref, eval_type=get_eval_type(test_spec)) if get_resolution_status(report) == ResolvedStatus.FULL.value: report_map[instance_id]["resolved"] = True diff --git a/swebench/harness/log_parsers/__init__.py b/swebench/harness/log_parsers/__init__.py new file mode 100644 index 00000000..295ef213 --- /dev/null +++ b/swebench/harness/log_parsers/__init__.py @@ -0,0 +1,8 @@ +from swebench.harness.log_parsers.javascript import MAP_REPO_TO_PARSER_JS +from swebench.harness.log_parsers.python import MAP_REPO_TO_PARSER_PY +from swebench.harness.log_parsers.utils import get_eval_type + +MAP_REPO_TO_PARSER = { + **MAP_REPO_TO_PARSER_JS, + **MAP_REPO_TO_PARSER_PY, +} \ No newline at end of file diff --git a/swebench/harness/log_parsers/javascript.py b/swebench/harness/log_parsers/javascript.py new file mode 100644 index 00000000..c861692a --- /dev/null +++ b/swebench/harness/log_parsers/javascript.py @@ -0,0 +1,186 @@ +import json +import re + +from swebench.harness.constants import ( + MAP_REPO_VERSION_TO_SPECS, + TestStatus, +) +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers.utils import ansi_escape + + +def parse_log_calypso(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by Calypso test suite + """ + test_status_map = {} + suite = [] + + get_test_name = lambda suite, match_pattern, line : " - ".join([ + " - ".join([x[0] for x in suite]), + re.match(match_pattern, line).group(1) + ]).strip() + + for log in log.split(" ./node_modules/.bin/jest ")[1:]: + for line in log.split("\n"): + if any([line.startswith(x) for x in [ + "Test Suites", + " ● " + ]]): + break + elif line.strip().startswith("✓"): + # Test passed + match_pattern = r"^\s+✓\s(.*)\(\d+ms\)$" \ + if re.search(r"\(\d+ms\)", line) is not None \ + else r"^\s+✓\s(.*)" + test_status_map[ + get_test_name(suite, match_pattern, line) + ] = TestStatus.PASSED.value + elif line.strip().startswith("✕"): + # Test failed + match_pattern = r"^\s+✕\s(.*)\(\d+ms\)$" \ + if re.search(r"\(\d+ms\)", line) is not None \ + else r"^\s+✕\s(.*)" + test_status_map[ + get_test_name(suite, match_pattern, line) + ] = TestStatus.FAILED.value + elif len(line) - len(line.lstrip()) > 0: + # Adjust suite name + indent = len(line) - len(line.lstrip()) + if len(suite) == 0: + # If suite is empty, initialize it + suite = [(line.strip(), indent)] + else: + while len(suite) > 0 and suite[-1][-1] >= indent: + # Pop until the last element with indent less than current indent + suite.pop() + suite.append([line.strip(), indent]) + + return test_status_map + + +def parse_log_chart_js(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by ChartJS test suite + """ + test_status_map = {} + failure_case_patterns = [ + (r"Chrome\s[\d\.]+\s\(.*?\)\s(.*)FAILED$", 
re.MULTILINE), + ] + for failure_case_pattern, flags in failure_case_patterns: + failures = re.findall(failure_case_pattern, log, flags) + if len(failures) == 0: + continue + for failure in failures: + test_status_map[failure] = TestStatus.FAILED.value + return test_status_map + + +def parse_log_marked(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by Marked test suite + """ + test_status_map = {} + for line in log.split("\n"): + if re.search(r"^\d+\)\s(.*)", line): + test = re.search(r"^\d+\)\s(.*)", line).group(1) + test_status_map[test.strip()] = TestStatus.FAILED.value + return test_status_map + + +def parse_log_p5js(log_content: str, test_spec: TestSpec) -> dict[str, str]: + def remove_json_blocks(log): + filtered_lines = [] + in_json_block = False + in_json_list_block = False + for line in log.split('\n'): + stripped_line = line.rstrip() # Remove trailing whitespace + if stripped_line.endswith('{'): + in_json_block = True + continue + if stripped_line.endswith('['): + in_json_list_block = True + continue + if stripped_line == '}' and in_json_block: + in_json_block = False + continue + if stripped_line == ']' and in_json_list_block: + in_json_list_block = False + continue + if in_json_block or in_json_list_block: + continue + if stripped_line.startswith('{') and stripped_line.endswith('}'): + continue + if stripped_line.startswith('[') and stripped_line.endswith(']'): + continue + filtered_lines.append(line) + return '\n'.join(filtered_lines) + + def remove_xml_blocks(log): + xml_pat = re.compile(r'<(\w+)>[\s\S]*?<\/\1>', re.MULTILINE) + match = xml_pat.search(log) + while match: + # count the number of opening tags in the match + opening_tags = match.group().count(rf'<{match.group(1)}>') - 1 + opening_tags = max(opening_tags, 0) + start = match.start() + end = match.end() + log = log[:start] + f'<{match.group(1)}>' * opening_tags + log[end:] + match = xml_pat.search(log) + return log + def is_valid_fail(match): + last_line_indent = 0 + for line in match.group(2).split('\n'): + line_indent = len(line) - len(line.lstrip()) + if line_indent <= last_line_indent: + return False + last_line_indent = line_indent + return True + + test_name_pat = re.compile(r'^(.*?)(?:\s*\(\d+(?:[A-Za-z]+)\))?$') + log_content = ansi_escape(log_content) + log_content = remove_json_blocks(log_content) + log_content = remove_xml_blocks(log_content) + test_results = {} + + # Parse failing tests + fail_pattern = re.compile(r'^\s*(\d+)\)(.{0,1000}?):', re.MULTILINE | re.DOTALL) + for match in fail_pattern.finditer(log_content): + if is_valid_fail(match): + test_names = list(map(str.strip, match.group(2).split('\n'))) + full_name = ":".join(test_names) + test_results[full_name] = TestStatus.FAILED.value + + return test_results + + +def parse_log_react_pdf(log: str, test_spec: TestSpec) -> dict[str, str]: + """ + Parser for test logs generated by react-pdf test suite + """ + test_status_map = {} + for line in log.split("\n"): + for pattern in [ + (r"^PASS\s(.*)\s\([\d\.]+ms\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)\s\([\d\.]+\ss\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)\s\([\d\.]+s\)", TestStatus.PASSED.value), + (r"^PASS\s(.*)", TestStatus.PASSED.value), + (r"^FAIL\s(.*)\s\([\d\.]+ms\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)\s\([\d\.]+\ss\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)\s\([\d\.]+s\)", TestStatus.FAILED.value), + (r"^FAIL\s(.*)", TestStatus.FAILED.value), + ]: + if re.search(pattern[0], line): + test_name = re.match(pattern[0], line).group(1) + 
test_status_map[test_name] = pattern[1] + break + return test_status_map + + +MAP_REPO_TO_PARSER_JS = { + "Automattic/wp-calypso": parse_log_calypso, + "chartjs/Chart.js": parse_log_chart_js, + "markedjs/marked": parse_log_marked, + "processing/p5.js": parse_log_p5js, + "diegomura/react-pdf": parse_log_react_pdf, +} diff --git a/swebench/harness/log_parsers.py b/swebench/harness/log_parsers/python.py similarity index 93% rename from swebench/harness/log_parsers.py rename to swebench/harness/log_parsers/python.py index 1d99aec2..a8c5a937 100644 --- a/swebench/harness/log_parsers.py +++ b/swebench/harness/log_parsers/python.py @@ -1,9 +1,11 @@ import re -from enum import Enum + from swebench.harness.constants import TestStatus +from swebench.harness.test_spec.test_spec import TestSpec +from swebench.harness.log_parsers.utils import ansi_escape -def parse_log_pytest(log: str) -> dict[str, str]: +def parse_log_pytest(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework @@ -25,7 +27,7 @@ def parse_log_pytest(log: str) -> dict[str, str]: return test_status_map -def parse_log_pytest_options(log: str) -> dict[str, str]: +def parse_log_pytest_options(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework with options @@ -56,7 +58,7 @@ def parse_log_pytest_options(log: str) -> dict[str, str]: return test_status_map -def parse_log_django(log: str) -> dict[str, str]: +def parse_log_django(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with Django tester framework @@ -132,7 +134,7 @@ def parse_log_django(log: str) -> dict[str, str]: return test_status_map -def parse_log_pytest_v2(log: str) -> dict[str, str]: +def parse_log_pytest_v2(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework (Later Version) @@ -157,11 +159,11 @@ def parse_log_pytest_v2(log: str) -> dict[str, str]: elif any([line.endswith(x.value) for x in TestStatus]): test_case = line.split() if len(test_case) >= 2: - test_status_map[test_case[0]] = test_case[1] + test_status_map[test_case[1]] = test_case[0] return test_status_map -def parse_log_seaborn(log: str) -> dict[str, str]: +def parse_log_seaborn(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with seaborn testing framework @@ -187,7 +189,7 @@ def parse_log_seaborn(log: str) -> dict[str, str]: return test_status_map -def parse_log_sympy(log: str) -> dict[str, str]: +def parse_log_sympy(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with Sympy framework @@ -205,9 +207,6 @@ def parse_log_sympy(log: str) -> dict[str, str]: for line in log.split("\n"): line = line.strip() if line.startswith("test_"): - if line.endswith("[FAIL]") or line.endswith("[OK]"): - line = line[: line.rfind("[")] - line = line.strip() if line.endswith(" E"): test = line.split()[0] test_status_map[test] = TestStatus.ERROR.value @@ -220,7 +219,7 @@ def parse_log_sympy(log: str) -> dict[str, str]: return test_status_map -def parse_log_matplotlib(log: str) -> dict[str, str]: +def parse_log_matplotlib(log: str, test_spec: TestSpec) -> dict[str, str]: """ Parser for test logs generated with PyTest framework @@ -261,7 +260,7 @@ def parse_log_matplotlib(log: str) -> dict[str, str]: parse_log_sphinx = parse_log_pytest_v2 -MAP_REPO_TO_PARSER = { +MAP_REPO_TO_PARSER_PY = { "astropy/astropy": parse_log_astropy, "django/django": parse_log_django, 
"marshmallow-code/marshmallow": parse_log_marshmallow, diff --git a/swebench/harness/log_parsers/utils.py b/swebench/harness/log_parsers/utils.py new file mode 100644 index 00000000..393210b2 --- /dev/null +++ b/swebench/harness/log_parsers/utils.py @@ -0,0 +1,20 @@ +import re +from swebench.harness.constants.constants import EvalType, FAIL_ONLY_REPOS +from swebench.harness.test_spec.test_spec import TestSpec + + +def ansi_escape(text: str) -> str: + """ + Remove ANSI escape sequences from text + """ + pattern = re.compile( + r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])', + re.VERBOSE, + ) + return pattern.sub('', text) + + +def get_eval_type(test_spec: TestSpec) -> str: + if test_spec.repo in FAIL_ONLY_REPOS: + return EvalType.FAIL_ONLY + return EvalType.PASS_AND_FAIL diff --git a/swebench/harness/prepare_images.py b/swebench/harness/prepare_images.py index 3d14fe81..5d798aee 100644 --- a/swebench/harness/prepare_images.py +++ b/swebench/harness/prepare_images.py @@ -6,7 +6,7 @@ from swebench.harness.constants import KEY_INSTANCE_ID from swebench.harness.docker_build import build_instance_images from swebench.harness.docker_utils import list_images -from swebench.harness.test_spec import make_test_spec +from swebench.harness.test_spec.test_spec import make_test_spec from swebench.harness.utils import load_swebench_dataset, str2bool @@ -14,7 +14,9 @@ def filter_dataset_to_build( dataset: list, instance_ids: list | None, client: docker.DockerClient, - force_rebuild: bool + force_rebuild: bool, + namespace: str = None, + tag: str = None, ): """ Filter the dataset to only include instances that need to be built. @@ -43,7 +45,7 @@ def filter_dataset_to_build( continue # Check if the instance needs to be built (based on force_rebuild flag and existing images) - spec = make_test_spec(instance) + spec = make_test_spec(instance, namespace=namespace, instance_image_tag=tag) if force_rebuild: data_to_build.append(instance) elif spec.instance_image_key not in existing_images: @@ -59,6 +61,8 @@ def main( max_workers, force_rebuild, open_file_limit, + namespace, + tag, ): """ Build Docker images for the specified instances. 
@@ -75,7 +79,7 @@ def main( # Filter out instances that were not specified dataset = load_swebench_dataset(dataset_name, split) - dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild) + dataset = filter_dataset_to_build(dataset, instance_ids, client, force_rebuild, namespace, tag) # Build images for remaining instances successful, failed = build_instance_images( @@ -83,6 +87,8 @@ def main( dataset=dataset, force_rebuild=force_rebuild, max_workers=max_workers, + namespace=namespace, + tag=tag, ) print(f"Successfully built {len(successful)} images") print(f"Failed to build {len(failed)} images") @@ -96,5 +102,7 @@ def main( parser.add_argument("--max_workers", type=int, default=4, help="Max workers for parallel processing") parser.add_argument("--force_rebuild", type=str2bool, default=False, help="Force rebuild images") parser.add_argument("--open_file_limit", type=int, default=8192, help="Open file limit") + parser.add_argument("--namespace", type=str, default=None, help="Namespace to use for the images") + parser.add_argument("--tag", type=str, default=None, help="Tag to use for the images") args = parser.parse_args() main(**vars(args)) diff --git a/swebench/harness/remove_containers.py b/swebench/harness/remove_containers.py index fc538767..addc9bea 100644 --- a/swebench/harness/remove_containers.py +++ b/swebench/harness/remove_containers.py @@ -1,7 +1,7 @@ +import docker import json -from argparse import ArgumentParser -import docker +from argparse import ArgumentParser """ Script for removing containers associated with specified instance IDs. diff --git a/swebench/harness/run_evaluation.py b/swebench/harness/run_evaluation.py index 1dd51ffd..8da3311c 100644 --- a/swebench/harness/run_evaluation.py +++ b/swebench/harness/run_evaluation.py @@ -8,10 +8,8 @@ if platform.system() == 'Linux': import resource -from argparse import ArgumentParser -from concurrent.futures import ThreadPoolExecutor, as_completed +from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from pathlib import Path, PurePosixPath -from tqdm import tqdm from swebench.harness.constants import ( APPLY_PATCH_FAIL, @@ -30,13 +28,13 @@ UTF8, ) from swebench.harness.docker_utils import ( - remove_image, + clean_images, + cleanup_container, copy_to_container, exec_run_with_timeout, - cleanup_container, list_images, + remove_image, should_remove, - clean_images, ) from swebench.harness.docker_build import ( BuildImageError, @@ -46,8 +44,15 @@ setup_logger, ) from swebench.harness.grading import get_eval_report -from swebench.harness.test_spec import make_test_spec, TestSpec -from swebench.harness.utils import load_swebench_dataset, str2bool +from swebench.harness.test_spec.test_spec import make_test_spec, TestSpec +from swebench.harness.utils import load_swebench_dataset, str2bool, run_threadpool + +GIT_APPLY_CMDS = [ + "git apply --allow-empty -v", + "git apply --verbose", + "git apply --verbose --reject", + "patch --batch --fuzz=5 -p1 -i", +] class EvaluationError(Exception): @@ -73,6 +78,7 @@ def run_instance( client: docker.DockerClient, run_id: str, timeout: int | None = None, + rewrite_reports: bool = False, ): """ Run a single instance with the given prediction. 
@@ -85,29 +91,47 @@ def run_instance( client (docker.DockerClient): Docker client run_id (str): Run ID timeout (int): Timeout for running tests + rewrite_reports (bool): True if eval run is just to reformat existing report """ # Set up logging directory instance_id = test_spec.instance_id model_name_or_path = pred.get(KEY_MODEL, "None").replace("/", "__") log_dir = RUN_EVALUATION_LOG_DIR / run_id / model_name_or_path / instance_id - log_dir.mkdir(parents=True, exist_ok=True) - - # Link the image build dir in the log dir - build_dir = INSTANCE_IMAGE_BUILD_DIR / test_spec.instance_image_key.replace(":", "__") - image_build_link = log_dir / "image_build_dir" - if not image_build_link.exists(): - try: - # link the image build dir in the log dir - image_build_link.symlink_to(build_dir.absolute(), target_is_directory=True) - except: - # some error, idk why - pass - log_file = log_dir / LOG_INSTANCE - # Set up report file + logger + # Set up report file report_path = log_dir / LOG_REPORT + if rewrite_reports: + test_output_path = log_dir / LOG_TEST_OUTPUT + if not test_output_path.exists(): + raise ValueError(f"Test output file {test_output_path} does not exist") + report = get_eval_report( + test_spec=test_spec, + prediction=pred, + test_log_path=test_output_path, + include_tests_status=True, + ) + # Write report to report.json + with open(report_path, "w") as f: + f.write(json.dumps(report, indent=4)) + return instance_id, report if report_path.exists(): return instance_id, json.loads(report_path.read_text()) + + if not test_spec.is_remote_image: + # Link the image build dir in the log dir + build_dir = INSTANCE_IMAGE_BUILD_DIR / test_spec.instance_image_key.replace(":", "__") + image_build_link = log_dir / "image_build_dir" + if not image_build_link.exists(): + try: + # link the image build dir in the log dir + image_build_link.symlink_to(build_dir.absolute(), target_is_directory=True) + except: + # some error, idk why + pass + + # Set up logger + log_dir.mkdir(parents=True, exist_ok=True) + log_file = log_dir / LOG_INSTANCE logger = setup_logger(instance_id, log_file) # Run the instance @@ -126,36 +150,27 @@ def run_instance( ) copy_to_container(container, patch_file, PurePosixPath(DOCKER_PATCH)) - # Attempt to apply patch to container - val = container.exec_run( - f"git apply --allow-empty -v {DOCKER_PATCH}", - workdir=DOCKER_WORKDIR, - user=DOCKER_USER, - ) - if val.exit_code != 0: - logger.info(f"Failed to apply patch to container, trying again...") - - # try "patch --batch --fuzz=5 -p1 -i {patch_path}" to try again - val = container.exec_run( - f"patch --batch --fuzz=5 -p1 -i {DOCKER_PATCH}", - workdir=DOCKER_WORKDIR, - user=DOCKER_USER, - ) - if val.exit_code != 0: - logger.info(f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}") - raise EvaluationError( - instance_id, - f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}", - logger, - ) - else: + # Attempt to apply patch to container (TODO: FIX THIS) + applied_patch = False + for git_apply_cmd in GIT_APPLY_CMDS: + val = container.exec_run(f"{git_apply_cmd} {DOCKER_PATCH}", workdir=DOCKER_WORKDIR, user=DOCKER_USER) + if val.exit_code == 0: logger.info(f"{APPLY_PATCH_PASS}:\n{val.output.decode(UTF8)}") - else: - logger.info(f"{APPLY_PATCH_PASS}:\n{val.output.decode(UTF8)}") + applied_patch = True + break + else: + logger.info(f"Failed to apply patch to container: {git_apply_cmd}") + if not applied_patch: + logger.info(f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}") + raise EvaluationError( + instance_id, + 
f"{APPLY_PATCH_FAIL}:\n{val.output.decode(UTF8)}", + logger, + ) # Get git diff before running eval script git_diff_output_before = ( - container.exec_run("git diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() + container.exec_run("git -c core.fileMode=false diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() ) logger.info(f"Git diff before:\n{git_diff_output_before}") @@ -164,7 +179,7 @@ def run_instance( logger.info( f"Eval script for {instance_id} written to {eval_file}; copying to container..." ) - copy_to_container(container, eval_file, Path("/eval.sh")) + copy_to_container(container, eval_file, PurePosixPath("/eval.sh")) # Run eval script, write output to logs test_output, timed_out, total_runtime = exec_run_with_timeout(container, "/bin/bash /eval.sh", timeout) @@ -181,9 +196,9 @@ def run_instance( logger, ) - # Get git diff after running eval script + # Get git diff after running eval script (ignore permission changes) git_diff_output_after = ( - container.exec_run("git diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() + container.exec_run("git -c core.fileMode=false diff", workdir=DOCKER_WORKDIR).output.decode(UTF8).strip() ) # Check if git diff changed after running eval script @@ -196,7 +211,7 @@ def run_instance( report = get_eval_report( test_spec=test_spec, prediction=pred, - log_path=test_output_path, + test_log_path=test_output_path, include_tests_status=True, ) logger.info( @@ -239,6 +254,9 @@ def run_instances( max_workers: int, run_id: str, timeout: int, + namespace: str = None, + instance_image_tag: str = 'latest', + rewrite_reports: bool = False, ): """ Run all instances for the given predictions in parallel. @@ -254,7 +272,10 @@ def run_instances( timeout (int): Timeout for running tests """ client = docker.from_env() - test_specs = list(map(make_test_spec, instances)) + test_specs = list(map( + lambda instance: make_test_spec(instance, namespace=namespace, instance_image_tag=instance_image_tag), + instances + )) # print number of existing instance images instance_image_ids = {x.instance_image_key for x in test_specs} @@ -265,38 +286,28 @@ def run_instances( if not force_rebuild and len(existing_images): print(f"Found {len(existing_images)} existing instance images. 
Will reuse them.") + # run instances in parallel + payloads = [] + for test_spec in test_specs: + payloads.append(( + test_spec, + predictions[test_spec.instance_id], + should_remove( + test_spec.instance_image_key, + cache_level, + clean, + existing_images, + ), + force_rebuild, + client, + run_id, + timeout, + rewrite_reports, + )) + # run instances in parallel print(f"Running {len(instances)} instances...") - with tqdm(total=len(instances), smoothing=0) as pbar: - with ThreadPoolExecutor(max_workers=max_workers) as executor: - # Create a future for running each instance - futures = { - executor.submit( - run_instance, - test_spec, - predictions[test_spec.instance_id], - should_remove( - test_spec.instance_image_key, - cache_level, - clean, - existing_images, - ), - force_rebuild, - client, - run_id, - timeout, - ): None - for test_spec in test_specs - } - # Wait for each future to complete - for future in as_completed(futures): - pbar.update(1) - try: - # Update progress bar, check if instance ran successfully - future.result() - except Exception as e: - traceback.print_exc() - continue + run_threadpool(run_instance, payloads, max_workers) print("All instances run.") @@ -306,7 +317,8 @@ def get_dataset_from_preds( instance_ids: list, predictions: dict, run_id: str, - exclude_completed: bool = True + rewrite_reports: bool, + exclude_completed: bool = True, ): """ Return only instances that have predictions and are in the dataset. @@ -335,6 +347,25 @@ def get_dataset_from_preds( if instance_ids: dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids] + if rewrite_reports: + # we only return instances that have existing test outputs + test_output_ids = set() + for instance in dataset: + if instance[KEY_INSTANCE_ID] not in predictions: + continue + prediction = predictions[instance[KEY_INSTANCE_ID]] + test_output_file = ( + RUN_EVALUATION_LOG_DIR + / run_id + / prediction["model_name_or_path"].replace("/", "__") + / prediction[KEY_INSTANCE_ID] + / "test_output.txt" + ) + if test_output_file.exists(): + test_output_ids.add(instance[KEY_INSTANCE_ID]) + dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in prediction_ids and i[KEY_INSTANCE_ID] in test_output_ids] + return dataset + # check which instance IDs have already been run completed_ids = set() for instance in dataset: @@ -368,7 +399,10 @@ def make_run_report( predictions: dict, full_dataset: list, client: docker.DockerClient, - run_id: str + run_id: str, + namespace: str | None = None, + instance_image_tag: str = 'latest', + report_dir: str = '.', ) -> Path: """ Make a final evaluation and run report of the instances that have been run. @@ -427,7 +461,10 @@ def make_run_report( # get remaining images and containers images = list_images(client) - test_specs = list(map(make_test_spec, full_dataset)) + test_specs = list(map( + lambda x: make_test_spec(x, namespace=namespace, instance_image_tag=instance_image_tag), + full_dataset + )) for spec in test_specs: image_name = spec.instance_image_key if image_name in images: @@ -476,6 +513,8 @@ def make_run_report( + f".{run_id}" + ".json" ) + if report_dir is not None: + report_file = Path(report_dir) / report_file with open(report_file, "w") as f: print(json.dumps(report, indent=4), file=f) print(f"Report written to {report_file}") @@ -508,16 +547,28 @@ def main( open_file_limit: int, run_id: str, timeout: int, + namespace: str | None, + rewrite_reports: bool, + instance_image_tag: str = 'latest', + report_dir: str = '.' 
): """ Run evaluation harness for the given dataset and predictions. """ # set open file limit assert len(run_id) > 0, "Run ID must be provided" + if report_dir is not None: + report_dir = Path(report_dir) + if not report_dir.exists(): + report_dir.mkdir(parents=True) + if platform.system() == 'Linux': resource.setrlimit(resource.RLIMIT_NOFILE, (open_file_limit, open_file_limit)) client = docker.from_env() + if force_rebuild and namespace is not None: + raise ValueError("Cannot force rebuild and use a namespace at the same time.") + # load predictions as map of instance_id to prediction if predictions_path == 'gold': print("Using gold predictions - ignoring predictions_path") @@ -534,7 +585,7 @@ def main( predictions = {pred[KEY_INSTANCE_ID]: pred for pred in predictions} # get dataset from predictions - dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id) + dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id, rewrite_reports) full_dataset = load_swebench_dataset(dataset_name, split, instance_ids) existing_images = list_images(client) print(f"Running {len(dataset)} unevaluated instances...") @@ -542,8 +593,21 @@ def main( print("No instances to run.") else: # build environment images + run instances - build_env_images(client, dataset, force_rebuild, max_workers) - run_instances(predictions, dataset, cache_level, clean, force_rebuild, max_workers, run_id, timeout) + if namespace is None and not rewrite_reports: + build_env_images(client, dataset, force_rebuild, max_workers) + run_instances( + predictions, + dataset, + cache_level, + clean, + force_rebuild, + max_workers, + run_id, + timeout, + namespace=namespace, + instance_image_tag=instance_image_tag, + rewrite_reports=rewrite_reports, + ) # clean images + make final report clean_images(client, existing_images, cache_level, clean) @@ -551,18 +615,57 @@ def main( if __name__ == "__main__": - parser = ArgumentParser() - parser.add_argument("--dataset_name", default="princeton-nlp/SWE-bench_Lite", type=str, help="Name of dataset or path to JSON file.") - parser.add_argument("--split", type=str, default="test", help="Split of the dataset") - parser.add_argument("--instance_ids", nargs="+", type=str, help="Instance IDs to run (space separated)") - parser.add_argument("--predictions_path", type=str, help="Path to predictions file - if 'gold', uses gold predictions", required=True) - parser.add_argument("--max_workers", type=int, default=4, help="Maximum number of workers (should be <= 75%% of CPU cores)") - parser.add_argument("--open_file_limit", type=int, default=4096, help="Open file limit") + parser = ArgumentParser( + description="Run evaluation harness for the given dataset and predictions.", + formatter_class=ArgumentDefaultsHelpFormatter, + ) parser.add_argument( - "--timeout", type=int, default=1_800, help="Timeout (in seconds) for running tests for each instance" - ) + "--dataset_name", + default="princeton-nlp/SWE-bench_Lite", + type=str, + help="Name of dataset or path to JSON file." 
+ ) + parser.add_argument( + "--split", + type=str, + default="test", + help="Split of the dataset", + ) + parser.add_argument( + "--instance_ids", + nargs="+", + type=str, + help="Instance IDs to run (space separated)", + ) + parser.add_argument( + "--predictions_path", + type=str, + help="Path to predictions file - if 'gold', uses gold predictions", + required=True, + ) + parser.add_argument( + "--max_workers", + type=int, + default=4, + help="Maximum number of workers (should be <= 75%% of CPU cores)", + ) parser.add_argument( - "--force_rebuild", type=str2bool, default=False, help="Force rebuild of all images" + "--open_file_limit", + type=int, + default=4096, + help="Open file limit", + ) + parser.add_argument( + "--timeout", + type=int, + default=1_800, + help="Timeout (in seconds) for running tests for each instance", + ) + parser.add_argument( + "--force_rebuild", + type=str2bool, + default=False, + help="Force rebuild of all images", ) parser.add_argument( "--cache_level", @@ -574,9 +677,41 @@ def main( # if clean is true then we remove all images that are above the cache level # if clean is false, we only remove images above the cache level if they don't already exist parser.add_argument( - "--clean", type=str2bool, default=False, help="Clean images above cache level" + "--clean", + type=str2bool, + default=False, + help="Clean images above cache level", + ) + parser.add_argument( + "--run_id", + type=str, + required=True, + help="Run ID - identifies the run", + ) + parser.add_argument( + "--namespace", + type=str, + default=None, + help="Namespace for images", + ) + parser.add_argument( + "--instance_image_tag", + type=str, + default='latest', + help="Instance image tag", + ) + parser.add_argument( + "--rewrite_reports", + type=str2bool, + default=False, + help="Doesn't run new instances, only writes reports for instances with existing test outputs", + ) + parser.add_argument( + "--report_dir", + type=str, + default=".", + help="Directory to write reports to", ) - parser.add_argument("--run_id", type=str, required=True, help="Run ID - identifies the run") args = parser.parse_args() main(**vars(args)) diff --git a/swebench/harness/test_spec.py b/swebench/harness/test_spec.py deleted file mode 100644 index bc5d2f23..00000000 --- a/swebench/harness/test_spec.py +++ /dev/null @@ -1,330 +0,0 @@ -from __future__ import annotations - -import hashlib -import json -import platform -import re - -from dataclasses import dataclass -from typing import Any, Union, cast - -from swebench.harness.constants import ( - SWEbenchInstance, - KEY_INSTANCE_ID, - FAIL_TO_PASS, - PASS_TO_PASS, - MAP_REPO_TO_INSTALL, - MAP_REPO_VERSION_TO_SPECS, - USE_X86, - UTF8, -) -from swebench.harness.dockerfiles import ( - get_dockerfile_base, - get_dockerfile_env, - get_dockerfile_instance, -) -from swebench.harness.utils import ( - get_requirements, - get_environment_yml, - get_test_directives, -) - -DIFF_MODIFIED_FILE_REGEX = r"--- a/(.*)" - - -@dataclass -class TestSpec: - """ - A dataclass that represents a test specification for a single instance of SWE-bench. 
- """ - instance_id: str - repo: str - version: str - repo_script_list: list[str] - eval_script_list: list[str] - env_script_list: list[str] - arch: str - FAIL_TO_PASS: list[str] - PASS_TO_PASS: list[str] - - @property - def setup_env_script(self): - return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.env_script_list) + "\n" - - @property - def eval_script(self): - return "\n".join(["#!/bin/bash", "set -uxo pipefail"] + self.eval_script_list) + "\n" - # Don't exit early because we need to revert tests at the end - - @property - def install_repo_script(self): - return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.repo_script_list) + "\n" - - @property - def base_image_key(self): - return f"sweb.base.{self.arch}:latest" - - @property - def env_image_key(self): - """ - The key for the environment image is based on the hash of the environment script list. - If the environment script list changes, the image will be rebuilt automatically. - - Note that old images are not automatically deleted, so consider cleaning up old images periodically. - """ - hash_object = hashlib.sha256() - hash_object.update(str(self.env_script_list).encode(UTF8)) - hash_value = hash_object.hexdigest() - val = hash_value[:22] # 22 characters is still very likely to be unique - return f"sweb.env.{self.arch}.{val}:latest" - - @property - def instance_image_key(self): - return f"sweb.eval.{self.arch}.{self.instance_id}:latest" - - def get_instance_container_name(self, run_id=None): - if not run_id: - return f"sweb.eval.{self.instance_id}" - return f"sweb.eval.{self.instance_id}.{run_id}" - - @property - def base_dockerfile(self): - return get_dockerfile_base(self.platform, self.arch) - - @property - def env_dockerfile(self): - return get_dockerfile_env(self.platform, self.arch) - - @property - def instance_dockerfile(self): - return get_dockerfile_instance(self.platform, self.env_image_key) - - @property - def platform(self): - if self.arch == "x86_64": - return "linux/x86_64" - elif self.arch == "arm64": - return "linux/arm64/v8" - else: - raise ValueError(f"Invalid architecture: {self.arch}") - - -def get_test_specs_from_dataset(dataset: Union[list[SWEbenchInstance], list[TestSpec]]) -> list[TestSpec]: - """ - Idempotent function that converts a list of SWEbenchInstance objects to a list of TestSpec objects. - """ - if isinstance(dataset[0], TestSpec): - return cast(list[TestSpec], dataset) - return list(map(make_test_spec, cast(list[SWEbenchInstance], dataset))) - - -def make_repo_script_list(specs, repo, repo_directory, base_commit, env_name): - """ - Create a list of bash commands to set up the repository for testing. - This is the setup script for the instance image. - """ - setup_commands = [ - f"git clone -o origin https://github.com/{repo} {repo_directory}", - f"chmod -R 777 {repo_directory}", # So nonroot user can run tests - f"cd {repo_directory}", - f"git reset --hard {base_commit}", - # Remove the remote so the agent won't see newer commits. 
- "git remote remove origin", - # Make sure conda is available for later use - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - 'echo "Current environment: $CONDA_DEFAULT_ENV"', - ] - if repo in MAP_REPO_TO_INSTALL: - setup_commands.append(MAP_REPO_TO_INSTALL[repo]) - - # Run pre-install set up if provided - if "pre_install" in specs: - for pre_install in specs["pre_install"]: - setup_commands.append(pre_install) - - if "install" in specs: - setup_commands.append(specs["install"]) - return setup_commands - - -def replace_uninstallable_packages_requirements_txt(requirement_str: str) -> str: - """Replaces certain packages in a requirements.txt-like string. - For example, some packages have been yanked and we need to replace them with compatible alternatives. - """ - replacements = { - # See https://github.com/princeton-nlp/SWE-bench/issues/199 - # This package was sinced yanked, so we need to force pip - # to install it. - "types-pkg_resources": "types-pkg-resources==0.1.3", - } - requirements = [req.strip() for req in requirement_str.split("\n") if req.strip()] - requirements_replaced = [] - for requirement in requirements: - if requirement in replacements: - print(f"Replaced {requirement!r} with {replacements[requirement]!r} (replace_uninstallable_packages)") - requirements_replaced.append(replacements[requirement]) - else: - requirements_replaced.append(requirement) - return "\n".join(requirements_replaced) + "\n" - - -def make_env_script_list(instance: SWEbenchInstance, specs: dict, env_name: str) -> list[str]: - """ - Creates the list of commands to set up the conda environment for testing. - This is the setup script for the environment image. - - Returns: - list[str]: List of commands to set up the conda environment - """ - HEREDOC_DELIMITER = "EOF_59812759871" - reqs_commands = [ - "source /opt/miniconda3/bin/activate", - ] - # Create conda environment according to install instructinos - pkgs = specs.get("packages", "") - if pkgs == "requirements.txt": - # Create environment - cmd = f"conda create -n {env_name} python={specs['python']} -y" - reqs_commands.append(cmd) - - # Install dependencies - reqs = replace_uninstallable_packages_requirements_txt(get_requirements(instance)) - path_to_reqs = "$HOME/requirements.txt" - reqs_commands.append( - f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" - ) - cmd = f"conda activate {env_name} && python -m pip install -r {path_to_reqs}" - reqs_commands.append(cmd) - reqs_commands.append(f"rm {path_to_reqs}") - elif pkgs == "environment.yml": - # Create environment from yml - reqs = get_environment_yml(instance, env_name) - path_to_reqs = "environment.yml" - reqs_commands.append( - f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" - ) - if "no_use_env" in specs and specs["no_use_env"]: - # `conda create` based installation - cmd = f"conda create -c conda-forge -n {env_name} python={specs['python']} -y" - reqs_commands.append(cmd) - - # Install dependencies - cmd = f"conda env update -f {path_to_reqs}" - reqs_commands.append(cmd) - else: - # `conda env create` based installation - cmd = f"conda env create --file {path_to_reqs}" - reqs_commands.append(cmd) - - cmd = f"conda activate {env_name} && conda install python={specs['python']} -y" - reqs_commands.append(cmd) - - # Remove environment.yml - reqs_commands.append(f"rm {path_to_reqs}") - else: - # Create environment + install dependencies - cmd = f"conda create -n {env_name} python={specs['python']} {pkgs} -y" - 
reqs_commands.append(cmd) - - reqs_commands.append(f"conda activate {env_name}") - - # Install additional packages if specified - if "pip_packages" in specs: - pip_packages = " ".join(specs["pip_packages"]) - cmd = f"python -m pip install {pip_packages}" - reqs_commands.append(cmd) - return reqs_commands - - -def make_eval_script_list(instance, specs, env_name, repo_directory, base_commit, test_patch): - """ - Applies the test patch and runs the tests. - """ - HEREDOC_DELIMITER = "EOF_114329324912" - test_files = re.findall(DIFF_MODIFIED_FILE_REGEX, test_patch) - # Reset test files to the state they should be in before the patch. - reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" - apply_test_patch_command = ( - f"git apply -v - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" - ) - test_command = " ".join( - [ - MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"], - *get_test_directives(instance), - ] - ) - eval_commands = [ - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - f"cd {repo_directory}", - ] - if "eval_commands" in specs: - eval_commands += specs["eval_commands"] - eval_commands += [ - f"git config --global --add safe.directory {repo_directory}", # for nonroot user - f"cd {repo_directory}", - # This is just informational, so we have a record - "git status", - "git show", - f"git diff {base_commit}", - "source /opt/miniconda3/bin/activate", - f"conda activate {env_name}", - ] - if "install" in specs: - eval_commands.append(specs["install"]) - eval_commands += [ - reset_tests_command, - apply_test_patch_command, - test_command, - reset_tests_command, # Revert tests after done, leave the repo in the same state as before - ] - return eval_commands - - -def make_test_spec(instance: SWEbenchInstance) -> TestSpec: - if isinstance(instance, TestSpec): - return instance - instance_id = instance[KEY_INSTANCE_ID] - repo = instance["repo"] - version = instance["version"] - base_commit = instance["base_commit"] - problem_statement = instance["problem_statement"] - hints_text = instance["hints_text"] # Unused - test_patch = instance["test_patch"] - - def _from_json_or_obj(key: str) -> Any: - """If key points to string, load with json""" - if isinstance(instance[key], str): - return json.loads(instance[key]) - return instance[key] - - pass_to_pass = _from_json_or_obj(PASS_TO_PASS) - fail_to_pass = _from_json_or_obj(FAIL_TO_PASS) - - env_name = "testbed" - repo_directory = f"/{env_name}" - specs = MAP_REPO_VERSION_TO_SPECS[repo][version] - - repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name) - env_script_list = make_env_script_list(instance, specs, env_name) - eval_script_list = make_eval_script_list( - instance, specs, env_name, repo_directory, base_commit, test_patch - ) - if platform.machine() in {"aarch64", "arm64"}: - # use arm64 unless explicitly specified - arch = "arm64" if instance_id not in USE_X86 else "x86_64" - else: - arch = "x86_64" - - return TestSpec( - instance_id=instance_id, - repo=repo, - env_script_list=env_script_list, - repo_script_list=repo_script_list, - eval_script_list=eval_script_list, - version=version, - arch=arch, - FAIL_TO_PASS=fail_to_pass, - PASS_TO_PASS=pass_to_pass, - ) diff --git a/swebench/harness/test_spec/__init__.py b/swebench/harness/test_spec/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/swebench/harness/test_spec/create_scripts.py b/swebench/harness/test_spec/create_scripts.py new file mode 100644 
index 00000000..7587b90f --- /dev/null +++ b/swebench/harness/test_spec/create_scripts.py @@ -0,0 +1,51 @@ +from swebench.harness.test_spec.javascript import ( + make_repo_script_list_js, + make_env_script_list_js, + make_eval_script_list_js, +) +from swebench.harness.test_spec.python import ( + make_repo_script_list_py, + make_env_script_list_py, + make_eval_script_list_py, +) +from swebench.harness.constants import MAP_REPO_TO_EXT + + +def make_repo_script_list( + specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + ext = MAP_REPO_TO_EXT[repo] + func = { + "js": make_repo_script_list_js, + "py": make_repo_script_list_py, + }[ext] + return func(specs, repo, repo_directory, base_commit, env_name) + + +def make_env_script_list(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the environment for testing. + This is the setup script for the environment image. + """ + ext = MAP_REPO_TO_EXT[instance["repo"]] + func = { + "js": make_env_script_list_js, + "py": make_env_script_list_py, + }[ext] + return func(instance, specs, env_name) + + +def make_eval_script_list( + instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + ext = MAP_REPO_TO_EXT[instance["repo"]] + func = { + "js": make_eval_script_list_js, + "py": make_eval_script_list_py, + }[ext] + return func(instance, specs, env_name, repo_directory, base_commit, test_patch) diff --git a/swebench/harness/test_spec/javascript.py b/swebench/harness/test_spec/javascript.py new file mode 100644 index 00000000..acee9b7f --- /dev/null +++ b/swebench/harness/test_spec/javascript.py @@ -0,0 +1,138 @@ +import re + +from pathlib import Path +from swebench.harness.constants import ( + END_TEST_OUTPUT, + MAP_REPO_VERSION_TO_SPECS, + START_TEST_OUTPUT, + TEST_XVFB_PREFIX, +) +from swebench.harness.utils import get_modified_files +from unidiff import PatchSet + + +# MARK: Test Command Creation Functions +def get_test_cmds_calypso(instance) -> list: + test_paths = [x.path for x in PatchSet(instance['test_patch'])] + test_cmds = [] + for test_path in test_paths: + if re.search(r"__snapshots__/(.*).js.snap$", test_path): + # Jest snapshots are not run directly + test_path = "/".join(test_path.split("/")[:-2]) + + # Determine which testing script to use + if any([test_path.startswith(x) for x in ["client", "packages"]]): + pkg = test_path.split("/")[0] + if instance['version'] in [ + '10.10.0', '10.12.0', '10.13.0', + '10.14.0', '10.15.2', '10.16.3' + ]: + test_cmds.append(f"./node_modules/.bin/jest --verbose -c=test/{pkg}/jest.config.js '{test_path}'") + elif instance['version'] in [ + '6.11.5', '8.9.1', '8.9.3', '8.9.4', '8.11.0', '8.11.2', + '10.4.1', '10.5.0', '10.6.0', '10.9.0', + ]: + test_cmds.append(f"./node_modules/.bin/jest --verbose -c=test/{pkg}/jest.config.json '{test_path}'") + else: + test_cmds.append(f"npm run test-{pkg} --verbose '{test_path}'") + elif any([test_path.startswith(x) for x in ["test/e2e"]]): + test_cmds.extend([ + "cd test/e2e", + f"NODE_CONFIG_ENV=test npm run test {test_path}", + "cd ../..", + ]) + + return test_cmds + + +MAP_REPO_TO_TEST_CMDS = { + "Automattic/wp-calypso": get_test_cmds_calypso, +} + + +def get_test_cmds(instance) -> list: + if instance["repo"] in MAP_REPO_TO_TEST_CMDS: + return MAP_REPO_TO_TEST_CMDS[instance["repo"]](instance) + test_cmd = 
MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"] + return [test_cmd] if isinstance(test_cmd, str) else test_cmd + + +# MARK: Utility Functions + +def get_download_img_commands(instance) -> list: + cmds = [] + for i in instance.get("image_assets", {}).get("test_patch", []): + folder = Path(i["path"]).parent + cmds.append(f"mkdir -p {folder}") + cmds.append(f"curl -o {i['path']} {i['url']}") + cmds.append(f"chmod 777 {i['path']}") + return cmds + + +# MARK: Script Creation Functions + +def make_repo_script_list_js(specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + setup_commands = [ + f"git clone -o origin https://github.com/{repo} {repo_directory}", + f"cd {repo_directory}", + f"git reset --hard {base_commit}", + f"chmod -R 777 {repo_directory}", # So nonroot user can run tests + # Remove the remote so the agent won't see newer commits. + f"git remote remove origin", + ] + if "install" in specs: + setup_commands.extend(specs["install"]) + return setup_commands + + +def make_env_script_list_js(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the environment for testing. + This is the setup script for the environment image. + """ + reqs_commands = [] + if "apt-pkgs" in specs: + reqs_commands += [ + "apt-get update", + f"apt-get install -y {' '.join(specs['apt-pkgs'])}" + ] + return reqs_commands + + +def make_eval_script_list_js(instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + HEREDOC_DELIMITER = "EOF_114329324912" + test_files = get_modified_files(test_patch) + # Reset test files to the state they should be in before the patch. 
+ if test_files: + reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + else: + reset_tests_command = f'echo "No test files to reset"' + + apply_test_patch_command = ( + f"git apply --verbose --reject - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" + ) + test_commands = get_test_cmds(instance) + eval_commands = [ + f"cd {repo_directory}", + f"git config --global --add safe.directory {repo_directory}", # for nonroot user + f"cd {repo_directory}", + # This is just informational, so we have a record + # f"git status", + # f"git show", + # f"git -c core.fileMode=false diff {base_commit}", + reset_tests_command, + *get_download_img_commands(instance), + apply_test_patch_command, + f": '{START_TEST_OUTPUT}'", + *test_commands, + f": '{END_TEST_OUTPUT}'", + reset_tests_command, + ] + return eval_commands diff --git a/swebench/harness/test_spec/python.py b/swebench/harness/test_spec/python.py new file mode 100644 index 00000000..235751db --- /dev/null +++ b/swebench/harness/test_spec/python.py @@ -0,0 +1,306 @@ +import os +import posixpath +import re +import requests + +from swebench.harness.constants import ( + SWEbenchInstance, + MAP_REPO_TO_ENV_YML_PATHS, + MAP_REPO_TO_INSTALL, + MAP_REPO_TO_REQS_PATHS, + MAP_REPO_VERSION_TO_SPECS, + NON_TEST_EXTS, + SWE_BENCH_URL_RAW, + START_TEST_OUTPUT, + END_TEST_OUTPUT, +) +from swebench.harness.utils import get_modified_files +from functools import cache + +HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + + +@cache +def get_environment_yml_by_commit(repo: str, commit: str, env_name: str) -> str: + for req_path in MAP_REPO_TO_ENV_YML_PATHS[repo]: + reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + break + else: + raise ValueError( + f"Could not find environment.yml at paths {MAP_REPO_TO_ENV_YML_PATHS[repo]} for repo {repo} at commit {commit}" + ) + + lines = reqs.text.split("\n") + cleaned = [] + for line in lines: + # Rename environment to given name + if line.startswith("name:"): + cleaned.append(f"name: {env_name}") + continue + cleaned.append(line) + + return "\n".join(cleaned) + + +def get_environment_yml(instance: SWEbenchInstance, env_name: str) -> str: + """ + Get environment.yml for given task instance + + Args: + instance (dict): SWE Bench Task instance + env_name (str): Rename retrieved environment.yml to this name + Returns: + environment.yml (str): Returns environment.yml as string + """ + # Attempt to find environment.yml at each path based on task instance's repo + commit = ( + instance["environment_setup_commit"] + if "environment_setup_commit" in instance + else instance["base_commit"] + ) + + return get_environment_yml_by_commit(instance["repo"], commit, env_name) + + +@cache +def get_requirements_by_commit(repo: str, commit: str) -> str: + for req_path in MAP_REPO_TO_REQS_PATHS[repo]: + reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + break + else: + raise ValueError( + f"Could not find requirements.txt at paths {MAP_REPO_TO_REQS_PATHS[repo]} for repo {repo} at commit {commit}" + ) + + lines = reqs.text + original_req = [] + additional_reqs = [] + req_dir = "/".join(req_path.split("/")[:-1]) + exclude_line = lambda line: any( + [line.strip().startswith(x) for x in ["-e .", "#", ".[test"]] + ) + + for 
line in lines.split("\n"): + if line.strip().startswith("-r"): + # Handle recursive requirements + file_name = line[len("-r") :].strip() + reqs_url = os.path.join( + SWE_BENCH_URL_RAW, + repo, + commit, + req_dir, + file_name, + ) + reqs = requests.get(reqs_url, headers=HEADERS) + if reqs.status_code == 200: + for line_extra in reqs.text.split("\n"): + if not exclude_line(line_extra): + additional_reqs.append(line_extra) + else: + if not exclude_line(line): + original_req.append(line) + + # Combine all requirements into single text body + additional_reqs.append("\n".join(original_req)) + all_reqs = "\n".join(additional_reqs) + + return all_reqs + + +def get_requirements(instance: SWEbenchInstance) -> str: + """ + Get requirements.txt for given task instance + + Args: + instance (dict): task instance + Returns: + requirements.txt (str): Returns requirements.txt as string + """ + # Attempt to find requirements.txt at each path based on task instance's repo + commit = ( + instance["environment_setup_commit"] + if "environment_setup_commit" in instance + else instance["base_commit"] + ) + + return get_requirements_by_commit(instance["repo"], commit) + + +def get_test_directives(instance: SWEbenchInstance) -> list: + """ + Get test directives from the test_patch of a task instance + + Args: + instance (dict): task instance + Returns: + directives (list): List of test directives + """ + # For seq2seq code repos, testing command is fixed + if instance["repo"] == "swe-bench/humaneval": + return ["test.py"] + + # Get test directives from test patch and remove non-test files + diff_pat = r"diff --git a/.* b/(.*)" + test_patch = instance["test_patch"] + directives = re.findall(diff_pat, test_patch) + directives = [ + d for d in directives if not any(d.endswith(ext) for ext in NON_TEST_EXTS) + ] + + # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing) + if instance["repo"] == "django/django": + directives_transformed = [] + for d in directives: + d = d[: -len(".py")] if d.endswith(".py") else d + d = d[len("tests/") :] if d.startswith("tests/") else d + d = d.replace("/", ".") + directives_transformed.append(d) + directives = directives_transformed + + return directives + + +def make_repo_script_list_py(specs, repo, repo_directory, base_commit, env_name) -> list: + """ + Create a list of bash commands to set up the repository for testing. + This is the setup script for the instance image. + """ + setup_commands = [ + f"git clone -o origin https://github.com/{repo} {repo_directory}", + f"chmod -R 777 {repo_directory}", # So nonroot user can run tests + f"cd {repo_directory}", + f"git reset --hard {base_commit}", + # Remove the remote so the agent won't see newer commits. + f"git remote remove origin", + # Make sure conda is available for later use + "source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + f'echo "Current environment: $CONDA_DEFAULT_ENV"', + ] + if repo in MAP_REPO_TO_INSTALL: + setup_commands.append(MAP_REPO_TO_INSTALL[repo]) + + # Run pre-install set up if provided + if "pre_install" in specs: + for pre_install in specs["pre_install"]: + setup_commands.append(pre_install) + + if "install" in specs: + setup_commands.append(specs["install"]) + return setup_commands + + +def make_env_script_list_py(instance, specs, env_name) -> list: + """ + Creates the list of commands to set up the conda environment for testing. + This is the setup script for the environment image. 
+ """ + HEREDOC_DELIMITER = "EOF_59812759871" + reqs_commands = [ + "source /opt/miniconda3/bin/activate", + ] + # Create conda environment according to install instructinos + pkgs = specs.get("packages", "") + if pkgs == "requirements.txt": + # Create environment + cmd = f"conda create -n {env_name} python={specs['python']} -y" + reqs_commands.append(cmd) + + # Install dependencies + reqs = get_requirements(instance) + path_to_reqs = "$HOME/requirements.txt" + reqs_commands.append( + f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" + ) + cmd = f"conda activate {env_name} && python -m pip install -r {path_to_reqs}" + reqs_commands.append(cmd) + reqs_commands.append(f"rm {path_to_reqs}") + elif pkgs == "environment.yml": + # Create environment from yml + reqs = get_environment_yml(instance, env_name) + path_to_reqs = "environment.yml" + reqs_commands.append( + f"cat <<'{HEREDOC_DELIMITER}' > {path_to_reqs}\n{reqs}\n{HEREDOC_DELIMITER}" + ) + if "no_use_env" in specs and specs["no_use_env"]: + # `conda create` based installation + cmd = f"conda create -c conda-forge -n {env_name} python={specs['python']} -y" + reqs_commands.append(cmd) + + # Install dependencies + cmd = f"conda env update -f {path_to_reqs}" + reqs_commands.append(cmd) + else: + # `conda env create` based installation + cmd = f"conda env create --file {path_to_reqs}" + reqs_commands.append(cmd) + + cmd = f"conda activate {env_name} && conda install python={specs['python']} -y" + reqs_commands.append(cmd) + + # Remove environment.yml + reqs_commands.append(f"rm {path_to_reqs}") + else: + # Create environment + install dependencies + cmd = f"conda create -n {env_name} python={specs['python']} {pkgs} -y" + reqs_commands.append(cmd) + + reqs_commands.append(f"conda activate {env_name}") + + # Install additional packages if specified + if "pip_packages" in specs: + pip_packages = " ".join(specs["pip_packages"]) + cmd = f"python -m pip install {pip_packages}" + reqs_commands.append(cmd) + return reqs_commands + + +def make_eval_script_list_py(instance, specs, env_name, repo_directory, base_commit, test_patch) -> list: + """ + Applies the test patch and runs the tests. + """ + HEREDOC_DELIMITER = "EOF_114329324912" + test_files = get_modified_files(test_patch) + # Reset test files to the state they should be in before the patch. 
+ reset_tests_command = f"git checkout {base_commit} {' '.join(test_files)}" + apply_test_patch_command = ( + f"git apply -v - <<'{HEREDOC_DELIMITER}'\n{test_patch}\n{HEREDOC_DELIMITER}" + ) + test_command = " ".join( + [ + MAP_REPO_VERSION_TO_SPECS[instance["repo"]][instance["version"]]["test_cmd"], + *get_test_directives(instance), + ] + ) + eval_commands = [ + f"source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + f"cd {repo_directory}", + ] + if "eval_commands" in specs: + eval_commands += specs["eval_commands"] + eval_commands += [ + f"git config --global --add safe.directory {repo_directory}", # for nonroot user + f"cd {repo_directory}", + # This is just informational, so we have a record + f"git status", + f"git show", + f"git -c core.fileMode=false diff {base_commit}", + "source /opt/miniconda3/bin/activate", + f"conda activate {env_name}", + ] + if "install" in specs: + eval_commands.append(specs["install"]) + eval_commands += [ + reset_tests_command, + apply_test_patch_command, + f": '{START_TEST_OUTPUT}'", + test_command, + f": '{END_TEST_OUTPUT}'", + reset_tests_command, # Revert tests after done, leave the repo in the same state as before + ] + return eval_commands diff --git a/swebench/harness/test_spec/test_spec.py b/swebench/harness/test_spec/test_spec.py new file mode 100644 index 00000000..83abe995 --- /dev/null +++ b/swebench/harness/test_spec/test_spec.py @@ -0,0 +1,202 @@ +import hashlib +import json +import platform + +from dataclasses import dataclass +from typing import Any, Union, cast + +from swebench.harness.constants import ( + DEFAULT_DOCKER_SPECS, + KEY_INSTANCE_ID, + LATEST, + MAP_REPO_TO_EXT, + MAP_REPO_VERSION_TO_SPECS, + USE_X86, +) +from swebench.harness.constants.constants import SWEbenchInstance +from swebench.harness.dockerfiles import ( + get_dockerfile_base, + get_dockerfile_env, + get_dockerfile_instance, +) +from swebench.harness.test_spec.create_scripts import ( + make_repo_script_list, + make_env_script_list, + make_eval_script_list, +) + + +@dataclass +class TestSpec: + """ + A dataclass that represents a test specification for a single instance of SWE-bench. + """ + instance_id: str + repo: str + version: str + repo_script_list: list[str] + eval_script_list: list[str] + env_script_list: list[str] + arch: str + FAIL_TO_PASS: list[str] + PASS_TO_PASS: list[str] + language: str + docker_specs: dict + namespace: str + base_image_tag: str = LATEST + env_image_tag: str = LATEST + instance_image_tag: str = LATEST + + @property + def setup_env_script(self): + return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.env_script_list) + "\n" + + @property + def eval_script(self): + return "\n".join(["#!/bin/bash", "set -uxo pipefail"] + self.eval_script_list) + "\n" + # Don't exit early because we need to revert tests at the end + + @property + def install_repo_script(self): + return "\n".join(["#!/bin/bash", "set -euxo pipefail"] + self.repo_script_list) + "\n" + + @property + def base_image_key(self): + return f"sweb.base.{MAP_REPO_TO_EXT[self.repo]}.{self.arch}:{self.base_image_tag}" + + @property + def env_image_key(self): + """ + The key for the environment image is based on the hash of the environment script list. + If the environment script list changes, the image will be rebuilt automatically. + + Note that old images are not automatically deleted, so consider cleaning up old images periodically. 
+ """ + hash_key = str(self.env_script_list) + if self.docker_specs != {}: + hash_key += str(self.docker_specs) + hash_object = hashlib.sha256() + hash_object.update(hash_key.encode("utf-8")) + hash_value = hash_object.hexdigest() + val = hash_value[:22] # 22 characters is still very likely to be unique + return f"sweb.env.{MAP_REPO_TO_EXT[self.repo]}.{self.arch}.{val}:{self.env_image_tag}" + + @property + def instance_image_key(self): + key = f"sweb.eval.{self.arch}.{self.instance_id.lower()}:{self.instance_image_tag}" + if self.is_remote_image: + key = f"{self.namespace}/{key}".replace("__", "_1776_") + return key + + @property + def is_remote_image(self): + return self.namespace is not None + + def get_instance_container_name(self, run_id=None): + if not run_id: + return f"sweb.eval.{self.instance_id}" + return f"sweb.eval.{self.instance_id.lower()}.{run_id}" + + @property + def base_dockerfile(self): + return get_dockerfile_base(self.platform, self.arch, self.language) + + @property + def env_dockerfile(self): + return get_dockerfile_env(self.platform, self.arch, self.language, **{ + **DEFAULT_DOCKER_SPECS, + **self.docker_specs + }) + + @property + def instance_dockerfile(self): + return get_dockerfile_instance(self.platform, self.language, self.env_image_key) + + @property + def platform(self): + if self.arch == "x86_64": + return "linux/x86_64" + elif self.arch == "arm64": + return "linux/arm64/v8" + else: + raise ValueError(f"Invalid architecture: {self.arch}") + + +def get_test_specs_from_dataset( + dataset: Union[list[SWEbenchInstance], list[TestSpec]], + namespace: str=None, + instance_image_tag: str=LATEST, +) -> list[TestSpec]: + """ + Idempotent function that converts a list of SWEbenchInstance objects to a list of TestSpec objects. + """ + if isinstance(dataset[0], TestSpec): + return cast(list[TestSpec], dataset) + return list(map(lambda x: make_test_spec(x, namespace, instance_image_tag), cast(list[SWEbenchInstance], dataset))) + + +def make_test_spec( + instance: SWEbenchInstance, + namespace: str=None, + base_image_tag: str=LATEST, + env_image_tag: str=LATEST, + instance_image_tag: str=LATEST, + ) -> TestSpec: + if isinstance(instance, TestSpec): + return instance + assert base_image_tag is not None, "base_image_tag cannot be None" + assert env_image_tag is not None, "env_image_tag cannot be None" + assert instance_image_tag is not None, "instance_image_tag cannot be None" + instance_id = instance[KEY_INSTANCE_ID] + repo = instance["repo"] + version = instance.get("version") + base_commit = instance["base_commit"] + problem_statement = instance.get("problem_statement") + hints_text = instance.get("hints_text") # Unused + test_patch = instance["test_patch"] + + def _from_json_or_obj(key: str) -> Any: + """If key points to string, load with json""" + if key not in instance: + # If P2P, F2P keys not found, it's a validation instance + return [] + if isinstance(instance[key], str): + return json.loads(instance[key]) + return instance[key] + + pass_to_pass = _from_json_or_obj("PASS_TO_PASS") + fail_to_pass = _from_json_or_obj("FAIL_TO_PASS") + + env_name = "testbed" + repo_directory = f"/{env_name}" + specs = MAP_REPO_VERSION_TO_SPECS[repo][version] + docker_specs = specs.get("docker_specs", {}) + + repo_script_list = make_repo_script_list(specs, repo, repo_directory, base_commit, env_name) + env_script_list = make_env_script_list(instance, specs, env_name) + eval_script_list = make_eval_script_list( + instance, specs, env_name, repo_directory, base_commit, test_patch + ) + 
if platform.machine() in {"aarch64", "arm64"}: + # use arm64 unless explicitly specified + arch = "arm64" if instance_id not in USE_X86 else "x86_64" + else: + arch = "x86_64" + + return TestSpec( + instance_id=instance_id, + repo=repo, + env_script_list=env_script_list, + repo_script_list=repo_script_list, + eval_script_list=eval_script_list, + version=version, + arch=arch, + FAIL_TO_PASS=fail_to_pass, + PASS_TO_PASS=pass_to_pass, + language=MAP_REPO_TO_EXT[repo], + docker_specs=docker_specs, + namespace=namespace, + base_image_tag=base_image_tag, + env_image_tag=env_image_tag, + instance_image_tag=instance_image_tag, + ) diff --git a/swebench/harness/utils.py b/swebench/harness/utils.py index 9a5b7af9..55524071 100644 --- a/swebench/harness/utils.py +++ b/swebench/harness/utils.py @@ -1,28 +1,68 @@ import json -import os -import posixpath -from pathlib import Path import re import requests +import traceback from argparse import ArgumentTypeError -from datasets import Dataset, load_dataset +from concurrent.futures import ThreadPoolExecutor, as_completed +from datasets import Dataset, load_dataset, load_from_disk from dotenv import load_dotenv -from functools import cache +from pathlib import Path +from tqdm import tqdm from typing import cast - -from swebench.harness.constants import ( - SWEbenchInstance, - MAP_REPO_TO_ENV_YML_PATHS, - MAP_REPO_TO_REQS_PATHS, - NON_TEST_EXTS, - SWE_BENCH_URL_RAW, - KEY_INSTANCE_ID, -) +from swebench.harness.constants import SWEbenchInstance, KEY_INSTANCE_ID +from unidiff import PatchSet load_dotenv() -HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'} + +def run_threadpool(func, payloads, max_workers): + if max_workers <= 0: + return run_sequential(func, payloads) + succeeded, failed = [], [] + with tqdm(total=len(payloads), smoothing=0) as pbar: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Create a future for running each instance + futures = { + executor.submit( + func, + *payload + ): payload + for payload in payloads + } + # Wait for each future to complete + for future in as_completed(futures): + try: + # Update progress bar, check if instance ran successfully + future.result() + succeeded.append(futures[future]) + except Exception as e: + print(f"{type(e)}: {e}") + traceback.print_exc() + failed.append(futures[future]) + continue + pbar.update(1) + pbar.set_description(f"{len(succeeded)} ran successfully, {len(failed)} failed") + return succeeded, failed + + +def run_sequential(func, args_list): + """ + Run a function with a list of arguments sequentially + """ + succeeded, failed = [], [] + pbar = tqdm(total=len(args_list), smoothing=0) + for args in args_list: + try: + func(*args) + succeeded.append(args) + except Exception as e: + traceback.print_exc() + failed.append(args) + pbar.update(1) + pbar.set_description(f"{len(succeeded)} ran successfully, {len(failed)} failed") + pbar.close() + return succeeded, failed def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]: @@ -42,7 +82,10 @@ def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance name = "princeton-nlp/SWE-bench" elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}: name = "princeton-nlp/SWE-bench_Lite" - dataset = cast(Dataset, load_dataset(name, split=split)) + if (Path(name) / split / 'dataset_info.json').exists(): + dataset = 
cast(Dataset, load_from_disk(Path(name) / split)) + else: + dataset = cast(Dataset, load_dataset(name, split=split)) dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset} if instance_ids: if instance_ids - dataset_ids: @@ -88,6 +131,8 @@ def strip_content(hunk): first_idx = get_first_idx(first_chars) last_idx = get_last_idx(first_chars) new_lines = list(map(lambda x: x.rstrip(), hunk.split("\n")[first_idx:last_idx])) + # should leave one space for empty context lines + new_lines = [line if line.strip() else " " for line in new_lines] new_hunk = "\n" + "\n".join(new_lines) + "\n" return new_hunk, first_idx - 1 @@ -174,151 +219,6 @@ def get_lines_with_word(text, target_word): return False -@cache -def get_environment_yml_by_commit(repo: str, commit: str, env_name: str) -> str: - for req_path in MAP_REPO_TO_ENV_YML_PATHS[repo]: - reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - break - else: - raise ValueError( - f"Could not find environment.yml at paths {MAP_REPO_TO_ENV_YML_PATHS[repo]} for repo {repo} at commit {commit}" - ) - - lines = reqs.text.split("\n") - cleaned = [] - for line in lines: - # Rename environment to given name - if line.startswith("name:"): - cleaned.append(f"name: {env_name}") - continue - cleaned.append(line) - - return "\n".join(cleaned) - - -def get_environment_yml(instance: SWEbenchInstance, env_name: str) -> str: - """ - Get environment.yml for given task instance - - Args: - instance (dict): SWE Bench Task instance - env_name (str): Rename retrieved environment.yml to this name - Returns: - environment.yml (str): Returns environment.yml as string - """ - # Attempt to find environment.yml at each path based on task instance's repo - - commit = ( - instance["environment_setup_commit"] - if "environment_setup_commit" in instance - else instance["base_commit"] - ) - - return get_environment_yml_by_commit(instance["repo"], commit, env_name) - - -@cache -def get_requirements_by_commit(repo: str, commit: str) -> str: - for req_path in MAP_REPO_TO_REQS_PATHS[repo]: - reqs_url = posixpath.join(SWE_BENCH_URL_RAW, repo, commit, req_path) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - break - else: - raise ValueError( - f"Could not find requirements.txt at paths {MAP_REPO_TO_REQS_PATHS[repo]} for repo {repo} at commit {commit}" - ) - - lines = reqs.text - original_req = [] - additional_reqs = [] - req_dir = "/".join(req_path.split("/")[:-1]) - exclude_line = lambda line: any( - [line.strip().startswith(x) for x in ["-e .", "#", ".[test"]] - ) - - for line in lines.split("\n"): - if line.strip().startswith("-r"): - # Handle recursive requirements - file_name = line[len("-r") :].strip() - reqs_url = os.path.join( - SWE_BENCH_URL_RAW, - repo, - commit, - req_dir, - file_name, - ) - reqs = requests.get(reqs_url, headers=HEADERS) - if reqs.status_code == 200: - for line_extra in reqs.text.split("\n"): - if not exclude_line(line_extra): - additional_reqs.append(line_extra) - else: - if not exclude_line(line): - original_req.append(line) - - # Combine all requirements into single text body - additional_reqs.append("\n".join(original_req)) - all_reqs = "\n".join(additional_reqs) - - return all_reqs - - -def get_requirements(instance: SWEbenchInstance) -> str: - """ - Get requirements.txt for given task instance - - Args: - instance (dict): task instance - Returns: - requirements.txt (str): Returns requirements.txt as string - """ - # 
Attempt to find requirements.txt at each path based on task instance's repo - commit = ( - instance["environment_setup_commit"] - if "environment_setup_commit" in instance - else instance["base_commit"] - ) - - return get_requirements_by_commit(instance["repo"], commit) - - -def get_test_directives(instance: SWEbenchInstance) -> list: - """ - Get test directives from the test_patch of a task instance - - Args: - instance (dict): task instance - Returns: - directives (list): List of test directives - """ - # For seq2seq code repos, testing command is fixed - if instance["repo"] == "swe-bench/humaneval": - return ["test.py"] - - # Get test directives from test patch and remove non-test files - diff_pat = r"diff --git a/.* b/(.*)" - test_patch = instance["test_patch"] - directives = re.findall(diff_pat, test_patch) - directives = [ - d for d in directives if not any(d.endswith(ext) for ext in NON_TEST_EXTS) - ] - - # For Django tests, remove extension + "tests/" prefix and convert slashes to dots (module referencing) - if instance["repo"] == "django/django": - directives_transformed = [] - for d in directives: - d = d[: -len(".py")] if d.endswith(".py") else d - d = d[len("tests/") :] if d.startswith("tests/") else d - d = d.replace("/", ".") - directives_transformed.append(d) - directives = directives_transformed - - return directives - - def str2bool(v): """ Minor helper function to convert string to boolean @@ -331,3 +231,26 @@ def str2bool(v): return False else: raise ArgumentTypeError("Boolean value expected.") + + +def get_repo_file(repo, commit, filepath): + url = f'https://raw.githubusercontent.com/{repo}/{commit}/{filepath}' + try: + response = requests.get(url) + if response.status_code == 200: + return response.text + return None + except: + return None + + +def get_modified_files(patch: str) -> list[str]: + """ + Get the list of modified files in a patch + """ + source_files = [] + for file in PatchSet(patch): + if file.source_file != '/dev/null': + source_files.append(file.source_file) + source_files = [x[2:] for x in source_files if x.startswith('a/')] + return source_files
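
For orientation, here is a minimal, self-contained sketch of how the two helpers this patch adds to swebench/harness/utils.py (get_modified_files and run_threadpool) can be exercised on their own. The sample patch text and the process_file worker below are invented for illustration; in the harness itself, run_instances builds the payload tuples and passes run_instance as the worker.

    # Usage sketch (assumes the patched swebench package is importable).
    from swebench.harness.utils import get_modified_files, run_threadpool

    SAMPLE_PATCH = """\
    --- a/src/example.py
    +++ b/src/example.py
    @@ -1,2 +1,2 @@
    -x = 1
    +x = 2
    """

    def process_file(path: str, patch_text: str) -> None:
        # Stand-in worker: the real harness submits run_instance with its payload tuple.
        print(f"{path}: {len(patch_text.splitlines())} patch lines")

    modified = get_modified_files(SAMPLE_PATCH)            # -> ["src/example.py"]
    payloads = [(path, SAMPLE_PATCH) for path in modified]

    # run_threadpool fans func(*payload) out over a thread pool, tracks progress with
    # tqdm, and returns (succeeded, failed); max_workers <= 0 falls back to run_sequential.
    succeeded, failed = run_threadpool(process_file, payloads, max_workers=2)
    print(f"{len(succeeded)} succeeded, {len(failed)} failed")

Note that the sequential fallback mirrors the behavior of the threaded path (same return value and progress reporting), so callers can switch between the two simply by choosing max_workers.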