From 9a905437a67395c1a3e53dad1350f943d60baf74 Mon Sep 17 00:00:00 2001 From: Fabian Herzog Date: Tue, 17 Sep 2024 22:24:43 +0200 Subject: [PATCH] Public code release. --- .gitignore | 171 +++++++++++++ conf/config.yaml | 83 ++++++ conf/dataset/CityFlow.yaml | 8 + conf/encoder/precomputed.yaml | 1 + conf/experiment/CityFlow.yaml | 44 ++++ setup.py | 35 +++ src/__init__.py | 0 src/datasets/dataset.py | 389 ++++++++++++++++++++++++++++ src/tracker/encoder.py | 23 ++ src/tracker/geometry.py | 81 ++++++ src/tracker/similarities.py | 84 ++++++ src/tracker/solver.py | 65 +++++ src/tracker/supertrack.py | 254 ++++++++++++++++++ src/tracker/tracker.py | 467 ++++++++++++++++++++++++++++++++++ src/utils/evaluate.py | 260 +++++++++++++++++++ src/utils/iotools.py | 113 ++++++++ src/utils/utils.py | 208 +++++++++++++++ tools/track.py | 90 +++++++ 18 files changed, 2376 insertions(+) create mode 100644 .gitignore create mode 100644 conf/config.yaml create mode 100644 conf/dataset/CityFlow.yaml create mode 100644 conf/encoder/precomputed.yaml create mode 100644 conf/experiment/CityFlow.yaml create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 src/datasets/dataset.py create mode 100644 src/tracker/encoder.py create mode 100644 src/tracker/geometry.py create mode 100644 src/tracker/similarities.py create mode 100644 src/tracker/solver.py create mode 100644 src/tracker/supertrack.py create mode 100644 src/tracker/tracker.py create mode 100644 src/utils/evaluate.py create mode 100644 src/utils/iotools.py create mode 100644 src/utils/utils.py create mode 100644 tools/track.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e747d2f --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +# Project-specific +data/ +eval/ +resources/ +outputs/ +wandb/ + +.ruff_cache + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml new file mode 100644 index 0000000..78eb298 --- /dev/null +++ b/conf/config.yaml @@ -0,0 +1,83 @@ +# config.yaml +hydra/hydra_logging: null + +defaults: + - dataset: CityFlow + - encoder: precomputed + +dataset_path: ./data/AICITY/ +output_path: ./outputs/ + +device: cuda + +logging: + wandb: + enable: false + project: ggmc + upload_results: false + tags: null + tensorboard: + enable: false + +resources: + path: ./resources/ + detector: YOLOX + reid: null + +visuals: + plot_interval: 1 + plot_results: false + plot_ground_truth: false + plot_to_tensorboard: false + grid_rows: 2 + store_files: true + border_size: 3 + +solver: + backend: PD + +tracker: + matching: + distance_threshold: 0.02 + rescale_threshold: 0.65 + reid_decay: 1.0 + rescale_weight: 0.5 + distance_weight: 0.5 + confidence_thresh: 0.7 + low_confidence_thresh: null + patience: 1 + memory: 15 + fdim: 512 + enable_accumulator: true + prematching: + enabled: true + iou_bias: 0.60 + iou_threshold: 0.50 + prune_remaining: false + +preprocess: + nms_thresh: null + roi_filter: true + bottom: true + box_projection_centers: + alpha_w: null + alpha_h: null + +postprocess: + expand_boxes: + enable: true + factor: 1.4 + remove_borders: + enable: true + border_size: 5 + size_filter: + enable: true + min_size: 6220 + max_size: 622080 + +evaluation: + inplace: true + evaluate_standard: true + evaluate_hota: false + evaluate_bev: false + evaluate_external: true diff --git a/conf/dataset/CityFlow.yaml b/conf/dataset/CityFlow.yaml new file mode 100644 index 0000000..fff199a --- /dev/null +++ b/conf/dataset/CityFlow.yaml @@ -0,0 +1,8 @@ +name: AICITY +scene_path: ./validation/S02 +camera_pattern: c00* +img_path: ./img1/ +img_ext: jpg +offsets: [0, 0, 3, 8] +calibration_path: calibration.json +roi_path: 
"./data/AICITY/eval/ROIs/validation" diff --git a/conf/encoder/precomputed.yaml b/conf/encoder/precomputed.yaml new file mode 100644 index 0000000..868e1e1 --- /dev/null +++ b/conf/encoder/precomputed.yaml @@ -0,0 +1 @@ +name: precomputed diff --git a/conf/experiment/CityFlow.yaml b/conf/experiment/CityFlow.yaml new file mode 100644 index 0000000..7cf2af1 --- /dev/null +++ b/conf/experiment/CityFlow.yaml @@ -0,0 +1,44 @@ +# @package _global_ + +defaults: + - override /dataset: CityFlow + - override /encoder: precomputed + +dataset_path: ./data/AICITY/ + +resources: + reid: LCFractal + detector: YOLOX + +tracker: + matching: + distance_threshold: 0.001 + rescale_threshold: 0.7 + reid_decay: 0.7 + rescale_weight: 0.9 + confidence_thresh: 0.70 + low_confidence_thresh: null + patience: 0 + memory: 160 + fdim: 2048 + prematching: + enabled: false + iou_bias: 0.50 + iou_threshold: 0.70 + prune_remaining: false + +preprocess: + nms_thresh: 0.7 + roi_filter: true + +postprocess: + expand_boxes: + enable: true + factor: 1.4 + remove_borders: + enable: true + border_size: 0 + size_filter: + enable: true + min_size: 6000 + max_size: 600000 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..dce079e --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +from setuptools import find_packages, setup + + +setup( + name="stmc", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "hydra-core", + "torch", + "wandb", + "loguru", + "omegaconf", + "qqdm", + "pillow", + "ramapy", + ], + entry_points={ + "console_scripts": [ + "track=tools.track:main", + ], + }, + author="Fabian Herzog", + author_email="fabian.herzog@tum.de", + description="Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/fubel/stmc", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.8", +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/datasets/dataset.py b/src/datasets/dataset.py new file mode 100644 index 0000000..863fe95 --- /dev/null +++ b/src/datasets/dataset.py @@ -0,0 +1,389 @@ +import glob +import os +import pathlib +import warnings +from enum import IntEnum +from typing import List, Optional + +import numpy as np +import torch +from loguru import logger +from torch.utils.data import DataLoader +from torchvision.io import ImageReadMode, read_image +from torchvision.ops import nms + +from ..tracker.geometry import Projector +from ..utils.utils import compute_centers, resize_transform, tlwh_to_tlbr + + +class Annotation(IntEnum): + CAM_ID = 0 + OBJ_ID = 1 + FRAME_ID = 2 + XMIN = 3 + YMIN = 4 + WIDTH = 5 + HEIGHT = 6 + CONF = 7 + XWORLD = 8 + YWORLD = 9 + + +class NMSTransform: + def __init__(self, iou_threshold: float): + """Initialize the NMSTransform which applied non-maximum suppression to the + input annotations based on the specified IoU threshold. + + Args: + iou_threshold (float): The Intersection over Union (IoU) threshold for NMS. + Bounding boxes with IoU greater than this threshold will be suppressed. 
+ """ + self.iou_threshold = iou_threshold + + def __call__(self, annotations: torch.Tensor) -> torch.Tensor: + boxes = tlwh_to_tlbr(annotations[:, Annotation.XMIN : Annotation.HEIGHT + 1]) + scores = annotations[:, Annotation.CONF] + keep = nms(boxes, scores, self.iou_threshold) + return keep + + +class ROIFilter: + def __init__(self, roi_path: str): + """Initialize the ROIFilter. + + Args: + roi_path (str): Path to the ROI image file. + + The ROI (Region of Interest) image is loaded as a binary mask, + where 1 indicates areas of interest and 0 indicates areas to be filtered out. + """ + self.roi = read_image(roi_path, ImageReadMode.GRAY).squeeze(0).bool() + self.size = self.roi.size() + + def __call__(self, annotations: torch.Tensor) -> torch.Tensor: + centers = compute_centers(annotations[:, Annotation.XMIN - 1 : Annotation.HEIGHT]).int() + centers[:, 0] = torch.clamp(centers[:, 0], 0, self.size[1] - 1) + centers[:, 1] = torch.clamp(centers[:, 1], 0, self.size[0] - 1) + keep = self.roi[centers[:, 1], centers[:, 0]] == 1 + return keep + + +class MultiCamDataset: + def __init__( + self, + annotation_paths: List[str], + image_paths: List[str], + calibration_paths: List[str], + camera_names: List[int], + ground_truth_paths: Optional[List[str]] = None, + precomputed: bool = False, + nms_threshold: Optional[float] = 0.9, + time_offsets: Optional[List[int]] = None, + roi_paths: Optional[List[str]] = None, + normalize_bev: bool = False, + bottom: bool = True, + box_projection_centers=None, + ): + """Initialize the MultiCamDataset for data loading. + + Args: + annotation_paths (List[str]): Paths to annotation files for each camera. + image_paths (List[str]): Paths to image directories for each camera. + calibration_paths (List[str]): Paths to calibration files for each camera. + camera_names (List[int]): Names or IDs of the cameras. + ground_truth_paths (Optional[List[str]], optional): Paths to ground truth files. Defaults to None. + precomputed (bool, optional): Whether to use precomputed features. Defaults to False. + nms_threshold (Optional[float], optional): Non-maximum suppression threshold. Defaults to 0.9. + time_offsets (Optional[List[int]], optional): Time offsets for each camera. Defaults to None. + roi_paths (Optional[List[str]], optional): Paths to region of interest mask images. Defaults to None. + normalize_bev (bool, optional): Whether to normalize bird's-eye view coordinates. Defaults to False. + bottom (bool, optional): Whether to use bottom of bounding box for projection. Defaults to True. + box_projection_centers (Optional[Tuple[float, float]], optional): Projection centers for bounding boxes. Defaults to None. 
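+
+        Example (sketch with hypothetical paths; all lists are index-aligned,
+        one entry per camera):
+
+            >>> dataset = MultiCamDataset(
+            ...     annotation_paths=["resources/c001.txt", "resources/c002.txt"],
+            ...     image_paths=["data/S02/c001/img1", "data/S02/c002/img1"],
+            ...     calibration_paths=["data/S02/c001/calibration.json", "data/S02/c002/calibration.json"],
+            ...     camera_names=[1, 2],
+            ...     precomputed=True,
+            ...     time_offsets=[0, 3],
+            ... )
+            >>> sample = dataset[0]  # dict with annotations, 2D/3D positions, images, crops, ground truth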
+ """ + if time_offsets is None: + self.time_offsets = [0] * len(image_paths) + else: + self.time_offsets = time_offsets + + self.annotation_paths = annotation_paths + self.image_paths = image_paths + self.calibration_paths = calibration_paths + self.camera_names = camera_names + self.precomputed = precomputed + self.nms_transform = NMSTransform(nms_threshold) if nms_threshold is not None else None + self.box_projection_centers = box_projection_centers + self.bottom = bottom + + self.normalize_bev = normalize_bev + + if roi_paths is not None: + self.roi_filters = [ROIFilter(roi_path) for roi_path in roi_paths] + else: + self.roi_filters = None + + self._load_calibrations() + self._load_annotations() + + if ground_truth_paths is not None: + self._load_ground_truth(ground_truth_paths) + else: + self._ground_truths = None + self.gts = None + + self.length = max([len(list(pathlib.Path(image_path).glob("*.jpg"))) for image_path in self.image_paths]) + + if self.length == 0: + warnings.warn("No images found. Visualization tools will not be available.") + + self.length = 2110 + + self._filtered_by_nms = 0 + self._filtered_by_size = 0 + self._filtered_by_roi = 0 + + def _load_ground_truth(self, ground_truth_paths): + self._ground_truths = [ + torch.from_numpy(np.loadtxt(ground_truth_path, delimiter=",", dtype=np.float32)) + for ground_truth_path in ground_truth_paths + ] + + for gt in self._ground_truths: + if gt.shape[1] == 9: + # append another column of ones + gt = torch.cat((gt, torch.ones(gt.shape[0], 1)), dim=1) + + _cat_gts = [g.clone() for g in self._ground_truths] + for i, gt in enumerate(_cat_gts): + col = torch.ones((gt.shape[0], 1)) * i + _cat_gts[i] = torch.cat((col, gt), dim=1) + _cat_gts[i][:, 1] += self.time_offsets[i] + + self.gts = torch.cat(_cat_gts, dim=0) + self.gts[:, [1, 2]] = self.gts[:, [2, 1]] + + def _load_calibrations(self): + self._projectors = [Projector(calibration_path) for calibration_path in self.calibration_paths] + + def _load_annotations(self): + anns = [ + torch.from_numpy(np.loadtxt(annotation_path, delimiter=",", dtype=np.float32)) + for annotation_path in self.annotation_paths + ] + + # todo: add to preprocess config + for i, ann in enumerate(anns): + keep = (ann[:, Annotation.WIDTH - 1] * ann[:, Annotation.HEIGHT - 1]) >= 1200 + anns[i] = ann[keep] + + # filter roi images + if self.roi_filters is not None: + keep = self.roi_filters[i](anns[i]) + anns[i] = anns[i][keep] + logger.info(f"🔥 Filtered {keep.size(0) - keep.sum().item()} annotations by ROI.") + + for i, ann in enumerate(anns): + col = torch.ones((ann.shape[0], 1)) * i + anns[i] = torch.cat((col, ann), dim=1) + anns[i][:, 1] += self.time_offsets[i] + + positions_2d = [] + for i, ann in enumerate(anns): + pos2d = compute_centers( + ann[:, Annotation.XMIN : Annotation.HEIGHT + 1], self.bottom, self.box_projection_centers + ) + positions_2d.append(pos2d) + + positions_3d = [] + for i, pos2d in enumerate(positions_2d): + pos3d = self._projectors[i].image_to_world(pos2d) + positions_3d.append(pos3d) + + anns = torch.cat(anns, dim=0) + positions_2d = torch.cat(positions_2d, dim=0) + positions_3d = torch.cat(positions_3d, dim=0) + + if anns.shape[1] == 9: + # loaded from ground truth, append column of 1s as 7th column + anns = torch.cat( + ( + anns[:, :6], + torch.ones(anns.shape[0], 1), + anns[:, 6:], + ), + dim=1, + ) + # swap columns frame and obj_id + anns[:, [1, 2]] = anns[:, [2, 1]] + + self._annotations = anns + self._positions_2d = positions_2d + self._positions_3d = positions_3d + + if 
self.normalize_bev: + self.apply_bev_norm() + else: + self._norm_factors = None + + self._annotations.to("cuda") + self._positions_2d.to("cuda") + self._positions_3d.to("cuda") + + def get_bev_ticks(self): + return [ + float(torch.min(self._positions_3d[:, 0])), + float(torch.max(self._positions_3d[:, 0])), + float(torch.min(self._positions_3d[:, 1])), + float(torch.max(self._positions_3d[:, 1])), + ] + + def get_crops(self, frame_annotations, frame_images): + crops = [] + for ann in frame_annotations: + cam_id = int(ann[Annotation.CAM_ID]) + x, y, w, h = ann[Annotation.XMIN : Annotation.HEIGHT + 1].int() + # clamp to image dimensions + x = torch.clamp(x, 0, frame_images[cam_id].size(1) - 1) + y = torch.clamp(y, 0, frame_images[cam_id].size(2) - 1) + w = torch.clamp(w, 0, frame_images[cam_id].size(1) - x) + h = torch.clamp(h, 0, frame_images[cam_id].size(2) - y) + crops.append(resize_transform(frame_images[cam_id][:, y : y + h, x : x + w])) + if len(crops) == 0: + return torch.empty(0) + return torch.stack(crops) + + def apply_bev_norm(self): + # normalize BEV positions to [0, 1] + logger.info("📏 Normalizing BEV positions to [0, 1].") + min_x, min_y = torch.min(self._positions_3d, dim=0)[0] + max_x, max_y = torch.max(self._positions_3d, dim=0)[0] + self._norm_factors = torch.tensor([min_x, min_y, max_x, max_y]) + self._positions_3d = (self._positions_3d - torch.tensor([min_x, min_y])) / torch.tensor( + [max_x - min_x, max_y - min_y] + ) + + def __len__(self): + return self.length + + def __getitem__(self, idx): + frame = idx + 1 + + annotations = self._annotations[self._annotations[:, Annotation.FRAME_ID] == frame] + positions_2d = self._positions_2d[self._annotations[:, Annotation.FRAME_ID] == frame] + positions_3d = self._positions_3d[self._annotations[:, Annotation.FRAME_ID] == frame] + + if self.gts is not None: + ground_truth = self.gts[self.gts[:, Annotation.FRAME_ID] == frame] + else: + ground_truth = torch.empty(0) + + if self.nms_transform is not None: + keep = self.nms_transform(annotations) + else: + keep = torch.arange(annotations.size(0)) + + annotations = annotations[keep] + positions_2d = positions_2d[keep] + positions_3d = positions_3d[keep] + + frame_images = [] + for img_path, offset in zip(self.image_paths, self.time_offsets): + try: + frame_images.append(read_image(str(pathlib.Path(img_path) / f"{(frame - offset):06d}.jpg"))) + except Exception: + frame_images.append(torch.zeros(3, 1080, 1920).to(torch.uint8)) + + if not self.precomputed: + frame_crops = self.get_crops(annotations, frame_images) + else: + frame_crops = torch.empty(0) + + return { + "annotations": annotations, + "positions_2d": positions_2d, + "positions_3d": positions_3d, + "images": frame_images, + "crops": frame_crops, + "ground_truth": ground_truth, + } + + +def create_dataloader(cfg): + scene_path = os.path.join(cfg.dataset_path, cfg.dataset.scene_path) + cameras = [ + os.path.basename(f) + for f in sorted(glob.glob(os.path.join(scene_path, cfg.dataset.camera_pattern))) + if os.path.isdir(f) + ] + + img_paths = [ + os.path.join(cfg.dataset_path, cfg.dataset.scene_path, camera, cfg.dataset.img_path) for camera in cameras + ] + calibration_paths = [ + os.path.join( + cfg.dataset_path, + cfg.dataset.scene_path, + camera, + cfg.dataset.calibration_path, + ) + for camera in cameras + ] + annotation_paths = [] + for camera in cameras: + if cfg.resources.reid is not None: + scene_path = "-".join(pathlib.Path(cfg.dataset.scene_path).parts) + if scene_path[-1] == "-": + scene_path = scene_path[:-1] + 
resource_name = ( + f"{cfg.dataset.name}_{scene_path}-{camera}_{cfg.resources.detector}_{cfg.resources.reid}.txt" + ) + else: + resource_name = f"{cfg.dataset.name}-{camera}_{cfg.resources.detector}.txt" + annotation_paths.append(os.path.join(cfg.resources.path, resource_name)) + + if cfg.preprocess.nms_thresh is not None: + nms_threshold = cfg.preprocess.nms_thresh + else: + nms_threshold = None + + if cfg.preprocess.roi_filter is not None and "roi_path" in cfg.dataset: + roi_paths = [os.path.join(cfg.dataset.roi_path, camera, "roi.jpg") for camera in cameras] + else: + roi_paths = None + + ground_truth_paths = None + + time_offsets = None + if "offsets" in cfg.dataset: + if cfg.dataset.offsets is not None: + time_offsets = cfg.dataset.offsets + + box_projection_centers = [ + cfg.preprocess.box_projection_centers.alpha_w, + cfg.preprocess.box_projection_centers.alpha_h, + ] + + if box_projection_centers[0] is None: + box_projection_centers = None + elif box_projection_centers[1] is None: + box_projection_centers[1] = 1 - box_projection_centers[0] + + dataset = MultiCamDataset( + annotation_paths=annotation_paths, + image_paths=img_paths, + calibration_paths=calibration_paths, + camera_names=cameras, + ground_truth_paths=ground_truth_paths, + precomputed=cfg.encoder.name == "precomputed", + nms_threshold=nms_threshold, + time_offsets=time_offsets, + roi_paths=roi_paths, + bottom=cfg.preprocess.bottom, + box_projection_centers=box_projection_centers, + ) + dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=8, + ) + return dataloader diff --git a/src/tracker/encoder.py b/src/tracker/encoder.py new file mode 100644 index 0000000..619660b --- /dev/null +++ b/src/tracker/encoder.py @@ -0,0 +1,23 @@ +import sys +import warnings + +import torch +import torch.nn.functional as F +import torchvision + + +class Precomputed: + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, x): + features = x["annotations"][:, 11:] + return F.normalize(features, p=2, dim=1) + + +def create_encoder(cfg, device): + print(cfg) + if cfg.name == "precomputed": + return Precomputed(cfg) + else: + raise ValueError(f"Encoder {cfg.name} not found.") diff --git a/src/tracker/geometry.py b/src/tracker/geometry.py new file mode 100644 index 0000000..8a78e6c --- /dev/null +++ b/src/tracker/geometry.py @@ -0,0 +1,81 @@ +import json +import os + +import torch + + +class Projector: + def __init__(self, calibration_path: str): + """ + Initialize a Projector object. The projector is used to project points between image and world coordinates. + + Args: + calibration_path (str): Path to the calibration file (JSON). + + Raises: + FileNotFoundError: If the calibration file is not found. + ValueError: If the homography is not found in the calibration file. 
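+
+        Example (illustrative; the path and pixel values are made up):
+
+            >>> projector = Projector("data/S02/c001/calibration.json")
+            >>> pixels = torch.tensor([[960.0, 1000.0]])   # (N, 2) image points
+            >>> ground = projector.image_to_world(pixels)  # (N, 2) ground-plane coordinates
+            >>> back = projector.world_to_image(ground)    # approximately the original pixels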
+ """ + if os.path.exists(calibration_path) is False: + raise FileNotFoundError(f"Calibration file not found at path: {calibration_path}") + self.calibration_path = calibration_path + + with open(calibration_path, "r") as f: + calibration = json.load(f) + try: + homography_keys = [ + "homography", + "H", + "homography_matrix", + "homography matrix", + ] + valid_homography_key = set(homography_keys).intersection(set(calibration.keys())).pop() + except KeyError: + raise ValueError("Homography not found in calibration file.") + self._homography = torch.Tensor(calibration[valid_homography_key]) + self._inverse_homography = torch.inverse(self._homography) + + def image_to_world(self, points: torch.Tensor) -> torch.Tensor: + """Projects image points to world coordinates. + + Args: + points (torch.Tensor): Image points Nx2. + + Returns: + torch.Tensor: World points Nx3. + """ + if points.dim() != 2: + points = points.view(-1, 2) + if points.size(1) != 2: + raise ValueError(f"Expected image points to be of shape (N, 2), but got {points.shape}.") + return self._homography_image_to_world(points) + + def world_to_image(self, points: torch.Tensor) -> torch.Tensor: + """Projects world points to image coordinates. + + Args: + points (torch.Tensor): World points Nx3. + + Returns: + torch.Tensor: Image points Nx2. + """ + if points.dim() != 2: + points = points.view(-1, 3) + if points.size(1) != 3: + points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) + return self._homography_world_to_image(points) + + def _homography_image_to_world(self, points: torch.Tensor) -> torch.Tensor: + points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) + device = points.device + homography = self._inverse_homography.to(device) + projected_points = torch.matmul(homography, points.t()).t() + projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) + return projected_points + + def _homography_world_to_image(self, points: torch.Tensor) -> torch.Tensor: + device = points.device + homography = self._homography.to(device) + projected_points = torch.matmul(homography, points.t()).t() + projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) + return projected_points diff --git a/src/tracker/similarities.py b/src/tracker/similarities.py new file mode 100644 index 0000000..1f148b8 --- /dev/null +++ b/src/tracker/similarities.py @@ -0,0 +1,84 @@ +import torch +from torchvision.ops import box_iou + + +def cosine_similarity(a, b, eps=1e-8): + """ + Compute pairwise appearance distance between features. + from https://stackoverflow.com/a/58144658 + """ + a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] + a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) + b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) + sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) + return sim_mt + + +def batch_cosine_similarity(a, b, eps=1e-8): + """Compute batched pairwise appearance distance between features. + + Args: + a (torch.Tensor): (B, N, feature_dim) tensor. + b (torch.Tensor): (B, N, feature_dim) tensor. + eps (float, optional): Epsilon to prevent division by zero. Defaults to 1e-8. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise similarities. + """ + # Compute norms along feature dimension and add new dimensions needed for broadcasting + a_n = a.norm(dim=2)[:, :, None] + b_n = b.norm(dim=2)[:, :, None] + + # Perform normalization and prevent division by zero. 
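+    # Shape note (assumed convention): a is (B, N, D) and b is (B, M, D), so a_n and
+    # b_n broadcast as (B, N, 1) and (B, M, 1), and the bmm below yields a (B, N, M)
+    # similarity matrix (N == M when a track set is compared with itself).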
+ a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) + b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) + + # Compute similarity matrix using batch matrix multiplication. + sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2)) + return sim_mt + + +def batched_box_iou(boxes): + """Compute batched pairwise IoU between boxes. + + Args: + boxes (torch.Tensor): (B, N, 4) tensor of boxes. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise IoU. + """ + ious = [] + for sub_boxes in boxes: + ious.append(box_iou(sub_boxes, sub_boxes)) + return torch.stack(ious) + + +def bev_distance(bev_positions): + """Compute distance between positions on ground plane. + + Args: + bev_positions (torch.Tensor): (N, 2) tensor of positions. + + Returns: + torch.Tensor: (N, N) tensor of pairwise similarities. + """ + return torch.norm(bev_positions[:, None] - bev_positions[None, :], dim=2) + + +def batch_bev_distance(bev_positions): + """Compute batched distance similarity between positions on ground plane. + + Args: + bev_positions (torch.Tensor): (B, N, 2) tensor of positions. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise similarities. + """ + # Subtract positions across the batch, adding extra dimensions for broadcasting + diff = bev_positions[:, :, None] - bev_positions[:, None, :] + + # Compute norm along the last dimension (x and y coordinates) + norm = torch.norm(diff, dim=-1) + + # Return similarity + return norm diff --git a/src/tracker/solver.py b/src/tracker/solver.py new file mode 100644 index 0000000..352af93 --- /dev/null +++ b/src/tracker/solver.py @@ -0,0 +1,65 @@ +import rama_py +import torch + + +def multicut(edge_index, edge_weights, opts): + """Solves a multicut problem based on the RAMA algorithm. + + The edge_index is expected in the usual torch_geometric format. + Note that RAMA requires u < v for each edge (u, v) in the graph. + + Args: + edge_index (LongTensor): 2xE LongTensor of edge indices. + edge_weights (LongTensor): E LongTensor of edge weights. + + Returns: + LongTensor: N LongTensor of node labels, where N is the number + of nodes in the graph. + """ + if (edge_index[0] > edge_index[1]).any(): + raise ValueError("Solver expects u < v for each edge (u, v) in the graph.") + if edge_index.device.index is None: + raise ValueError("Solver runs on CUDA device only. Please move data to CUDA.") + if edge_index.shape[1] == 0: + return torch.empty(0).to("cuda") + i = edge_index[0].to(torch.int32) + j = edge_index[1].to(torch.int32) + costs = edge_weights.to(torch.float32) + num_nodes = torch.max(edge_index) + 1 + num_edges = edge_index.shape[1] + node_labels = torch.ones(num_nodes, device=i.device).to(torch.int32) + rama_py.rama_cuda_gpu_pointers( + i.data_ptr(), + j.data_ptr(), + costs.data_ptr(), + node_labels.data_ptr(), + num_nodes, + num_edges, + i.device.index, + opts, + ) + return node_labels + + +def scale_weights(weights, threshold=0.7): + """Scales the given weights to the range [-1, 1] based on the given threshold. + + Args: + weights (FloatTensor): LongTensor of edge weights. + threshold (float, optional): Threshold for scaling. Defaults to 0.4. + + Returns: + FloatTensor: LongTensor of scaled edge weights. 
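+
+    Example (illustrative numbers): with threshold=0.7, a weight of 0.7 maps to 0.0,
+    1.0 maps to 1.0, and 0.35 maps to -0.5, so sub-threshold similarities become
+    negative (repulsive) costs for the multicut solver.
+
+        >>> scale_weights(torch.tensor([0.35, 0.70, 1.00]), threshold=0.7)
+        tensor([-0.5000,  0.0000,  1.0000])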
+ """ + y = weights.clone() + z = weights.clone() + z[y == threshold] = 0.0 + z[y > threshold] = (y[y > threshold] - threshold) / (1 - threshold) + z[y < threshold] = (y[y < threshold] - threshold) / (threshold) + return z + + +def create_solver(backend): + opts = rama_py.multicut_solver_options(backend) + opts.verbose = False + return opts diff --git a/src/tracker/supertrack.py b/src/tracker/supertrack.py new file mode 100644 index 0000000..dffea0d --- /dev/null +++ b/src/tracker/supertrack.py @@ -0,0 +1,254 @@ +from enum import IntEnum + +import torch + +from ..utils.utils import tlwh_to_tlbr + + +class TrackState(IntEnum): + CREATED = 0 # Track is created but not confirmed yet + ACTIVE = 1 # Track is confirmed and active + LOST = 3 # Track is lost and not tracked, but kept in memory + KILLED = 4 # Track is killed (e.g. due to merging with another track) + + +class SuperTrack: + def __init__( + self, + frame, + features, + boxes, + positions_2d, + positions_3d, + confidence=None, + ): + self.frame = frame + self.last_update = frame + + self.n_cams = features.size(0) + self.features = features + self.boxes = boxes + self.positions_2d = positions_2d + self.positions_3d = positions_3d + + self.label = None + self.__state = TrackState.CREATED # private state variable + + # inactivity counter: how many frames since last update at each camera + self.inactive_since = torch.zeros(self.n_cams, device=features.device) + + self.lost_since = 0 + + # where to continue tracking: if False, track is not continued in this camera + self.track_where = torch.ones(self.n_cams, device=features.device).bool() + self.track_where[torch.isnan(features).any(dim=1)] = False + + # cams the track hasn't been seen in yet + self.queries = torch.ones(self.n_cams, device=features.device).bool() + + # count updates for each camera + self.ticks = torch.ones(self.n_cams, device=features.device) + + self.confidence = confidence + + self.velocities_2d = torch.zeros((self.n_cams, 4), device=features.device) + self.velocities_3d = torch.zeros((self.n_cams, 2), device=features.device) + + @classmethod + def empty(cls, n_cams, fdim, device): + return cls( + frame=None, + features=torch.full((n_cams, fdim), float("nan"), device=device), + boxes=torch.full((n_cams, 4), float("nan"), device=device), + positions_2d=torch.full((n_cams, 2), float("nan"), device=device), + positions_3d=torch.full((n_cams, 3), float("nan"), device=device), + ) + + def activate(self): + self.__state = TrackState.ACTIVE + + def deactivate(self): + self.__state = TrackState.LOST + + def kill(self): + self.__state = TrackState.KILLED + + def reset(self, cams=None): + if cams is None: + cams = range(self.n_cams) + for cam in cams: + self.track_where[cam] = False + # self.inactive_since[cam] = 0 + + def set_label(self, label): + if self.label is not None: + raise ValueError(f"Track {self} is already labeled.") + self.label = label + + @property + def keys(self): + return ~self.queries + + @property + def state(self): + return self.__state + + @property + def tlbr(self): + return tlwh_to_tlbr(self.boxes) + + def is_complete(self): + return ~torch.isnan(self.features).any() + + @property + def p_features(self): + return self.phantomize(self.features) + + @property + def p_positions(self): + return self.phantomize(self.positions_3d) + + @property + def mean_positions_3d(self): + return torch.nanmean(self.positions_3d, dim=0) + + @staticmethod + def phantomize(tensor): + """ + Given a (B, n_cams, f_dim) tensor, replace nans with the average of + the non-nan values 
along the cam axis. + """ + return torch.where(torch.isnan(tensor), torch.nanmean(tensor, dim=0, keepdim=True), tensor) + + def update(self, other): + n_cams = self.features.size(0) + if self.frame == other.frame: + for cam in range(n_cams): + if torch.isnan(self.features[cam]).any(): + if torch.isnan(other.features[cam]).any(): + continue + self.features[cam] = other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] = other.ticks[cam] + else: + if not torch.isnan(other.features[cam]).any(): + raise ValueError(f"Found violation of constraints for track update with {self}.") + elif self.frame < other.frame: + for cam in range(n_cams): + if not torch.isnan(other.features[cam]).any(): + if not torch.isnan(self.features[cam]).any(): + if self.velocities_2d[cam].sum() == 0: + w = 1.0 + else: + w = 0.8 + self.velocities_2d[cam] = ( + w * (other.boxes[cam] - self.boxes[cam]) / (other.frame - self.frame) + + (1 - w) * self.velocities_2d[cam] + ) + self.velocities_3d[cam] = ( + w * (other.positions_3d[cam] - self.positions_3d[cam]) / (other.frame - self.frame) + + (1 - w) * self.velocities_3d[cam] + ) + self.features[cam] = 0.9 * self.features[cam] + 0.1 * other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] += 1 + else: + self.features[cam] = other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] = other.ticks[cam] + else: + if self.track_where[cam]: + self.inactive_since[cam] += 1 + else: + raise ValueError( + f"Frame of other must be greater or equal to frame of self, but got {self.frame} and {other.frame}." + ) + self.last_update = other.frame + self.frame = other.frame + + if self.state == TrackState.LOST: + self.activate() + + def predict(self): + for cam in range(self.n_cams): + if ~self.track_where[cam]: + continue + prd_box = self.boxes[cam] + self.velocities_2d[cam] + prd_pos = self.positions_3d[cam] + self.velocities_3d[cam] + if prd_box[2] <= 0 or prd_box[3] <= 0: + prd_box = self.boxes[cam] + prd_pos = self.positions_3d[cam] + self.boxes[cam] = prd_box + self.positions_3d[cam] = prd_pos + + def merge(self, other): + if other.state == TrackState.KILLED or self.state == TrackState.KILLED: + raise ValueError("Cannot merge killed tracks.") + if other.frame < self.frame: + raise ValueError( + f"Other track must not be older than self, but " + f"self is at frame {self.frame} and other at frame {other.frame}." 
+ ) + self.update( + other.frame, + other.features, + other.boxes, + other.positions_2d, + other.positions_3d, + ) + # other was merged into self, so it is killed + other.kill() + + def split(self, where: torch.Tensor): + # keep the cams where "where" is True + other_features = self.features.clone() + other_boxes = self.boxes.clone() + other_positions_2d = self.positions_2d.clone() + other_positions_3d = self.positions_3d.clone() + for w in where: + if not w: + self.features[w] = torch.nan + self.boxes[w] = torch.nan + self.positions_2d[w] = torch.nan + self.positions_3d[w] = torch.nan + else: + other_features[w] = torch.nan + other_boxes[w] = torch.nan + other_positions_2d[w] = torch.nan + other_positions_3d[w] = torch.nan + return SuperTrack( + frame=self.frame, + features=other_features, + boxes=other_boxes, + positions_2d=other_positions_2d, + positions_3d=other_positions_3d, + ) + + def __repr__(self): + return f"Track {self.label}" + + def to_tensor(self): + output = [] + if self.state == TrackState.LOST: + return torch.Tensor(output) + for i, box in enumerate(self.boxes): + if ~self.track_where[i]: + continue + row = [i, self.label, self.frame, *box, *self.mean_positions_3d] + output.append(row) + return torch.Tensor(output) diff --git a/src/tracker/tracker.py b/src/tracker/tracker.py new file mode 100644 index 0000000..3e53430 --- /dev/null +++ b/src/tracker/tracker.py @@ -0,0 +1,467 @@ +import statistics +import time +from typing import Any, List, Optional, Tuple + +import motmetrics as mm +import torch +from omegaconf import DictConfig +from scipy.optimize import linear_sum_assignment +from torchvision.ops import box_iou + +from .similarities import batch_bev_distance, batch_cosine_similarity, batched_box_iou +from .solver import multicut, scale_weights +from .supertrack import SuperTrack, TrackState + + +class Tracker: + def __init__( + self, + solver_opts: Any, + cfg: DictConfig, + n_cams: int, + feature_extractor: Optional[torch.nn.Module] = None, + device: Optional[torch.device] = "cpu", + ): + self.feature_extractor = feature_extractor + self.solver_opts = solver_opts + self.device = device + + self.current_data = None + + self.feature_dim = cfg.tracker.fdim + self.n_cams = n_cams + self.cfg = cfg.tracker + + self.tracks: List[SuperTrack] = [] + + self.frame = 0 + self.free_id = 1 + + self.latency = [] + + self.update_interval = 1 + self.stats = { + "# Killed": 0, + "Latency": 0, + } + + self.cumulative_execution_time = 0 + + def step(self, sample): + # move sample to device and remove batch dimension + t0 = time.time() + for key in sample.keys(): + if key != "images": + sample[key] = sample[key].to(self.device).squeeze(0) + self.frame += 1 + if self.frame % self.update_interval == 0: + if sample["annotations"].size(0) > 0: + matched, unmatched = self.update(sample) + self._handle_unmatched(unmatched) + + t1 = time.time() + self.cumulative_execution_time += t1 - t0 + self.latency.append(t1 - t0) + + self._sanitize() + + rresults = self.get_result() + + self.predict() + + presults = self.get_result() + + return rresults, presults + + def update(self, sample): + features = self.feature_extractor(sample) + superboxes = self._new_superboxes_from_data(sample, features) + superboxes = [s for s in superboxes if s.confidence >= self.cfg.confidence_thresh] + + relevant_tracks = self.tracks + superboxes + _track_indices = torch.arange(len(self.tracks)).to(self.device) + _superbox_indices = torch.arange(len(self.tracks), len(relevant_tracks)).to(self.device) + + low_conf_indices = None 
+ + if self.cfg.low_confidence_thresh is not None: + c1 = self.cfg.low_confidence_thresh + c2 = self.cfg.confidence_thresh + low_conf_superboxes = [s for s in superboxes if c1 <= s.confidence < c2] + + if len(low_conf_superboxes) > 0: + n_relevant = len(relevant_tracks) + relevant_tracks += low_conf_superboxes + low_conf_indices = torch.arange(n_relevant, n_relevant + len(low_conf_superboxes)) + + if len(relevant_tracks) == 0: + return [], [] + + features = torch.stack([track.p_features for track in relevant_tracks]) # (n_tracks, n_cams, feature_dim) + positions = torch.stack([track.p_positions for track in relevant_tracks]) # (n_tracks, n_cams, 2) + boxes = torch.stack([track.tlbr for track in relevant_tracks]) # (n_tracks, n_cams, 4) + + # compute (n_tracks) x (n_tracks) similarity matrix + similarities = self._compute_similarities(features, positions, boxes) + + # compute weighted graph + rescale_thresh = self.cfg.matching.rescale_threshold + dist_thresh = self.cfg.matching.distance_threshold + iou_bias = self.cfg.prematching.iou_bias if self.cfg.prematching.enabled else 0 + edge_index, edge_weights = self._build_weighted_graph( + relevant_tracks, + similarities, + rescale_thresh, + dist_thresh, + iou_bias, + reid_decay=self.cfg.matching.reid_decay, + ) + labels = multicut(edge_index, edge_weights, self.solver_opts) + + matched_tracks, unmatched_tracks = self._match(relevant_tracks, labels, low_conf_indices=low_conf_indices) + + self.tracks = matched_tracks + unmatched_tracks + return matched_tracks, unmatched_tracks + + def _handle_unmatched(self, unmatched_tracks): + for track in unmatched_tracks: + for cam in range(self.n_cams): + if track.track_where[cam]: + track.inactive_since[cam] += 1 + + def predict(self): + """ + Project existing tracks into the future. + """ + for track in self.tracks: + track.predict() + + def _new_superboxes_from_data(self, sample, sample_features): + """ + Given a sample and its features, create new superboxes. 
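+
+        Each detection becomes a single-camera SuperTrack: every per-camera slot
+        is initialised to NaN, and only the detection's source camera is filled
+        with its feature vector, box, and 2D/3D position.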
+ """ + n_rows = sample_features.shape[0] + + features = torch.full((n_rows, self.n_cams, self.feature_dim), float("nan"), device=self.device) + boxes = torch.full((n_rows, self.n_cams, 4), float("nan"), device=self.device) + positions_2d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) + positions_3d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) + + cam_ids = sample["annotations"][:, 0].int() + features[torch.arange(n_rows), cam_ids] = sample_features + boxes[torch.arange(n_rows), cam_ids] = sample["annotations"][:, 3:7] + positions_2d[torch.arange(n_rows), cam_ids] = sample["positions_2d"] + positions_3d[torch.arange(n_rows), cam_ids] = sample["positions_3d"] + confidences = sample["annotations"][:, 7] + + superboxes = [ + SuperTrack( + frame=self.frame, + features=features[row], + boxes=boxes[row], + positions_2d=positions_2d[row], + positions_3d=positions_3d[row], + confidence=confidences[row], + ) + for row in range(n_rows) + ] + + return superboxes + + def _merge_tracks(self, tracks): + _frames = sorted({track.frame for track in tracks}) + + newest_frame = _frames[-1] + if len(_frames) > 1: + penult_frame = _frames[-2] + + assert tracks[-1].frame == newest_frame + + newest_evidence = [track for track in tracks if track.frame == newest_frame] + + features = (torch.ones(self.n_cams, self.feature_dim) * (torch.nan)).to(self.device) + boxes = (torch.ones(self.n_cams, 4) * (torch.nan)).to(self.device) + positions_2d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) + positions_3d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) + track_where = torch.zeros(self.n_cams, dtype=torch.bool).to(self.device) + + for cam_id in range(self.n_cams): + for track in newest_evidence: + if not torch.isnan(track.features[cam_id]).any(): + features[cam_id] = track.features[cam_id] + boxes[cam_id] = track.boxes[cam_id] + positions_2d[cam_id] = track.positions_2d[cam_id] + positions_3d[cam_id] = track.positions_3d[cam_id] + track_where[cam_id] = True + break + + merged_track = SuperTrack( + frame=newest_frame, + features=features, + boxes=boxes, + positions_2d=positions_2d, + positions_3d=positions_3d, + ) + + if len(_frames) > 1: + penult_track = [track for track in tracks if track.frame == penult_frame][0] + penult_track.update(merged_track) + merged_track = penult_track + + return merged_track + + def _match(self, tracks, labels, low_conf_indices=None): + """ + Match superboxes with superboxes, and merged + superboxes with existing supertracks in one cut. 
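+
+        Singleton clusters of fresh superboxes become new tracks, singleton
+        clusters of existing tracks are returned as unmatched, and multi-member
+        clusters are fused via `_merge_tracks`, which keeps the newest
+        per-camera evidence.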
+ """ + new_tracks = [] + unmatched_tracks = [] + + for label in torch.unique(labels): + track_indices = torch.where(labels == label)[0].tolist() + if len(track_indices) == 1: + track = tracks[track_indices[0]] + if low_conf_indices is not None and track_indices[0] in low_conf_indices: + continue + if track.state == TrackState.CREATED: + new_tracks.append(track) + else: + unmatched_tracks.append(track) + else: + if low_conf_indices is None: + relevant_tracks = sorted([tracks[i] for i in track_indices], key=lambda x: x.frame) + else: + relevant_tracks = sorted( + [tracks[i] for i in track_indices if i not in low_conf_indices], key=lambda x: x.frame + ) + merged_track = self._merge_tracks(relevant_tracks) + if low_conf_indices is not None and not merged_track.is_complete(): + relevant_low_conf_tracks = [tracks[i] for i in track_indices if i in low_conf_indices] + merged_track = self._merge_tracks([merged_track] + relevant_low_conf_tracks) + new_tracks.append(merged_track) + + return new_tracks, unmatched_tracks + + @staticmethod + def _compute_similarities(features, positions, boxes): + """Compute similarity matrices for features, positions, and boxes. + + Args: + features (torch.Tensor): (n_tracks, n_cams, feature_dim) tensor. + positions (torch.Tensor): (n_tracks, n_cams, 2) tensor. + boxes (torch.Tensor): (n_tracks, n_cams, 4) tensor. + + Returns: + Tuple[torch.Tensor]: Tuple of similarity matrices. + """ + # permute to (n_cams, n_tracks, feature_dim), (n_cams, n_tracks, 2), (n_cams, n_tracks, 4) + features = features.permute(1, 0, 2) + positions = positions.permute(1, 0, 2) + boxes = boxes.permute(1, 0, 2) + + # compute pairwise similarities (n_cams, n_tracks, n_tracks) + feature_sim = batch_cosine_similarity(features, features) + position_dist = batch_bev_distance(positions) + iou_sim = batched_box_iou(boxes) + + # average-pool similarities to (n_tracks, n_tracks) + feature_sim = torch.nanmean(feature_sim, dim=0) + position_dist = torch.nanmean(position_dist, dim=0) + iou_sim = torch.nanmean(iou_sim, dim=0) + + return feature_sim, position_dist, iou_sim + + def _build_weighted_graph( + self, + tracks: List[SuperTrack], + similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + rescale_thresh: float, + dist_thresh: float, + iou_bias: float, + reid_decay: float = 1, + penalty: float = -100, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Builds a weighted graph from the given tracks and similarity matrices. + + Args: + tracks (List[SuperTrack]): List of tracks. + similarities (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tuple of similarity matrices (appearance, position, IoU). + rescale_thresh (float): Threshold for rescaling weights. + dist_thresh (float): Distance threshold for feasibility. + iou_bias (float): Bias to add for IoU-based matching. + reid_decay (float, optional): Decay factor for ReID scores. Defaults to 1. + penalty (float, optional): Penalty for infeasible edges. Defaults to -100. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Edge indices and edge weights of the graph. 
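+
+        Example (illustrative values): for three nodes where 0-1 attract and
+        0-2 repel, the returned pair could look like
+
+            edge_index   = tensor([[0, 0],
+                                   [1, 2]])
+            edge_weights = tensor([0.8, -100.0])
+
+        i.e. an upper-triangular (u < v) edge list with signed costs, as
+        expected by `multicut`.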
+ """ + adj = self._initialize_adjacency_matrix(similarities, tracks, reid_decay, rescale_thresh, dist_thresh) + + if self.cfg.prematching.enabled: + adj = self._apply_prematching(adj, tracks, iou_bias) + + adj = self._finalize_adjacency_matrix(adj, penalty, tracks) + + return self._get_edge_index_and_weights(adj) + + def _initialize_adjacency_matrix( + self, + similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + tracks: List[SuperTrack], + reid_decay: float, + rescale_thresh: float, + dist_thresh: float, + ) -> torch.Tensor: + appearance_sim, position_dist, _ = similarities + device = appearance_sim.device + + frame_support_pairs = [(track.frame, track.track_where) for track in tracks] + frames, supports = zip(*frame_support_pairs) + + times = torch.tensor(frames, dtype=torch.int, device=device) + lost = torch.tensor([track.state == TrackState.LOST for track in tracks], device=device) + lost_since = torch.tensor([track.lost_since for track in tracks], device=device) + + appearance_sim = appearance_sim * reid_decay**lost_since + appearance_sim = scale_weights(appearance_sim, rescale_thresh) + + combined_sim = self.cfg.matching.rescale_weight * appearance_sim + self.cfg.matching.distance_weight * ( + 1 - position_dist / dist_thresh + ) + + adj = torch.zeros_like(appearance_sim) + lmask = lost[:, None] | lost[None, :] + same_time = times[:, None] == times[None, :] + feasible = (position_dist < dist_thresh) | lmask + + adj[same_time & feasible] = torch.clip(combined_sim[same_time & feasible], min=0, max=1) + adj[~same_time] = combined_sim[~same_time] + adj[lmask] = combined_sim[lmask] + + return adj + + def _apply_prematching(self, adj: torch.Tensor, tracks: List[SuperTrack], iou_bias: float) -> torch.Tensor: + cur_frame = max(track.frame for track in tracks) + pen_frame = cur_frame - 1 + cur_track_idx_by_cam = [[] for _ in range(self.n_cams)] + pen_track_idx_by_cam = [[] for _ in range(self.n_cams)] + + for i, track in enumerate(tracks): + if track.frame == cur_frame: + for cam in range(self.n_cams): + if not torch.isnan(track.boxes[cam]).any(): + cur_track_idx_by_cam[cam].append(i) + elif track.frame == pen_frame: + for cam in range(self.n_cams): + if not torch.isnan(track.boxes[cam]).any(): + pen_track_idx_by_cam[cam].append(i) + + for cam in range(self.n_cams): + cur_boxes_cam = [tracks[i].tlbr[cam] for i in cur_track_idx_by_cam[cam]] + pen_boxes_cam = [tracks[i].tlbr[cam] for i in pen_track_idx_by_cam[cam]] + + if not cur_boxes_cam or not pen_boxes_cam: + continue + + iou_dist = 1 - box_iou(torch.stack(cur_boxes_cam), torch.stack(pen_boxes_cam)) + row_ind, col_ind = linear_sum_assignment(iou_dist.cpu().numpy()) + + for r, c in zip(row_ind, col_ind): + if iou_dist[r, c] > self.cfg.prematching.iou_threshold: + continue + cur_idx = cur_track_idx_by_cam[cam][r] + if self.cfg.prematching.prune_remaining: + adj[cur_idx] = 0 + adj[:, cur_idx] = 0 + adj[cur_idx, pen_track_idx_by_cam[cam][c]] += iou_bias + adj[pen_track_idx_by_cam[cam][c], cur_idx] += iou_bias + + return adj + + def _finalize_adjacency_matrix(self, adj: torch.Tensor, penalty: float, tracks: List[SuperTrack]) -> torch.Tensor: + frame_support_pairs = [(track.frame, track.track_where) for track in tracks] + frames, supports = zip(*frame_support_pairs) + + times = torch.tensor(frames, dtype=torch.int, device=adj.device) + supps = torch.stack(supports).to(adj.device) + + same_time = times[:, None] == times[None, :] + same_supp = (supps[:, None] & supps[None, :]).any(dim=2) + + adj[same_time & same_supp] = penalty + adj = 
adj * torch.triu(torch.ones_like(adj), diagonal=1) + + return adj + + def _get_edge_index_and_weights(self, adj: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + edge_index = torch.nonzero(adj).t().long() + edge_weights = adj[edge_index[0], edge_index[1]] + return edge_index, edge_weights + + def _sanitize(self): + keep = [] + for k, track in enumerate(self.tracks): + if track.state is TrackState.CREATED: + track.activate() + if track.label is None: + track.set_label(self.free_id) + self.free_id += 1 + if torch.all(~track.track_where): + if torch.all(track.inactive_since[track.inactive_since > 0] > self.cfg.patience): + track.deactivate() + if track.state is TrackState.LOST: + if track.lost_since > self.cfg.memory: + track.kill() + else: + track.lost_since += 1 + if track.state is not TrackState.KILLED: + keep.append(track) + for cam in range(self.n_cams): + if track.inactive_since[cam] > self.cfg.patience: + track.reset([cam]) + killed = len(self.tracks) - len(keep) + self.tracks = keep + self.stats["# Tracks"] = len(self.tracks) + self.stats["# Lost"] = len([track for track in self.tracks if track.state == TrackState.LOST]) + self.stats["# Killed"] += killed + + latency = statistics.mean(self.latency) if len(self.latency) > 0 else 0 + self.stats["FPS"] = int(1 / latency) if latency > 0 else 0 + + def _get_active_tracks(self): + return [track for track in self.tracks if track.state != TrackState.KILLED] + + def get_result(self, normalization=None, scale=1.0): + """ + Return the current online state of the tracker. + """ + to_stack = [track.to_tensor() for track in self.tracks if track.state == TrackState.ACTIVE] + if len(to_stack) > 0: + result = torch.cat(to_stack) + else: + result = torch.empty(0) + if result.size(0) > 0: + if normalization is not None: + min_x, min_y, max_x, max_y = normalization + result[:, 7:9] = result[:, 7:9] * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor( + [min_x, min_y] + ) + result[:, 7:9] *= scale + return result + + def _get_index_by_id(self, tid): + for i, track in enumerate(self.tracks): + if track.label == tid: + return i + return None + + +def create_tracker(cfg, solver_cfg, feature_extractor, n_cams, device, writer=None): + return Tracker( + solver_opts=solver_cfg, + cfg=cfg, + feature_extractor=feature_extractor, + n_cams=n_cams, + device=device, + ) diff --git a/src/utils/evaluate.py b/src/utils/evaluate.py new file mode 100644 index 0000000..aef63ed --- /dev/null +++ b/src/utils/evaluate.py @@ -0,0 +1,260 @@ +import configparser +import os +import pathlib +from typing import Dict, List, Optional, Union + +import motmetrics as mm +import numpy as np +import pandas as pd +import torch +from sklearn import metrics + + +GT_COLUMNS = [ + "frame", + "id", + "bb_left", + "bb_top", + "bb_width", + "bb_height", + "conf", + "x", + "y", + "z", +] + + +def get_hota_setup(): + metrics = ["deta_alpha", "assa_alpha", "hota_alpha"] + namemap = mm.io.motchallenge_metric_names + namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) + return metrics, namemap + + +def evaluate_tracker(tracker_results, dataloader, hota_mode=False, bev_mode=False): + gt_dfs = [pd.DataFrame(gt, columns=GT_COLUMNS) for gt in dataloader.dataset._ground_truths] + ht_dfs = results_to_dfs(tracker_results) + + n_frames = [int(df["frame"].max()) for df in gt_dfs] + + gt_dfs = [mot_to_mm(df) for df in gt_dfs] + ht_dfs = [mot_to_mm(df) for df in ht_dfs] + + gt_df = combine_dataframes(gt_dfs, n_frames) + ht_df = combine_dataframes(ht_dfs, n_frames) + + 
# put column "x" to "X" + if bev_mode: + ht_df["X"] = ht_df["x"] + ht_df["Y"] = ht_df["y"] + gt_df["X"] = gt_df["x"] + gt_df["Y"] = gt_df["y"] + + return evaluate_single_scene(ht_df, gt_df, hota_mode=hota_mode, bev_mode=bev_mode) + + +def results_to_dfs(tracker_results: torch.Tensor) -> List[pd.DataFrame]: + """Converts a tensor of results to a list of dataframes. Input tensor has format + + CAM_ID, OBJ_ID, FRAME_ID, X, Y, W, H, X_WORLD, Y_WORLD + + and resulting (n_cams) dataframes have columns + + frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z + + Args: + tracker_results (torch.Tensor): Results tensor. + Returns: + List[pd.DataFrame]: List of dataframes. + """ + results = tracker_results.clone() + results[:, [1, 2]] = results[:, [2, 1]] + results = torch.cat((results[:, :7], torch.ones(results.shape[0], 1), results[:, 7:]), dim=1) + results = torch.cat((results, -torch.ones(results.shape[0], 1)), dim=1) + cam_res = [results[results[:, 0] == c][:, 1:] for c in torch.unique(results[:, 0]).cpu().numpy()] + return [pd.DataFrame(res, columns=GT_COLUMNS) for res in cam_res] + + +def evaluate_multi_scene(prediction_dfs, ground_truth_dfs, names=None, hota_mode=False, bev_mode=False): + """Takes prediction and ground truth dataframes and runs motmetrics evaluation + on a multiple scenes. For evaluation of multi-camera scenes, first combine a + list of single-camera predictions and ground truths using `combine_dataframes` + Args: + prediction_dfs (_type_): _description_ + ground_truth_dfs (_type_): _description_ + names (_type_, optional): _description_. Defaults to None. + Returns: + _type_: _description_ + """ + if names is None: + names = ["Untitled %s" % (i + 1) for i in range(len(prediction_dfs))] + ground_truths = dict(zip(names, ground_truth_dfs)) + predictions = dict(zip(names, prediction_dfs)) + accs = [] + names = [] + + if bev_mode: + distfields = ["X", "Y"] + dist = "seuc" + distth = 1.0 + else: + distfields = ["X", "Y", "Width", "Height"] + dist = "iou" + distth = 0.5 + + for name, prediction in predictions.items(): + if hota_mode: + raise NotImplementedError + else: + accs.append( + mm.utils.compare_to_groundtruth( + ground_truths[name], prediction, dist=dist, distfields=distfields, distth=distth + ) + ) + metrics = mm.metrics.motchallenge_metrics + namemap = mm.io.motchallenge_metric_names + names.append(name) + + mh = mm.metrics.create() + + summary = mh.compute_many( + accs, + names=names, + metrics=metrics, + generate_overall=True, + ) + namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) + print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap)) + strsummary = mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap) + return summary, strsummary + + +def evaluate_single_scene(prediction_df, ground_truth_df, hota_mode=False, bev_mode=False, name=None) -> pd.DataFrame: + """Takes a prediction and ground truth dataframe and runs motmetrics evaluation + on a single scene. For evaluation of multi-camera scenes, first combine a list + of single-camera predictions and ground truths using `combine_dataframes`. + Args: + prediction_df (_type_): Multi-camera predictions. + ground_truth_df (_type_): Multi-camera ground truth. + name (str): Scene name. Defaults to None. 
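+
+    Example (illustrative; both inputs are motmetrics-style dataframes as produced
+    by `mot_to_mm` and `combine_dataframes`):
+
+        >>> summary, strsummary = evaluate_single_scene(ht_df, gt_df, name="S02")
+        >>> print(strsummary)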
+ """ + return evaluate_multi_scene([prediction_df], [ground_truth_df], [name], hota_mode, bev_mode) + + +def mot_to_mm(df: pd.DataFrame) -> pd.DataFrame: + """Takes a MOT-style dataframe (with named columns [frame, id, ...]) + and converts it to a dataframe with column names required by motmetrics. + Args: + df (pd.DataFrame): Input MOT-style dataframe. + Returns: + pd.DataFrame: Output dataframe ready to use in motmetrics evaluation. + """ + _df = df.rename( + columns={ + "frame": "FrameId", + "id": "Id", + "bb_left": "X", + "bb_top": "Y", + "bb_width": "Width", + "bb_height": "Height", + "conf": "Confidence", + } + ) + columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] + columns_to_float = ["Confidence"] + _df[columns_to_int] = _df[columns_to_int].astype(int) + _df[columns_to_float] = _df[columns_to_float].astype(float) + return _df + + +def read_txt(path: Union[str, pathlib.Path]) -> pd.DataFrame: + _df = pd.read_csv(path, names=GT_COLUMNS) + _df = _df.rename( + columns={ + "frame": "FrameId", + "id": "Id", + "bb_left": "X", + "bb_top": "Y", + "bb_width": "Width", + "bb_height": "Height", + "conf": "Confidence", + } + ) + columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] + columns_to_float = ["Confidence"] + _df[columns_to_int] = _df[columns_to_int].astype(int) + _df[columns_to_float] = _df[columns_to_float].astype(float) + return _df + + +def read_seqinfo(path: Union[str, pathlib.Path]) -> Dict: + parser = configparser.ConfigParser() + parser.read(path) + return dict(parser["Sequence"]) + + +def combine_dataframes(dataframes: List[pd.DataFrame], n_frames: Optional[List[int]] = None) -> pd.DataFrame: + """Takes a list of single-camera dataframes and combines them for + multi-camera evaluation. + Args: + dataframes (List[pd.DataFrame]): List of single-camera dataframes. + n_frames (Optional[List[int]], optional): Defaults to None. + Returns: + pd.DataFrame: Multi-camera dataframe. 
+ """ + if n_frames is None: + n_frames = [int(df["FrameId"].max()) for df in dataframes] + count_frames = 0 + dfs = [] + for j, df in enumerate(dataframes): + df["FrameId"] += count_frames + count_frames += int(n_frames[j]) + dfs.append(df) + return pd.concat(dfs).set_index(["FrameId", "Id"]) + + +def evaluate_mtmc( + data_paths: List[Union[str, pathlib.Path]], + prediction_path: Union[str, pathlib.Path], + scene_name: str, + hota_mode=False, + bev_mode=False, +): + seqinfos = [read_seqinfo(os.path.join(path, "seqinfo.ini")) for path in data_paths] + ground_truths = [read_txt(os.path.join(path, "gt", "gt.txt")) for path in data_paths] + prediction_paths = [os.path.join(prediction_path, seqinfo["name"] + ".txt") for seqinfo in seqinfos] + predictions = [read_txt(path) for path in prediction_paths] + ground_truth_df = combine_dataframes(ground_truths, [seqinfo["seqlength"] for seqinfo in seqinfos]) + prediction_df = combine_dataframes(predictions, [seqinfo["seqlength"] for seqinfo in seqinfos]) + + ground_truths = {scene_name: ground_truth_df} + predictions = {scene_name: prediction_df} + + +def evaluate_synthehicle_json(prediction, ground_truth): + preds_to_eval = [] + truths_to_eval = [] + names = [] + for scene in ground_truth.keys(): + if scene in prediction.keys(): + gcams = ground_truth[scene] + pcams = prediction[scene] + preds_to_combine = [] + truths_to_combine = [] + for cam in gcams.keys(): + if cam not in pcams.keys(): + prediction[scene][cam] = [[1, 1, 0, 0, 0, 0, 1, -1, -1, -1]] + preds_to_combine.append(mot_to_mm(pd.DataFrame(prediction[scene][cam], columns=GT_COLUMNS))) + truths_to_combine.append(mot_to_mm(pd.DataFrame(ground_truth[scene][cam], columns=GT_COLUMNS))) + names.append(scene) + preds_to_eval.append(combine_dataframes(preds_to_combine, n_frames=[1800] * len(preds_to_combine))) + truths_to_eval.append(combine_dataframes(truths_to_combine, n_frames=[1800] * len(truths_to_combine))) + return evaluate_multi_scene(preds_to_eval, truths_to_eval, names) + + +def clustering_performance(y_true, y_pred): + y_t, y_p = y_true.cpu().numpy(), y_pred.cpu().numpy() + return { + "ARI": metrics.adjusted_rand_score(y_t, y_p), + "AMI": metrics.adjusted_mutual_info_score(y_t, y_p), + } diff --git a/src/utils/iotools.py b/src/utils/iotools.py new file mode 100644 index 0000000..5f472b8 --- /dev/null +++ b/src/utils/iotools.py @@ -0,0 +1,113 @@ +import os + +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter + +from .utils import expand_boxes, remove_border_boxes, size_filter + + +class ResultsWriter: + def __init__(self, output_path, cfg, normalization=None, camera_names=None): + self._results = [] + + self.cfg = cfg + self.output_path = output_path + self._norm_factors = normalization + + self.rows = cfg.visuals.grid_rows + self.plot_results = cfg.visuals.plot_results + self.plot_every = cfg.visuals.plot_interval + + self.camera_names = camera_names + + self.writer = None + + if cfg.logging.tensorboard.enable: + self.writer = SummaryWriter() + + self.store_files = cfg.visuals.store_files + self.results_file = os.path.join(output_path, "results.txt") + + self.offsets = cfg.dataset.offsets if hasattr(cfg.dataset, "offsets") else [0] * len(camera_names) + + self.on_bev = True if cfg.dataset.name == "WildTrack" else False + + self._save_function = self.get_save_function(cfg) + + if os.path.exists(self.results_file): + os.remove(self.results_file) + + os.makedirs(output_path, exist_ok=True) + + @property + def results(self): + results = 
torch.cat(self._results, dim=0) + for i, offset in enumerate(self.offsets): + results[results[:, 0] == i, 2] -= offset + # multiply camera column by (-1) + results[:, 0] *= -1 + for i, name in enumerate(self.camera_names): + # this is a bit hacky if camera does not start with letter + try: + name_int = int(name[1:]) + except ValueError: + # fallback to index of camera + name_int = i + results[results[:, 0] == -i, 0] = name_int + if self.cfg.postprocess.expand_boxes.enable: + factor = self.cfg.postprocess.expand_boxes.factor + results[:, 3:7] = expand_boxes(results[:, 3:7], factor) + if self.cfg.postprocess.remove_borders.enable: + boxes = results[:, 3:7] + border = self.cfg.postprocess.remove_borders.border_size + keep = remove_border_boxes(boxes, border) + results = results[keep] + if self.cfg.postprocess.size_filter.enable: + boxes = results[:, 3:7] + keep = size_filter( + boxes, self.cfg.postprocess.size_filter.min_size, self.cfg.postprocess.size_filter.max_size + ) + results = results[keep] + return results + + def add(self, result): + _result = result.clone() + if self._norm_factors is not None: + _result = self.denormalize_bev(_result[:, 7:9]) + self._results.append(result) + + def save(self): + if self._results: + self._save_function(self.results.cpu().numpy()) + + def _to_aicity19(self, result): + # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 + np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") + + def _to_aicity24(self, result): + # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 + np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") + + def _to_synthehicle(self, result): + # CAMERA, FRAME, ID, X, Y, W, H, SCORE, X_BEV, Y_BEV + np.savetxt(self.results_file, result[:, [2, 1]], fmt="%d", delimiter=",") + + def get_save_function(self, cfg): + if "WildTrack" in cfg.dataset.name: + return self._to_wildtrack + elif "AICITY24" in cfg.dataset.name: + return self._to_aicity19 + elif "AICITY" in cfg.dataset.name or "CityFlow" in cfg.dataset.name: + return self._to_aicity24 + else: + return self._to_synthehicle + + def denormalize_bev(self, positions): + min_x, min_y, max_x, max_y = self._norm_factors + return positions * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor([min_x, min_y]) + + def squeeze_batch(self, x: torch.Tensor): + if x.dim() == 4 and x.size(0) == 1: + return x.squeeze(0) + return x diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..c363063 --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,208 @@ +import math +import random +from typing import List, Optional, Tuple + +import matplotlib.pyplot as plt +import torch +from torch.utils.tensorboard import SummaryWriter +from torchvision import transforms +from torchvision.io import write_jpeg +from torchvision.utils import draw_bounding_boxes, make_grid + + +def resize_transform(img, size=(256, 128)): + """ + Resize a torch image to the specified size. + Used before passing the image to reid model. + """ + transform = transforms.Compose( + [ + transforms.ToPILImage(), + transforms.Resize((size[0], size[1])), + transforms.ToTensor(), + ] + ) + return transform(img) + + +def compute_centers(boxes, bottom=True, box_projection_centers=None): + """ + Compute the 2D centers of a torch tensor of bounding boxes. 
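+
+    Boxes are expected in (top-left x, top-left y, width, height) format. With
+    `bottom=True` the bottom-center of each box is returned; when
+    `box_projection_centers=(alpha_w, alpha_h)` is given, the vertical coordinate
+    is placed at y + alpha_h * h instead. Passing `bottom=True` together with
+    `box_projection_centers` raises a ValueError.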
+ """ + if bottom is True and box_projection_centers is not None: + raise ValueError("Cannot project boxes to bottom and use box_projection_centers simultaneously.") + centers = torch.zeros((boxes.shape[0], 2)) + centers[:, 0] = boxes[:, 0] + boxes[:, 2] / 2 + if box_projection_centers is not None: + alpha_w, alpha_h = box_projection_centers + centers[:, 1] = boxes[:, 1] + alpha_h * boxes[:, 3] + elif bottom: + centers[:, 1] = boxes[:, 1] + boxes[:, 3] + else: + centers[:, 1] = boxes[:, 1] + boxes[:, 3] / 2 + return centers + + +def tlwh_to_xyah(tlwh): + """ + Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = tlwh.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, :2] += ret[:, 2:] / 2 + ret[:, 2] /= ret[:, 3] + return ret + + +def xyah_to_tlwh(xyah): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + ret = xyah.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, 2] *= ret[:, 3] + ret[:, :2] -= ret[:, 2:] / 2 + return ret + + +def tlwh_to_tlbr(tlwh): + """Convert bounding box to format `(top left x, top left y, bottom right + x, bottom right y)`. + """ + ret = tlwh.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, 2:] += ret[:, :2] + return ret + + +def expand_boxes(in_boxes, factor): + boxes = in_boxes.clone() + cx, cy = boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2 + w, h = boxes[:, 2] * factor, boxes[:, 3] * factor + boxes[:, 0] = cx - w / 2 + boxes[:, 1] = cy - h / 2 + boxes[:, 2] = w + boxes[:, 3] = h + return boxes + + +def remove_border_boxes(boxes, border): + xy1x2y2 = tlwh_to_tlbr(boxes) + keep = ( + (xy1x2y2[:, 0] > border) + & (xy1x2y2[:, 1] > border) + & (xy1x2y2[:, 2] < (1920 - border)) + & (xy1x2y2[:, 3] < (1080 - border)) + ) + return keep + + +def size_filter(boxes, size_min, size_max): + sizes = boxes[:, 2] * boxes[:, 3] + keep = (sizes >= size_min) & (sizes <= size_max) + return keep + + +def mpl_cmap_to_rgb(cmap_name: str, seed: int = 0) -> List[Tuple[int, int, int]]: + """Returns a list of RGB values from a matplotlib colormap.""" + cmap = plt.get_cmap(cmap_name) + colors = [] + for i in range(cmap.N): + rgb = cmap(i)[:3] + colors.append(tuple(int(255 * c) for c in rgb)) + random.seed(seed) + random.shuffle(colors) + return colors + + +def render_image_grid(images: List[torch.Tensor], *args, **kwargs) -> torch.Tensor: + """Renders a grid of images. + + Args: + images (List[torch.Tensor]): List of N images of shape (C, H, W). + *args: Additional arguments to pass to the make_grid function. + **kwargs: Additional keyword arguments to pass to the make_grid function. + + Returns: + torch.Tensor: Image grid of shape (C, H, W). + """ + images = torch.stack(images) + nrow = math.ceil(math.sqrt(len(images))) + return make_grid(images, nrow=nrow, *args, **kwargs) + + +def render_images_with_boxes( + image: torch.Tensor, + boxes: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + confs: Optional[torch.Tensor] = None, + colors: Optional[List[Tuple[int, int, int]]] = None, + *args, + **kwargs, +) -> List[torch.Tensor]: + """Render image with bounding boxes. Colors correspond to the label index. Boxes are + expected to be in MOT-format, i.e., (bb_left, bb_top, bb_widht, bb_height). + + Args: + images (torch.Tensor): Image of shape (C, H, W). + boxes (torch.Tensor): Boxes of shape (K, 4). + labels (torch.Tensor): Label of shape (K,). 
+ colors (Optional[List[Tuple[int, int, int]]]): List of RGB colors. Defaults to None. + *args: Additional arguments to pass to the draw_bounding_boxes function. + **kwargs: Additional keyword arguments to pass to the draw_bounding_boxes function. + + Returns: + torch.Tensor: Image with bounding boxes. + """ + if boxes is None: + return image + + if colors is None: + colors = mpl_cmap_to_rgb("rainbow") + + if labels is None: + labels = torch.zeros(boxes.size(0)) + + color_palette = [colors[label % len(colors)] for label in labels] + + _labels = [str(label.item()) for i, label in enumerate(labels)] + + if confs is not None: + _labels = [f"{label} ({conf.item():.2f})" for label, conf in zip(_labels, confs)] + + img = image.clone() + bxs = boxes.clone() + bxs[:, 2:] += bxs[:, :2] + + img = draw_bounding_boxes( + img, + bxs, + labels=_labels, + colors=color_palette, + *args, + **kwargs, + ) + return img + + +def normalize_features(x): + # shape of x: (C, N, F) + # normalize features per channelg + mean = x.mean(dim=2, keepdim=True) + std = x.std(dim=2, keepdim=True) + 1e-8 + return (x - mean) / std + + +def nanmax(x, dim=None): + """Function like torch.nanmean for max.""" + mask = torch.isnan(x) + x_masked = torch.where(mask, torch.tensor(float("-inf")).to(x.device), x) + max_vals, _ = torch.max(x_masked, dim=dim) + + # Restore NaN values if max is -inf (because all were NaN along dimension) + max_vals = torch.where(max_vals == float("-inf"), torch.tensor(float("nan")).to(x.device), max_vals) + return max_vals diff --git a/tools/track.py b/tools/track.py new file mode 100644 index 0000000..6bc5ad0 --- /dev/null +++ b/tools/track.py @@ -0,0 +1,90 @@ +import json +import os +from subprocess import PIPE, run + +import hydra +import torch +from loguru import logger +from omegaconf import DictConfig, OmegaConf +from qqdm import format_str, qqdm + +import wandb +from src.datasets.dataset import create_dataloader +from src.tracker.encoder import create_encoder +from src.tracker.solver import create_solver +from src.tracker.tracker import create_tracker +from src.utils.evaluate import evaluate_tracker +from src.utils.iotools import ResultsWriter + + +@hydra.main(version_base=None, config_path="../conf", config_name="config") +def main(cfg: DictConfig) -> None: + if cfg.device == "cpu" or not torch.cuda.is_available(): + raise ValueError("This code runs on CUDA only. 
Please set device to 'cuda'.") + else: + device = torch.device(cfg.device) + logger.info(f"🚀 Using device: {device}") + + cfg.tracker.matching.distance_weight = 1 - cfg.tracker.matching.rescale_weight + + # create output directories + output_path = os.path.join(cfg.output_path) + os.makedirs(output_path, exist_ok=True) + output_path = os.path.join(output_path, cfg.dataset.name) + logger.info(f"📂 Writing to output path: {output_path}") + + # Initialize wandb and tensorboard + if cfg.logging.wandb.enable: + wandb.init(project=cfg.logging.wandb.project) + wandb.config.update(OmegaConf.to_container(cfg)) + if cfg.logging.wandb.tags is not None: + wandb.run.tags = cfg.logging.wandb.tags + + # Initialize solver + solver_opts = create_solver(cfg.solver.backend) + logger.info(f"✨ Initialized solver, using backend: {cfg.solver.backend}") + + # Initialize dataset and dataloader + dataloader = create_dataloader(cfg) + logger.info("✨ Created dataloader.") + + # Initialize encoder + encoder = create_encoder(cfg.encoder, device) + logger.info("✨ Created encoder.") + + tracker = create_tracker(cfg, solver_opts, encoder, len(dataloader.dataset.camera_names), device) + logger.info("✨ Initialized tracker.") + + results_writer = ResultsWriter( + output_path=output_path, + cfg=cfg, + normalization=dataloader.dataset._norm_factors, + camera_names=dataloader.dataset.camera_names, + ) + + tw = qqdm(range(len(dataloader)), desc=format_str("bold", "Description")) + for i, batch in enumerate(dataloader): + results, _ = tracker.step(batch) + results_writer.add(results) + stats = tracker.stats + tw.set_infos(stats) + tw.update() + + if cfg.logging.wandb.enable: + _stats_str_to_float = {k: float(v) for k, v in stats.items()} + wandb.log(_stats_str_to_float, step=i) + + logger.info(f"🕒 Cumulative execution time of tracker {tracker.cumulative_execution_time * 10}") + logger.info(f"🕒 Average time per frame {tracker.cumulative_execution_time / tracker.frame}") + + results_writer.save() + + logger.info("🚀 Tracking completed.") + logger.info( + f"📈 Results saved to {results_writer.results_file}. " + "Use the official evaluation script of the dataset for evaluation." + ) + + +if __name__ == "__main__": + main()
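+
+
+# Example invocation (a sketch only; any key in the Hydra config can be overridden
+# on the command line, and the override values below are illustrative assumptions,
+# not shipped defaults):
+#
+#   python tools/track.py device=cuda solver.backend=PD \
+#       tracker.matching.rescale_weight=0.5 logging.wandb.enable=false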