From 9a905437a67395c1a3e53dad1350f943d60baf74 Mon Sep 17 00:00:00 2001 From: Fabian Herzog Date: Tue, 17 Sep 2024 22:24:43 +0200 Subject: [PATCH] Public code release. --- .gitignore | 171 +++++++++++++ conf/config.yaml | 83 ++++++ conf/dataset/CityFlow.yaml | 8 + conf/encoder/precomputed.yaml | 1 + conf/experiment/CityFlow.yaml | 44 ++++ setup.py | 35 +++ src/__init__.py | 0 src/datasets/dataset.py | 389 ++++++++++++++++++++++++++++ src/tracker/encoder.py | 23 ++ src/tracker/geometry.py | 81 ++++++ src/tracker/similarities.py | 84 ++++++ src/tracker/solver.py | 65 +++++ src/tracker/supertrack.py | 254 ++++++++++++++++++ src/tracker/tracker.py | 467 ++++++++++++++++++++++++++++++++++ src/utils/evaluate.py | 260 +++++++++++++++++++ src/utils/iotools.py | 113 ++++++++ src/utils/utils.py | 208 +++++++++++++++ tools/track.py | 90 +++++++ 18 files changed, 2376 insertions(+) create mode 100644 .gitignore create mode 100644 conf/config.yaml create mode 100644 conf/dataset/CityFlow.yaml create mode 100644 conf/encoder/precomputed.yaml create mode 100644 conf/experiment/CityFlow.yaml create mode 100644 setup.py create mode 100644 src/__init__.py create mode 100644 src/datasets/dataset.py create mode 100644 src/tracker/encoder.py create mode 100644 src/tracker/geometry.py create mode 100644 src/tracker/similarities.py create mode 100644 src/tracker/solver.py create mode 100644 src/tracker/supertrack.py create mode 100644 src/tracker/tracker.py create mode 100644 src/utils/evaluate.py create mode 100644 src/utils/iotools.py create mode 100644 src/utils/utils.py create mode 100644 tools/track.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e747d2f --- /dev/null +++ b/.gitignore @@ -0,0 +1,171 @@ +# Project-specific +data/ +eval/ +resources/ +outputs/ +wandb/ + +.ruff_cache + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/conf/config.yaml b/conf/config.yaml new file mode 100644 index 0000000..78eb298 --- /dev/null +++ b/conf/config.yaml @@ -0,0 +1,83 @@ +# config.yaml +hydra/hydra_logging: null + +defaults: + - dataset: CityFlow + - encoder: precomputed + +dataset_path: ./data/AICITY/ +output_path: ./outputs/ + +device: cuda + +logging: + wandb: + enable: false + project: ggmc + upload_results: false + tags: null + tensorboard: + enable: false + +resources: + path: ./resources/ + detector: YOLOX + reid: null + +visuals: + plot_interval: 1 + plot_results: false + plot_ground_truth: false + plot_to_tensorboard: false + grid_rows: 2 + store_files: true + border_size: 3 + +solver: + backend: PD + +tracker: + matching: + distance_threshold: 0.02 + rescale_threshold: 0.65 + reid_decay: 1.0 + rescale_weight: 0.5 + distance_weight: 0.5 + confidence_thresh: 0.7 + low_confidence_thresh: null + patience: 1 + memory: 15 + fdim: 512 + enable_accumulator: true + prematching: + enabled: true + iou_bias: 0.60 + iou_threshold: 0.50 + prune_remaining: false + +preprocess: + nms_thresh: null + roi_filter: true + bottom: true + box_projection_centers: + alpha_w: null + alpha_h: null + +postprocess: + expand_boxes: + enable: true + factor: 1.4 + remove_borders: + enable: true + border_size: 5 + size_filter: + enable: true + min_size: 6220 + max_size: 622080 + +evaluation: + inplace: true + evaluate_standard: true + evaluate_hota: false + evaluate_bev: false + evaluate_external: true diff --git a/conf/dataset/CityFlow.yaml b/conf/dataset/CityFlow.yaml new file mode 100644 index 0000000..fff199a --- /dev/null +++ b/conf/dataset/CityFlow.yaml @@ -0,0 +1,8 @@ +name: AICITY +scene_path: ./validation/S02 +camera_pattern: c00* +img_path: ./img1/ +img_ext: jpg +offsets: [0, 0, 3, 8] +calibration_path: calibration.json +roi_path: 
"./data/AICITY/eval/ROIs/validation" diff --git a/conf/encoder/precomputed.yaml b/conf/encoder/precomputed.yaml new file mode 100644 index 0000000..868e1e1 --- /dev/null +++ b/conf/encoder/precomputed.yaml @@ -0,0 +1 @@ +name: precomputed diff --git a/conf/experiment/CityFlow.yaml b/conf/experiment/CityFlow.yaml new file mode 100644 index 0000000..7cf2af1 --- /dev/null +++ b/conf/experiment/CityFlow.yaml @@ -0,0 +1,44 @@ +# @package _global_ + +defaults: + - override /dataset: CityFlow + - override /encoder: precomputed + +dataset_path: ./data/AICITY/ + +resources: + reid: LCFractal + detector: YOLOX + +tracker: + matching: + distance_threshold: 0.001 + rescale_threshold: 0.7 + reid_decay: 0.7 + rescale_weight: 0.9 + confidence_thresh: 0.70 + low_confidence_thresh: null + patience: 0 + memory: 160 + fdim: 2048 + prematching: + enabled: false + iou_bias: 0.50 + iou_threshold: 0.70 + prune_remaining: false + +preprocess: + nms_thresh: 0.7 + roi_filter: true + +postprocess: + expand_boxes: + enable: true + factor: 1.4 + remove_borders: + enable: true + border_size: 0 + size_filter: + enable: true + min_size: 6000 + max_size: 600000 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..dce079e --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +from setuptools import find_packages, setup + + +setup( + name="stmc", + version="0.1.0", + packages=find_packages(), + install_requires=[ + "hydra-core", + "torch", + "wandb", + "loguru", + "omegaconf", + "qqdm", + "pillow", + "ramapy", + ], + entry_points={ + "console_scripts": [ + "track=tools.track:main", + ], + }, + author="Fabian Herzog", + author_email="fabian.herzog@tum.de", + description="Spatial-Temporal Multi-Cuts for Online Multiple-Camera Vehicle Tracking", + long_description=open("README.md").read(), + long_description_content_type="text/markdown", + url="https://github.com/fubel/stmc", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.8", +) diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/datasets/dataset.py b/src/datasets/dataset.py new file mode 100644 index 0000000..863fe95 --- /dev/null +++ b/src/datasets/dataset.py @@ -0,0 +1,389 @@ +import glob +import os +import pathlib +import warnings +from enum import IntEnum +from typing import List, Optional + +import numpy as np +import torch +from loguru import logger +from torch.utils.data import DataLoader +from torchvision.io import ImageReadMode, read_image +from torchvision.ops import nms + +from ..tracker.geometry import Projector +from ..utils.utils import compute_centers, resize_transform, tlwh_to_tlbr + + +class Annotation(IntEnum): + CAM_ID = 0 + OBJ_ID = 1 + FRAME_ID = 2 + XMIN = 3 + YMIN = 4 + WIDTH = 5 + HEIGHT = 6 + CONF = 7 + XWORLD = 8 + YWORLD = 9 + + +class NMSTransform: + def __init__(self, iou_threshold: float): + """Initialize the NMSTransform which applied non-maximum suppression to the + input annotations based on the specified IoU threshold. + + Args: + iou_threshold (float): The Intersection over Union (IoU) threshold for NMS. + Bounding boxes with IoU greater than this threshold will be suppressed. 
+ """ + self.iou_threshold = iou_threshold + + def __call__(self, annotations: torch.Tensor) -> torch.Tensor: + boxes = tlwh_to_tlbr(annotations[:, Annotation.XMIN : Annotation.HEIGHT + 1]) + scores = annotations[:, Annotation.CONF] + keep = nms(boxes, scores, self.iou_threshold) + return keep + + +class ROIFilter: + def __init__(self, roi_path: str): + """Initialize the ROIFilter. + + Args: + roi_path (str): Path to the ROI image file. + + The ROI (Region of Interest) image is loaded as a binary mask, + where 1 indicates areas of interest and 0 indicates areas to be filtered out. + """ + self.roi = read_image(roi_path, ImageReadMode.GRAY).squeeze(0).bool() + self.size = self.roi.size() + + def __call__(self, annotations: torch.Tensor) -> torch.Tensor: + centers = compute_centers(annotations[:, Annotation.XMIN - 1 : Annotation.HEIGHT]).int() + centers[:, 0] = torch.clamp(centers[:, 0], 0, self.size[1] - 1) + centers[:, 1] = torch.clamp(centers[:, 1], 0, self.size[0] - 1) + keep = self.roi[centers[:, 1], centers[:, 0]] == 1 + return keep + + +class MultiCamDataset: + def __init__( + self, + annotation_paths: List[str], + image_paths: List[str], + calibration_paths: List[str], + camera_names: List[int], + ground_truth_paths: Optional[List[str]] = None, + precomputed: bool = False, + nms_threshold: Optional[float] = 0.9, + time_offsets: Optional[List[int]] = None, + roi_paths: Optional[List[str]] = None, + normalize_bev: bool = False, + bottom: bool = True, + box_projection_centers=None, + ): + """Initialize the MultiCamDataset for data loading. + + Args: + annotation_paths (List[str]): Paths to annotation files for each camera. + image_paths (List[str]): Paths to image directories for each camera. + calibration_paths (List[str]): Paths to calibration files for each camera. + camera_names (List[int]): Names or IDs of the cameras. + ground_truth_paths (Optional[List[str]], optional): Paths to ground truth files. Defaults to None. + precomputed (bool, optional): Whether to use precomputed features. Defaults to False. + nms_threshold (Optional[float], optional): Non-maximum suppression threshold. Defaults to 0.9. + time_offsets (Optional[List[int]], optional): Time offsets for each camera. Defaults to None. + roi_paths (Optional[List[str]], optional): Paths to region of interest mask images. Defaults to None. + normalize_bev (bool, optional): Whether to normalize bird's-eye view coordinates. Defaults to False. + bottom (bool, optional): Whether to use bottom of bounding box for projection. Defaults to True. + box_projection_centers (Optional[Tuple[float, float]], optional): Projection centers for bounding boxes. Defaults to None. 
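+
+        Example (sketch with hypothetical paths; all lists are index-aligned,
+        one entry per camera):
+
+            >>> dataset = MultiCamDataset(
+            ...     annotation_paths=["resources/c001.txt", "resources/c002.txt"],
+            ...     image_paths=["data/S02/c001/img1", "data/S02/c002/img1"],
+            ...     calibration_paths=["data/S02/c001/calibration.json", "data/S02/c002/calibration.json"],
+            ...     camera_names=[1, 2],
+            ...     precomputed=True,
+            ...     time_offsets=[0, 3],
+            ... )
+            >>> sample = dataset[0]  # dict with annotations, 2D/3D positions, images, crops, ground truth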
+ """ + if time_offsets is None: + self.time_offsets = [0] * len(image_paths) + else: + self.time_offsets = time_offsets + + self.annotation_paths = annotation_paths + self.image_paths = image_paths + self.calibration_paths = calibration_paths + self.camera_names = camera_names + self.precomputed = precomputed + self.nms_transform = NMSTransform(nms_threshold) if nms_threshold is not None else None + self.box_projection_centers = box_projection_centers + self.bottom = bottom + + self.normalize_bev = normalize_bev + + if roi_paths is not None: + self.roi_filters = [ROIFilter(roi_path) for roi_path in roi_paths] + else: + self.roi_filters = None + + self._load_calibrations() + self._load_annotations() + + if ground_truth_paths is not None: + self._load_ground_truth(ground_truth_paths) + else: + self._ground_truths = None + self.gts = None + + self.length = max([len(list(pathlib.Path(image_path).glob("*.jpg"))) for image_path in self.image_paths]) + + if self.length == 0: + warnings.warn("No images found. Visualization tools will not be available.") + + self.length = 2110 + + self._filtered_by_nms = 0 + self._filtered_by_size = 0 + self._filtered_by_roi = 0 + + def _load_ground_truth(self, ground_truth_paths): + self._ground_truths = [ + torch.from_numpy(np.loadtxt(ground_truth_path, delimiter=",", dtype=np.float32)) + for ground_truth_path in ground_truth_paths + ] + + for gt in self._ground_truths: + if gt.shape[1] == 9: + # append another column of ones + gt = torch.cat((gt, torch.ones(gt.shape[0], 1)), dim=1) + + _cat_gts = [g.clone() for g in self._ground_truths] + for i, gt in enumerate(_cat_gts): + col = torch.ones((gt.shape[0], 1)) * i + _cat_gts[i] = torch.cat((col, gt), dim=1) + _cat_gts[i][:, 1] += self.time_offsets[i] + + self.gts = torch.cat(_cat_gts, dim=0) + self.gts[:, [1, 2]] = self.gts[:, [2, 1]] + + def _load_calibrations(self): + self._projectors = [Projector(calibration_path) for calibration_path in self.calibration_paths] + + def _load_annotations(self): + anns = [ + torch.from_numpy(np.loadtxt(annotation_path, delimiter=",", dtype=np.float32)) + for annotation_path in self.annotation_paths + ] + + # todo: add to preprocess config + for i, ann in enumerate(anns): + keep = (ann[:, Annotation.WIDTH - 1] * ann[:, Annotation.HEIGHT - 1]) >= 1200 + anns[i] = ann[keep] + + # filter roi images + if self.roi_filters is not None: + keep = self.roi_filters[i](anns[i]) + anns[i] = anns[i][keep] + logger.info(f"🔥 Filtered {keep.size(0) - keep.sum().item()} annotations by ROI.") + + for i, ann in enumerate(anns): + col = torch.ones((ann.shape[0], 1)) * i + anns[i] = torch.cat((col, ann), dim=1) + anns[i][:, 1] += self.time_offsets[i] + + positions_2d = [] + for i, ann in enumerate(anns): + pos2d = compute_centers( + ann[:, Annotation.XMIN : Annotation.HEIGHT + 1], self.bottom, self.box_projection_centers + ) + positions_2d.append(pos2d) + + positions_3d = [] + for i, pos2d in enumerate(positions_2d): + pos3d = self._projectors[i].image_to_world(pos2d) + positions_3d.append(pos3d) + + anns = torch.cat(anns, dim=0) + positions_2d = torch.cat(positions_2d, dim=0) + positions_3d = torch.cat(positions_3d, dim=0) + + if anns.shape[1] == 9: + # loaded from ground truth, append column of 1s as 7th column + anns = torch.cat( + ( + anns[:, :6], + torch.ones(anns.shape[0], 1), + anns[:, 6:], + ), + dim=1, + ) + # swap columns frame and obj_id + anns[:, [1, 2]] = anns[:, [2, 1]] + + self._annotations = anns + self._positions_2d = positions_2d + self._positions_3d = positions_3d + + if 
self.normalize_bev: + self.apply_bev_norm() + else: + self._norm_factors = None + + self._annotations.to("cuda") + self._positions_2d.to("cuda") + self._positions_3d.to("cuda") + + def get_bev_ticks(self): + return [ + float(torch.min(self._positions_3d[:, 0])), + float(torch.max(self._positions_3d[:, 0])), + float(torch.min(self._positions_3d[:, 1])), + float(torch.max(self._positions_3d[:, 1])), + ] + + def get_crops(self, frame_annotations, frame_images): + crops = [] + for ann in frame_annotations: + cam_id = int(ann[Annotation.CAM_ID]) + x, y, w, h = ann[Annotation.XMIN : Annotation.HEIGHT + 1].int() + # clamp to image dimensions + x = torch.clamp(x, 0, frame_images[cam_id].size(1) - 1) + y = torch.clamp(y, 0, frame_images[cam_id].size(2) - 1) + w = torch.clamp(w, 0, frame_images[cam_id].size(1) - x) + h = torch.clamp(h, 0, frame_images[cam_id].size(2) - y) + crops.append(resize_transform(frame_images[cam_id][:, y : y + h, x : x + w])) + if len(crops) == 0: + return torch.empty(0) + return torch.stack(crops) + + def apply_bev_norm(self): + # normalize BEV positions to [0, 1] + logger.info("📏 Normalizing BEV positions to [0, 1].") + min_x, min_y = torch.min(self._positions_3d, dim=0)[0] + max_x, max_y = torch.max(self._positions_3d, dim=0)[0] + self._norm_factors = torch.tensor([min_x, min_y, max_x, max_y]) + self._positions_3d = (self._positions_3d - torch.tensor([min_x, min_y])) / torch.tensor( + [max_x - min_x, max_y - min_y] + ) + + def __len__(self): + return self.length + + def __getitem__(self, idx): + frame = idx + 1 + + annotations = self._annotations[self._annotations[:, Annotation.FRAME_ID] == frame] + positions_2d = self._positions_2d[self._annotations[:, Annotation.FRAME_ID] == frame] + positions_3d = self._positions_3d[self._annotations[:, Annotation.FRAME_ID] == frame] + + if self.gts is not None: + ground_truth = self.gts[self.gts[:, Annotation.FRAME_ID] == frame] + else: + ground_truth = torch.empty(0) + + if self.nms_transform is not None: + keep = self.nms_transform(annotations) + else: + keep = torch.arange(annotations.size(0)) + + annotations = annotations[keep] + positions_2d = positions_2d[keep] + positions_3d = positions_3d[keep] + + frame_images = [] + for img_path, offset in zip(self.image_paths, self.time_offsets): + try: + frame_images.append(read_image(str(pathlib.Path(img_path) / f"{(frame - offset):06d}.jpg"))) + except Exception: + frame_images.append(torch.zeros(3, 1080, 1920).to(torch.uint8)) + + if not self.precomputed: + frame_crops = self.get_crops(annotations, frame_images) + else: + frame_crops = torch.empty(0) + + return { + "annotations": annotations, + "positions_2d": positions_2d, + "positions_3d": positions_3d, + "images": frame_images, + "crops": frame_crops, + "ground_truth": ground_truth, + } + + +def create_dataloader(cfg): + scene_path = os.path.join(cfg.dataset_path, cfg.dataset.scene_path) + cameras = [ + os.path.basename(f) + for f in sorted(glob.glob(os.path.join(scene_path, cfg.dataset.camera_pattern))) + if os.path.isdir(f) + ] + + img_paths = [ + os.path.join(cfg.dataset_path, cfg.dataset.scene_path, camera, cfg.dataset.img_path) for camera in cameras + ] + calibration_paths = [ + os.path.join( + cfg.dataset_path, + cfg.dataset.scene_path, + camera, + cfg.dataset.calibration_path, + ) + for camera in cameras + ] + annotation_paths = [] + for camera in cameras: + if cfg.resources.reid is not None: + scene_path = "-".join(pathlib.Path(cfg.dataset.scene_path).parts) + if scene_path[-1] == "-": + scene_path = scene_path[:-1] + 
resource_name = ( + f"{cfg.dataset.name}_{scene_path}-{camera}_{cfg.resources.detector}_{cfg.resources.reid}.txt" + ) + else: + resource_name = f"{cfg.dataset.name}-{camera}_{cfg.resources.detector}.txt" + annotation_paths.append(os.path.join(cfg.resources.path, resource_name)) + + if cfg.preprocess.nms_thresh is not None: + nms_threshold = cfg.preprocess.nms_thresh + else: + nms_threshold = None + + if cfg.preprocess.roi_filter is not None and "roi_path" in cfg.dataset: + roi_paths = [os.path.join(cfg.dataset.roi_path, camera, "roi.jpg") for camera in cameras] + else: + roi_paths = None + + ground_truth_paths = None + + time_offsets = None + if "offsets" in cfg.dataset: + if cfg.dataset.offsets is not None: + time_offsets = cfg.dataset.offsets + + box_projection_centers = [ + cfg.preprocess.box_projection_centers.alpha_w, + cfg.preprocess.box_projection_centers.alpha_h, + ] + + if box_projection_centers[0] is None: + box_projection_centers = None + elif box_projection_centers[1] is None: + box_projection_centers[1] = 1 - box_projection_centers[0] + + dataset = MultiCamDataset( + annotation_paths=annotation_paths, + image_paths=img_paths, + calibration_paths=calibration_paths, + camera_names=cameras, + ground_truth_paths=ground_truth_paths, + precomputed=cfg.encoder.name == "precomputed", + nms_threshold=nms_threshold, + time_offsets=time_offsets, + roi_paths=roi_paths, + bottom=cfg.preprocess.bottom, + box_projection_centers=box_projection_centers, + ) + dataloader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + num_workers=8, + ) + return dataloader diff --git a/src/tracker/encoder.py b/src/tracker/encoder.py new file mode 100644 index 0000000..619660b --- /dev/null +++ b/src/tracker/encoder.py @@ -0,0 +1,23 @@ +import sys +import warnings + +import torch +import torch.nn.functional as F +import torchvision + + +class Precomputed: + def __init__(self, cfg): + self.cfg = cfg + + def __call__(self, x): + features = x["annotations"][:, 11:] + return F.normalize(features, p=2, dim=1) + + +def create_encoder(cfg, device): + print(cfg) + if cfg.name == "precomputed": + return Precomputed(cfg) + else: + raise ValueError(f"Encoder {cfg.name} not found.") diff --git a/src/tracker/geometry.py b/src/tracker/geometry.py new file mode 100644 index 0000000..8a78e6c --- /dev/null +++ b/src/tracker/geometry.py @@ -0,0 +1,81 @@ +import json +import os + +import torch + + +class Projector: + def __init__(self, calibration_path: str): + """ + Initialize a Projector object. The projector is used to project points between image and world coordinates. + + Args: + calibration_path (str): Path to the calibration file (JSON). + + Raises: + FileNotFoundError: If the calibration file is not found. + ValueError: If the homography is not found in the calibration file. 
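+
+        Example (illustrative; the path and pixel values are made up):
+
+            >>> projector = Projector("data/S02/c001/calibration.json")
+            >>> pixels = torch.tensor([[960.0, 1000.0]])   # (N, 2) image points
+            >>> ground = projector.image_to_world(pixels)  # (N, 2) ground-plane coordinates
+            >>> back = projector.world_to_image(ground)    # approximately the original pixels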
+ """ + if os.path.exists(calibration_path) is False: + raise FileNotFoundError(f"Calibration file not found at path: {calibration_path}") + self.calibration_path = calibration_path + + with open(calibration_path, "r") as f: + calibration = json.load(f) + try: + homography_keys = [ + "homography", + "H", + "homography_matrix", + "homography matrix", + ] + valid_homography_key = set(homography_keys).intersection(set(calibration.keys())).pop() + except KeyError: + raise ValueError("Homography not found in calibration file.") + self._homography = torch.Tensor(calibration[valid_homography_key]) + self._inverse_homography = torch.inverse(self._homography) + + def image_to_world(self, points: torch.Tensor) -> torch.Tensor: + """Projects image points to world coordinates. + + Args: + points (torch.Tensor): Image points Nx2. + + Returns: + torch.Tensor: World points Nx3. + """ + if points.dim() != 2: + points = points.view(-1, 2) + if points.size(1) != 2: + raise ValueError(f"Expected image points to be of shape (N, 2), but got {points.shape}.") + return self._homography_image_to_world(points) + + def world_to_image(self, points: torch.Tensor) -> torch.Tensor: + """Projects world points to image coordinates. + + Args: + points (torch.Tensor): World points Nx3. + + Returns: + torch.Tensor: Image points Nx2. + """ + if points.dim() != 2: + points = points.view(-1, 3) + if points.size(1) != 3: + points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) + return self._homography_world_to_image(points) + + def _homography_image_to_world(self, points: torch.Tensor) -> torch.Tensor: + points = torch.cat([points, torch.ones((points.shape[0], 1))], dim=1) + device = points.device + homography = self._inverse_homography.to(device) + projected_points = torch.matmul(homography, points.t()).t() + projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) + return projected_points + + def _homography_world_to_image(self, points: torch.Tensor) -> torch.Tensor: + device = points.device + homography = self._homography.to(device) + projected_points = torch.matmul(homography, points.t()).t() + projected_points = projected_points[:, :2] / projected_points[:, 2].reshape(-1, 1) + return projected_points diff --git a/src/tracker/similarities.py b/src/tracker/similarities.py new file mode 100644 index 0000000..1f148b8 --- /dev/null +++ b/src/tracker/similarities.py @@ -0,0 +1,84 @@ +import torch +from torchvision.ops import box_iou + + +def cosine_similarity(a, b, eps=1e-8): + """ + Compute pairwise appearance distance between features. + from https://stackoverflow.com/a/58144658 + """ + a_n, b_n = a.norm(dim=1)[:, None], b.norm(dim=1)[:, None] + a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) + b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) + sim_mt = torch.mm(a_norm, b_norm.transpose(0, 1)) + return sim_mt + + +def batch_cosine_similarity(a, b, eps=1e-8): + """Compute batched pairwise appearance distance between features. + + Args: + a (torch.Tensor): (B, N, feature_dim) tensor. + b (torch.Tensor): (B, N, feature_dim) tensor. + eps (float, optional): Epsilon to prevent division by zero. Defaults to 1e-8. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise similarities. + """ + # Compute norms along feature dimension and add new dimensions needed for broadcasting + a_n = a.norm(dim=2)[:, :, None] + b_n = b.norm(dim=2)[:, :, None] + + # Perform normalization and prevent division by zero. 
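+    # Shape note (assumed convention): a is (B, N, D) and b is (B, M, D), so a_n and
+    # b_n broadcast as (B, N, 1) and (B, M, 1), and the bmm below yields a (B, N, M)
+    # similarity matrix (N == M when a track set is compared with itself).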
+ a_norm = a / torch.max(a_n, eps * torch.ones_like(a_n)) + b_norm = b / torch.max(b_n, eps * torch.ones_like(b_n)) + + # Compute similarity matrix using batch matrix multiplication. + sim_mt = torch.bmm(a_norm, b_norm.transpose(1, 2)) + return sim_mt + + +def batched_box_iou(boxes): + """Compute batched pairwise IoU between boxes. + + Args: + boxes (torch.Tensor): (B, N, 4) tensor of boxes. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise IoU. + """ + ious = [] + for sub_boxes in boxes: + ious.append(box_iou(sub_boxes, sub_boxes)) + return torch.stack(ious) + + +def bev_distance(bev_positions): + """Compute distance between positions on ground plane. + + Args: + bev_positions (torch.Tensor): (N, 2) tensor of positions. + + Returns: + torch.Tensor: (N, N) tensor of pairwise similarities. + """ + return torch.norm(bev_positions[:, None] - bev_positions[None, :], dim=2) + + +def batch_bev_distance(bev_positions): + """Compute batched distance similarity between positions on ground plane. + + Args: + bev_positions (torch.Tensor): (B, N, 2) tensor of positions. + + Returns: + torch.Tensor: (B, N, N) tensor of pairwise similarities. + """ + # Subtract positions across the batch, adding extra dimensions for broadcasting + diff = bev_positions[:, :, None] - bev_positions[:, None, :] + + # Compute norm along the last dimension (x and y coordinates) + norm = torch.norm(diff, dim=-1) + + # Return similarity + return norm diff --git a/src/tracker/solver.py b/src/tracker/solver.py new file mode 100644 index 0000000..352af93 --- /dev/null +++ b/src/tracker/solver.py @@ -0,0 +1,65 @@ +import rama_py +import torch + + +def multicut(edge_index, edge_weights, opts): + """Solves a multicut problem based on the RAMA algorithm. + + The edge_index is expected in the usual torch_geometric format. + Note that RAMA requires u < v for each edge (u, v) in the graph. + + Args: + edge_index (LongTensor): 2xE LongTensor of edge indices. + edge_weights (LongTensor): E LongTensor of edge weights. + + Returns: + LongTensor: N LongTensor of node labels, where N is the number + of nodes in the graph. + """ + if (edge_index[0] > edge_index[1]).any(): + raise ValueError("Solver expects u < v for each edge (u, v) in the graph.") + if edge_index.device.index is None: + raise ValueError("Solver runs on CUDA device only. Please move data to CUDA.") + if edge_index.shape[1] == 0: + return torch.empty(0).to("cuda") + i = edge_index[0].to(torch.int32) + j = edge_index[1].to(torch.int32) + costs = edge_weights.to(torch.float32) + num_nodes = torch.max(edge_index) + 1 + num_edges = edge_index.shape[1] + node_labels = torch.ones(num_nodes, device=i.device).to(torch.int32) + rama_py.rama_cuda_gpu_pointers( + i.data_ptr(), + j.data_ptr(), + costs.data_ptr(), + node_labels.data_ptr(), + num_nodes, + num_edges, + i.device.index, + opts, + ) + return node_labels + + +def scale_weights(weights, threshold=0.7): + """Scales the given weights to the range [-1, 1] based on the given threshold. + + Args: + weights (FloatTensor): LongTensor of edge weights. + threshold (float, optional): Threshold for scaling. Defaults to 0.4. + + Returns: + FloatTensor: LongTensor of scaled edge weights. 
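+
+    Example (illustrative numbers): with threshold=0.7, a weight of 0.7 maps to 0.0,
+    1.0 maps to 1.0, and 0.35 maps to -0.5, so sub-threshold similarities become
+    negative (repulsive) costs for the multicut solver.
+
+        >>> scale_weights(torch.tensor([0.35, 0.70, 1.00]), threshold=0.7)
+        tensor([-0.5000,  0.0000,  1.0000])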
+ """ + y = weights.clone() + z = weights.clone() + z[y == threshold] = 0.0 + z[y > threshold] = (y[y > threshold] - threshold) / (1 - threshold) + z[y < threshold] = (y[y < threshold] - threshold) / (threshold) + return z + + +def create_solver(backend): + opts = rama_py.multicut_solver_options(backend) + opts.verbose = False + return opts diff --git a/src/tracker/supertrack.py b/src/tracker/supertrack.py new file mode 100644 index 0000000..dffea0d --- /dev/null +++ b/src/tracker/supertrack.py @@ -0,0 +1,254 @@ +from enum import IntEnum + +import torch + +from ..utils.utils import tlwh_to_tlbr + + +class TrackState(IntEnum): + CREATED = 0 # Track is created but not confirmed yet + ACTIVE = 1 # Track is confirmed and active + LOST = 3 # Track is lost and not tracked, but kept in memory + KILLED = 4 # Track is killed (e.g. due to merging with another track) + + +class SuperTrack: + def __init__( + self, + frame, + features, + boxes, + positions_2d, + positions_3d, + confidence=None, + ): + self.frame = frame + self.last_update = frame + + self.n_cams = features.size(0) + self.features = features + self.boxes = boxes + self.positions_2d = positions_2d + self.positions_3d = positions_3d + + self.label = None + self.__state = TrackState.CREATED # private state variable + + # inactivity counter: how many frames since last update at each camera + self.inactive_since = torch.zeros(self.n_cams, device=features.device) + + self.lost_since = 0 + + # where to continue tracking: if False, track is not continued in this camera + self.track_where = torch.ones(self.n_cams, device=features.device).bool() + self.track_where[torch.isnan(features).any(dim=1)] = False + + # cams the track hasn't been seen in yet + self.queries = torch.ones(self.n_cams, device=features.device).bool() + + # count updates for each camera + self.ticks = torch.ones(self.n_cams, device=features.device) + + self.confidence = confidence + + self.velocities_2d = torch.zeros((self.n_cams, 4), device=features.device) + self.velocities_3d = torch.zeros((self.n_cams, 2), device=features.device) + + @classmethod + def empty(cls, n_cams, fdim, device): + return cls( + frame=None, + features=torch.full((n_cams, fdim), float("nan"), device=device), + boxes=torch.full((n_cams, 4), float("nan"), device=device), + positions_2d=torch.full((n_cams, 2), float("nan"), device=device), + positions_3d=torch.full((n_cams, 3), float("nan"), device=device), + ) + + def activate(self): + self.__state = TrackState.ACTIVE + + def deactivate(self): + self.__state = TrackState.LOST + + def kill(self): + self.__state = TrackState.KILLED + + def reset(self, cams=None): + if cams is None: + cams = range(self.n_cams) + for cam in cams: + self.track_where[cam] = False + # self.inactive_since[cam] = 0 + + def set_label(self, label): + if self.label is not None: + raise ValueError(f"Track {self} is already labeled.") + self.label = label + + @property + def keys(self): + return ~self.queries + + @property + def state(self): + return self.__state + + @property + def tlbr(self): + return tlwh_to_tlbr(self.boxes) + + def is_complete(self): + return ~torch.isnan(self.features).any() + + @property + def p_features(self): + return self.phantomize(self.features) + + @property + def p_positions(self): + return self.phantomize(self.positions_3d) + + @property + def mean_positions_3d(self): + return torch.nanmean(self.positions_3d, dim=0) + + @staticmethod + def phantomize(tensor): + """ + Given a (B, n_cams, f_dim) tensor, replace nans with the average of + the non-nan values 
along the cam axis. + """ + return torch.where(torch.isnan(tensor), torch.nanmean(tensor, dim=0, keepdim=True), tensor) + + def update(self, other): + n_cams = self.features.size(0) + if self.frame == other.frame: + for cam in range(n_cams): + if torch.isnan(self.features[cam]).any(): + if torch.isnan(other.features[cam]).any(): + continue + self.features[cam] = other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] = other.ticks[cam] + else: + if not torch.isnan(other.features[cam]).any(): + raise ValueError(f"Found violation of constraints for track update with {self}.") + elif self.frame < other.frame: + for cam in range(n_cams): + if not torch.isnan(other.features[cam]).any(): + if not torch.isnan(self.features[cam]).any(): + if self.velocities_2d[cam].sum() == 0: + w = 1.0 + else: + w = 0.8 + self.velocities_2d[cam] = ( + w * (other.boxes[cam] - self.boxes[cam]) / (other.frame - self.frame) + + (1 - w) * self.velocities_2d[cam] + ) + self.velocities_3d[cam] = ( + w * (other.positions_3d[cam] - self.positions_3d[cam]) / (other.frame - self.frame) + + (1 - w) * self.velocities_3d[cam] + ) + self.features[cam] = 0.9 * self.features[cam] + 0.1 * other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] += 1 + else: + self.features[cam] = other.features[cam] + self.boxes[cam] = other.boxes[cam] + self.positions_2d[cam] = other.positions_2d[cam] + self.positions_3d[cam] = other.positions_3d[cam] + self.inactive_since[cam] = 0 + self.track_where[cam] = True + self.queries[cam] = False + self.ticks[cam] = other.ticks[cam] + else: + if self.track_where[cam]: + self.inactive_since[cam] += 1 + else: + raise ValueError( + f"Frame of other must be greater or equal to frame of self, but got {self.frame} and {other.frame}." + ) + self.last_update = other.frame + self.frame = other.frame + + if self.state == TrackState.LOST: + self.activate() + + def predict(self): + for cam in range(self.n_cams): + if ~self.track_where[cam]: + continue + prd_box = self.boxes[cam] + self.velocities_2d[cam] + prd_pos = self.positions_3d[cam] + self.velocities_3d[cam] + if prd_box[2] <= 0 or prd_box[3] <= 0: + prd_box = self.boxes[cam] + prd_pos = self.positions_3d[cam] + self.boxes[cam] = prd_box + self.positions_3d[cam] = prd_pos + + def merge(self, other): + if other.state == TrackState.KILLED or self.state == TrackState.KILLED: + raise ValueError("Cannot merge killed tracks.") + if other.frame < self.frame: + raise ValueError( + f"Other track must not be older than self, but " + f"self is at frame {self.frame} and other at frame {other.frame}." 
+ ) + self.update( + other.frame, + other.features, + other.boxes, + other.positions_2d, + other.positions_3d, + ) + # other was merged into self, so it is killed + other.kill() + + def split(self, where: torch.Tensor): + # keep the cams where "where" is True + other_features = self.features.clone() + other_boxes = self.boxes.clone() + other_positions_2d = self.positions_2d.clone() + other_positions_3d = self.positions_3d.clone() + for w in where: + if not w: + self.features[w] = torch.nan + self.boxes[w] = torch.nan + self.positions_2d[w] = torch.nan + self.positions_3d[w] = torch.nan + else: + other_features[w] = torch.nan + other_boxes[w] = torch.nan + other_positions_2d[w] = torch.nan + other_positions_3d[w] = torch.nan + return SuperTrack( + frame=self.frame, + features=other_features, + boxes=other_boxes, + positions_2d=other_positions_2d, + positions_3d=other_positions_3d, + ) + + def __repr__(self): + return f"Track {self.label}" + + def to_tensor(self): + output = [] + if self.state == TrackState.LOST: + return torch.Tensor(output) + for i, box in enumerate(self.boxes): + if ~self.track_where[i]: + continue + row = [i, self.label, self.frame, *box, *self.mean_positions_3d] + output.append(row) + return torch.Tensor(output) diff --git a/src/tracker/tracker.py b/src/tracker/tracker.py new file mode 100644 index 0000000..3e53430 --- /dev/null +++ b/src/tracker/tracker.py @@ -0,0 +1,467 @@ +import statistics +import time +from typing import Any, List, Optional, Tuple + +import motmetrics as mm +import torch +from omegaconf import DictConfig +from scipy.optimize import linear_sum_assignment +from torchvision.ops import box_iou + +from .similarities import batch_bev_distance, batch_cosine_similarity, batched_box_iou +from .solver import multicut, scale_weights +from .supertrack import SuperTrack, TrackState + + +class Tracker: + def __init__( + self, + solver_opts: Any, + cfg: DictConfig, + n_cams: int, + feature_extractor: Optional[torch.nn.Module] = None, + device: Optional[torch.device] = "cpu", + ): + self.feature_extractor = feature_extractor + self.solver_opts = solver_opts + self.device = device + + self.current_data = None + + self.feature_dim = cfg.tracker.fdim + self.n_cams = n_cams + self.cfg = cfg.tracker + + self.tracks: List[SuperTrack] = [] + + self.frame = 0 + self.free_id = 1 + + self.latency = [] + + self.update_interval = 1 + self.stats = { + "# Killed": 0, + "Latency": 0, + } + + self.cumulative_execution_time = 0 + + def step(self, sample): + # move sample to device and remove batch dimension + t0 = time.time() + for key in sample.keys(): + if key != "images": + sample[key] = sample[key].to(self.device).squeeze(0) + self.frame += 1 + if self.frame % self.update_interval == 0: + if sample["annotations"].size(0) > 0: + matched, unmatched = self.update(sample) + self._handle_unmatched(unmatched) + + t1 = time.time() + self.cumulative_execution_time += t1 - t0 + self.latency.append(t1 - t0) + + self._sanitize() + + rresults = self.get_result() + + self.predict() + + presults = self.get_result() + + return rresults, presults + + def update(self, sample): + features = self.feature_extractor(sample) + superboxes = self._new_superboxes_from_data(sample, features) + superboxes = [s for s in superboxes if s.confidence >= self.cfg.confidence_thresh] + + relevant_tracks = self.tracks + superboxes + _track_indices = torch.arange(len(self.tracks)).to(self.device) + _superbox_indices = torch.arange(len(self.tracks), len(relevant_tracks)).to(self.device) + + low_conf_indices = None 
+ + if self.cfg.low_confidence_thresh is not None: + c1 = self.cfg.low_confidence_thresh + c2 = self.cfg.confidence_thresh + low_conf_superboxes = [s for s in superboxes if c1 <= s.confidence < c2] + + if len(low_conf_superboxes) > 0: + n_relevant = len(relevant_tracks) + relevant_tracks += low_conf_superboxes + low_conf_indices = torch.arange(n_relevant, n_relevant + len(low_conf_superboxes)) + + if len(relevant_tracks) == 0: + return [], [] + + features = torch.stack([track.p_features for track in relevant_tracks]) # (n_tracks, n_cams, feature_dim) + positions = torch.stack([track.p_positions for track in relevant_tracks]) # (n_tracks, n_cams, 2) + boxes = torch.stack([track.tlbr for track in relevant_tracks]) # (n_tracks, n_cams, 4) + + # compute (n_tracks) x (n_tracks) similarity matrix + similarities = self._compute_similarities(features, positions, boxes) + + # compute weighted graph + rescale_thresh = self.cfg.matching.rescale_threshold + dist_thresh = self.cfg.matching.distance_threshold + iou_bias = self.cfg.prematching.iou_bias if self.cfg.prematching.enabled else 0 + edge_index, edge_weights = self._build_weighted_graph( + relevant_tracks, + similarities, + rescale_thresh, + dist_thresh, + iou_bias, + reid_decay=self.cfg.matching.reid_decay, + ) + labels = multicut(edge_index, edge_weights, self.solver_opts) + + matched_tracks, unmatched_tracks = self._match(relevant_tracks, labels, low_conf_indices=low_conf_indices) + + self.tracks = matched_tracks + unmatched_tracks + return matched_tracks, unmatched_tracks + + def _handle_unmatched(self, unmatched_tracks): + for track in unmatched_tracks: + for cam in range(self.n_cams): + if track.track_where[cam]: + track.inactive_since[cam] += 1 + + def predict(self): + """ + Project existing tracks into the future. + """ + for track in self.tracks: + track.predict() + + def _new_superboxes_from_data(self, sample, sample_features): + """ + Given a sample and its features, create new superboxes. 
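+
+        Each detection becomes a single-camera SuperTrack: every per-camera slot
+        is initialised to NaN, and only the detection's source camera is filled
+        with its feature vector, box, and 2D/3D position.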
+ """ + n_rows = sample_features.shape[0] + + features = torch.full((n_rows, self.n_cams, self.feature_dim), float("nan"), device=self.device) + boxes = torch.full((n_rows, self.n_cams, 4), float("nan"), device=self.device) + positions_2d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) + positions_3d = torch.full((n_rows, self.n_cams, 2), float("nan"), device=self.device) + + cam_ids = sample["annotations"][:, 0].int() + features[torch.arange(n_rows), cam_ids] = sample_features + boxes[torch.arange(n_rows), cam_ids] = sample["annotations"][:, 3:7] + positions_2d[torch.arange(n_rows), cam_ids] = sample["positions_2d"] + positions_3d[torch.arange(n_rows), cam_ids] = sample["positions_3d"] + confidences = sample["annotations"][:, 7] + + superboxes = [ + SuperTrack( + frame=self.frame, + features=features[row], + boxes=boxes[row], + positions_2d=positions_2d[row], + positions_3d=positions_3d[row], + confidence=confidences[row], + ) + for row in range(n_rows) + ] + + return superboxes + + def _merge_tracks(self, tracks): + _frames = sorted({track.frame for track in tracks}) + + newest_frame = _frames[-1] + if len(_frames) > 1: + penult_frame = _frames[-2] + + assert tracks[-1].frame == newest_frame + + newest_evidence = [track for track in tracks if track.frame == newest_frame] + + features = (torch.ones(self.n_cams, self.feature_dim) * (torch.nan)).to(self.device) + boxes = (torch.ones(self.n_cams, 4) * (torch.nan)).to(self.device) + positions_2d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) + positions_3d = (torch.ones(self.n_cams, 2) * (torch.nan)).to(self.device) + track_where = torch.zeros(self.n_cams, dtype=torch.bool).to(self.device) + + for cam_id in range(self.n_cams): + for track in newest_evidence: + if not torch.isnan(track.features[cam_id]).any(): + features[cam_id] = track.features[cam_id] + boxes[cam_id] = track.boxes[cam_id] + positions_2d[cam_id] = track.positions_2d[cam_id] + positions_3d[cam_id] = track.positions_3d[cam_id] + track_where[cam_id] = True + break + + merged_track = SuperTrack( + frame=newest_frame, + features=features, + boxes=boxes, + positions_2d=positions_2d, + positions_3d=positions_3d, + ) + + if len(_frames) > 1: + penult_track = [track for track in tracks if track.frame == penult_frame][0] + penult_track.update(merged_track) + merged_track = penult_track + + return merged_track + + def _match(self, tracks, labels, low_conf_indices=None): + """ + Match superboxes with superboxes, and merged + superboxes with existing supertracks in one cut. 
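+
+        Singleton clusters of fresh superboxes become new tracks, singleton
+        clusters of existing tracks are returned as unmatched, and multi-member
+        clusters are fused via `_merge_tracks`, which keeps the newest
+        per-camera evidence.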
+ """ + new_tracks = [] + unmatched_tracks = [] + + for label in torch.unique(labels): + track_indices = torch.where(labels == label)[0].tolist() + if len(track_indices) == 1: + track = tracks[track_indices[0]] + if low_conf_indices is not None and track_indices[0] in low_conf_indices: + continue + if track.state == TrackState.CREATED: + new_tracks.append(track) + else: + unmatched_tracks.append(track) + else: + if low_conf_indices is None: + relevant_tracks = sorted([tracks[i] for i in track_indices], key=lambda x: x.frame) + else: + relevant_tracks = sorted( + [tracks[i] for i in track_indices if i not in low_conf_indices], key=lambda x: x.frame + ) + merged_track = self._merge_tracks(relevant_tracks) + if low_conf_indices is not None and not merged_track.is_complete(): + relevant_low_conf_tracks = [tracks[i] for i in track_indices if i in low_conf_indices] + merged_track = self._merge_tracks([merged_track] + relevant_low_conf_tracks) + new_tracks.append(merged_track) + + return new_tracks, unmatched_tracks + + @staticmethod + def _compute_similarities(features, positions, boxes): + """Compute similarity matrices for features, positions, and boxes. + + Args: + features (torch.Tensor): (n_tracks, n_cams, feature_dim) tensor. + positions (torch.Tensor): (n_tracks, n_cams, 2) tensor. + boxes (torch.Tensor): (n_tracks, n_cams, 4) tensor. + + Returns: + Tuple[torch.Tensor]: Tuple of similarity matrices. + """ + # permute to (n_cams, n_tracks, feature_dim), (n_cams, n_tracks, 2), (n_cams, n_tracks, 4) + features = features.permute(1, 0, 2) + positions = positions.permute(1, 0, 2) + boxes = boxes.permute(1, 0, 2) + + # compute pairwise similarities (n_cams, n_tracks, n_tracks) + feature_sim = batch_cosine_similarity(features, features) + position_dist = batch_bev_distance(positions) + iou_sim = batched_box_iou(boxes) + + # average-pool similarities to (n_tracks, n_tracks) + feature_sim = torch.nanmean(feature_sim, dim=0) + position_dist = torch.nanmean(position_dist, dim=0) + iou_sim = torch.nanmean(iou_sim, dim=0) + + return feature_sim, position_dist, iou_sim + + def _build_weighted_graph( + self, + tracks: List[SuperTrack], + similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + rescale_thresh: float, + dist_thresh: float, + iou_bias: float, + reid_decay: float = 1, + penalty: float = -100, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Builds a weighted graph from the given tracks and similarity matrices. + + Args: + tracks (List[SuperTrack]): List of tracks. + similarities (Tuple[torch.Tensor, torch.Tensor, torch.Tensor]): Tuple of similarity matrices (appearance, position, IoU). + rescale_thresh (float): Threshold for rescaling weights. + dist_thresh (float): Distance threshold for feasibility. + iou_bias (float): Bias to add for IoU-based matching. + reid_decay (float, optional): Decay factor for ReID scores. Defaults to 1. + penalty (float, optional): Penalty for infeasible edges. Defaults to -100. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: Edge indices and edge weights of the graph. 
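+
+        Example (illustrative values): for three nodes where 0-1 attract and
+        0-2 repel, the returned pair could look like
+
+            edge_index   = tensor([[0, 0],
+                                   [1, 2]])
+            edge_weights = tensor([0.8, -100.0])
+
+        i.e. an upper-triangular (u < v) edge list with signed costs, as
+        expected by `multicut`.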
+ """ + adj = self._initialize_adjacency_matrix(similarities, tracks, reid_decay, rescale_thresh, dist_thresh) + + if self.cfg.prematching.enabled: + adj = self._apply_prematching(adj, tracks, iou_bias) + + adj = self._finalize_adjacency_matrix(adj, penalty, tracks) + + return self._get_edge_index_and_weights(adj) + + def _initialize_adjacency_matrix( + self, + similarities: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], + tracks: List[SuperTrack], + reid_decay: float, + rescale_thresh: float, + dist_thresh: float, + ) -> torch.Tensor: + appearance_sim, position_dist, _ = similarities + device = appearance_sim.device + + frame_support_pairs = [(track.frame, track.track_where) for track in tracks] + frames, supports = zip(*frame_support_pairs) + + times = torch.tensor(frames, dtype=torch.int, device=device) + lost = torch.tensor([track.state == TrackState.LOST for track in tracks], device=device) + lost_since = torch.tensor([track.lost_since for track in tracks], device=device) + + appearance_sim = appearance_sim * reid_decay**lost_since + appearance_sim = scale_weights(appearance_sim, rescale_thresh) + + combined_sim = self.cfg.matching.rescale_weight * appearance_sim + self.cfg.matching.distance_weight * ( + 1 - position_dist / dist_thresh + ) + + adj = torch.zeros_like(appearance_sim) + lmask = lost[:, None] | lost[None, :] + same_time = times[:, None] == times[None, :] + feasible = (position_dist < dist_thresh) | lmask + + adj[same_time & feasible] = torch.clip(combined_sim[same_time & feasible], min=0, max=1) + adj[~same_time] = combined_sim[~same_time] + adj[lmask] = combined_sim[lmask] + + return adj + + def _apply_prematching(self, adj: torch.Tensor, tracks: List[SuperTrack], iou_bias: float) -> torch.Tensor: + cur_frame = max(track.frame for track in tracks) + pen_frame = cur_frame - 1 + cur_track_idx_by_cam = [[] for _ in range(self.n_cams)] + pen_track_idx_by_cam = [[] for _ in range(self.n_cams)] + + for i, track in enumerate(tracks): + if track.frame == cur_frame: + for cam in range(self.n_cams): + if not torch.isnan(track.boxes[cam]).any(): + cur_track_idx_by_cam[cam].append(i) + elif track.frame == pen_frame: + for cam in range(self.n_cams): + if not torch.isnan(track.boxes[cam]).any(): + pen_track_idx_by_cam[cam].append(i) + + for cam in range(self.n_cams): + cur_boxes_cam = [tracks[i].tlbr[cam] for i in cur_track_idx_by_cam[cam]] + pen_boxes_cam = [tracks[i].tlbr[cam] for i in pen_track_idx_by_cam[cam]] + + if not cur_boxes_cam or not pen_boxes_cam: + continue + + iou_dist = 1 - box_iou(torch.stack(cur_boxes_cam), torch.stack(pen_boxes_cam)) + row_ind, col_ind = linear_sum_assignment(iou_dist.cpu().numpy()) + + for r, c in zip(row_ind, col_ind): + if iou_dist[r, c] > self.cfg.prematching.iou_threshold: + continue + cur_idx = cur_track_idx_by_cam[cam][r] + if self.cfg.prematching.prune_remaining: + adj[cur_idx] = 0 + adj[:, cur_idx] = 0 + adj[cur_idx, pen_track_idx_by_cam[cam][c]] += iou_bias + adj[pen_track_idx_by_cam[cam][c], cur_idx] += iou_bias + + return adj + + def _finalize_adjacency_matrix(self, adj: torch.Tensor, penalty: float, tracks: List[SuperTrack]) -> torch.Tensor: + frame_support_pairs = [(track.frame, track.track_where) for track in tracks] + frames, supports = zip(*frame_support_pairs) + + times = torch.tensor(frames, dtype=torch.int, device=adj.device) + supps = torch.stack(supports).to(adj.device) + + same_time = times[:, None] == times[None, :] + same_supp = (supps[:, None] & supps[None, :]).any(dim=2) + + adj[same_time & same_supp] = penalty + adj = 
adj * torch.triu(torch.ones_like(adj), diagonal=1) + + return adj + + def _get_edge_index_and_weights(self, adj: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + edge_index = torch.nonzero(adj).t().long() + edge_weights = adj[edge_index[0], edge_index[1]] + return edge_index, edge_weights + + def _sanitize(self): + keep = [] + for k, track in enumerate(self.tracks): + if track.state is TrackState.CREATED: + track.activate() + if track.label is None: + track.set_label(self.free_id) + self.free_id += 1 + if torch.all(~track.track_where): + if torch.all(track.inactive_since[track.inactive_since > 0] > self.cfg.patience): + track.deactivate() + if track.state is TrackState.LOST: + if track.lost_since > self.cfg.memory: + track.kill() + else: + track.lost_since += 1 + if track.state is not TrackState.KILLED: + keep.append(track) + for cam in range(self.n_cams): + if track.inactive_since[cam] > self.cfg.patience: + track.reset([cam]) + killed = len(self.tracks) - len(keep) + self.tracks = keep + self.stats["# Tracks"] = len(self.tracks) + self.stats["# Lost"] = len([track for track in self.tracks if track.state == TrackState.LOST]) + self.stats["# Killed"] += killed + + latency = statistics.mean(self.latency) if len(self.latency) > 0 else 0 + self.stats["FPS"] = int(1 / latency) if latency > 0 else 0 + + def _get_active_tracks(self): + return [track for track in self.tracks if track.state != TrackState.KILLED] + + def get_result(self, normalization=None, scale=1.0): + """ + Return the current online state of the tracker. + """ + to_stack = [track.to_tensor() for track in self.tracks if track.state == TrackState.ACTIVE] + if len(to_stack) > 0: + result = torch.cat(to_stack) + else: + result = torch.empty(0) + if result.size(0) > 0: + if normalization is not None: + min_x, min_y, max_x, max_y = normalization + result[:, 7:9] = result[:, 7:9] * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor( + [min_x, min_y] + ) + result[:, 7:9] *= scale + return result + + def _get_index_by_id(self, tid): + for i, track in enumerate(self.tracks): + if track.label == tid: + return i + return None + + +def create_tracker(cfg, solver_cfg, feature_extractor, n_cams, device, writer=None): + return Tracker( + solver_opts=solver_cfg, + cfg=cfg, + feature_extractor=feature_extractor, + n_cams=n_cams, + device=device, + ) diff --git a/src/utils/evaluate.py b/src/utils/evaluate.py new file mode 100644 index 0000000..aef63ed --- /dev/null +++ b/src/utils/evaluate.py @@ -0,0 +1,260 @@ +import configparser +import os +import pathlib +from typing import Dict, List, Optional, Union + +import motmetrics as mm +import numpy as np +import pandas as pd +import torch +from sklearn import metrics + + +GT_COLUMNS = [ + "frame", + "id", + "bb_left", + "bb_top", + "bb_width", + "bb_height", + "conf", + "x", + "y", + "z", +] + + +def get_hota_setup(): + metrics = ["deta_alpha", "assa_alpha", "hota_alpha"] + namemap = mm.io.motchallenge_metric_names + namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) + return metrics, namemap + + +def evaluate_tracker(tracker_results, dataloader, hota_mode=False, bev_mode=False): + gt_dfs = [pd.DataFrame(gt, columns=GT_COLUMNS) for gt in dataloader.dataset._ground_truths] + ht_dfs = results_to_dfs(tracker_results) + + n_frames = [int(df["frame"].max()) for df in gt_dfs] + + gt_dfs = [mot_to_mm(df) for df in gt_dfs] + ht_dfs = [mot_to_mm(df) for df in ht_dfs] + + gt_df = combine_dataframes(gt_dfs, n_frames) + ht_df = combine_dataframes(ht_dfs, n_frames) + + 
# put column "x" to "X" + if bev_mode: + ht_df["X"] = ht_df["x"] + ht_df["Y"] = ht_df["y"] + gt_df["X"] = gt_df["x"] + gt_df["Y"] = gt_df["y"] + + return evaluate_single_scene(ht_df, gt_df, hota_mode=hota_mode, bev_mode=bev_mode) + + +def results_to_dfs(tracker_results: torch.Tensor) -> List[pd.DataFrame]: + """Converts a tensor of results to a list of dataframes. Input tensor has format + + CAM_ID, OBJ_ID, FRAME_ID, X, Y, W, H, X_WORLD, Y_WORLD + + and resulting (n_cams) dataframes have columns + + frame, id, bb_left, bb_top, bb_width, bb_height, conf, x, y, z + + Args: + tracker_results (torch.Tensor): Results tensor. + Returns: + List[pd.DataFrame]: List of dataframes. + """ + results = tracker_results.clone() + results[:, [1, 2]] = results[:, [2, 1]] + results = torch.cat((results[:, :7], torch.ones(results.shape[0], 1), results[:, 7:]), dim=1) + results = torch.cat((results, -torch.ones(results.shape[0], 1)), dim=1) + cam_res = [results[results[:, 0] == c][:, 1:] for c in torch.unique(results[:, 0]).cpu().numpy()] + return [pd.DataFrame(res, columns=GT_COLUMNS) for res in cam_res] + + +def evaluate_multi_scene(prediction_dfs, ground_truth_dfs, names=None, hota_mode=False, bev_mode=False): + """Takes prediction and ground truth dataframes and runs motmetrics evaluation + on a multiple scenes. For evaluation of multi-camera scenes, first combine a + list of single-camera predictions and ground truths using `combine_dataframes` + Args: + prediction_dfs (_type_): _description_ + ground_truth_dfs (_type_): _description_ + names (_type_, optional): _description_. Defaults to None. + Returns: + _type_: _description_ + """ + if names is None: + names = ["Untitled %s" % (i + 1) for i in range(len(prediction_dfs))] + ground_truths = dict(zip(names, ground_truth_dfs)) + predictions = dict(zip(names, prediction_dfs)) + accs = [] + names = [] + + if bev_mode: + distfields = ["X", "Y"] + dist = "seuc" + distth = 1.0 + else: + distfields = ["X", "Y", "Width", "Height"] + dist = "iou" + distth = 0.5 + + for name, prediction in predictions.items(): + if hota_mode: + raise NotImplementedError + else: + accs.append( + mm.utils.compare_to_groundtruth( + ground_truths[name], prediction, dist=dist, distfields=distfields, distth=distth + ) + ) + metrics = mm.metrics.motchallenge_metrics + namemap = mm.io.motchallenge_metric_names + names.append(name) + + mh = mm.metrics.create() + + summary = mh.compute_many( + accs, + names=names, + metrics=metrics, + generate_overall=True, + ) + namemap.update({"hota_alpha": "HOTA", "assa_alpha": "ASSA", "deta_alpha": "DETA"}) + print(mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap)) + strsummary = mm.io.render_summary(summary, formatters=mh.formatters, namemap=namemap) + return summary, strsummary + + +def evaluate_single_scene(prediction_df, ground_truth_df, hota_mode=False, bev_mode=False, name=None) -> pd.DataFrame: + """Takes a prediction and ground truth dataframe and runs motmetrics evaluation + on a single scene. For evaluation of multi-camera scenes, first combine a list + of single-camera predictions and ground truths using `combine_dataframes`. + Args: + prediction_df (_type_): Multi-camera predictions. + ground_truth_df (_type_): Multi-camera ground truth. + name (str): Scene name. Defaults to None. 
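+
+    Example (illustrative; both inputs are motmetrics-style dataframes as produced
+    by `mot_to_mm` and `combine_dataframes`):
+
+        >>> summary, strsummary = evaluate_single_scene(ht_df, gt_df, name="S02")
+        >>> print(strsummary)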
+ """ + return evaluate_multi_scene([prediction_df], [ground_truth_df], [name], hota_mode, bev_mode) + + +def mot_to_mm(df: pd.DataFrame) -> pd.DataFrame: + """Takes a MOT-style dataframe (with named columns [frame, id, ...]) + and converts it to a dataframe with column names required by motmetrics. + Args: + df (pd.DataFrame): Input MOT-style dataframe. + Returns: + pd.DataFrame: Output dataframe ready to use in motmetrics evaluation. + """ + _df = df.rename( + columns={ + "frame": "FrameId", + "id": "Id", + "bb_left": "X", + "bb_top": "Y", + "bb_width": "Width", + "bb_height": "Height", + "conf": "Confidence", + } + ) + columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] + columns_to_float = ["Confidence"] + _df[columns_to_int] = _df[columns_to_int].astype(int) + _df[columns_to_float] = _df[columns_to_float].astype(float) + return _df + + +def read_txt(path: Union[str, pathlib.Path]) -> pd.DataFrame: + _df = pd.read_csv(path, names=GT_COLUMNS) + _df = _df.rename( + columns={ + "frame": "FrameId", + "id": "Id", + "bb_left": "X", + "bb_top": "Y", + "bb_width": "Width", + "bb_height": "Height", + "conf": "Confidence", + } + ) + columns_to_int = ["FrameId", "Id", "X", "Y", "Width", "Height"] + columns_to_float = ["Confidence"] + _df[columns_to_int] = _df[columns_to_int].astype(int) + _df[columns_to_float] = _df[columns_to_float].astype(float) + return _df + + +def read_seqinfo(path: Union[str, pathlib.Path]) -> Dict: + parser = configparser.ConfigParser() + parser.read(path) + return dict(parser["Sequence"]) + + +def combine_dataframes(dataframes: List[pd.DataFrame], n_frames: Optional[List[int]] = None) -> pd.DataFrame: + """Takes a list of single-camera dataframes and combines them for + multi-camera evaluation. + Args: + dataframes (List[pd.DataFrame]): List of single-camera dataframes. + n_frames (Optional[List[int]], optional): Defaults to None. + Returns: + pd.DataFrame: Multi-camera dataframe. 
+ """ + if n_frames is None: + n_frames = [int(df["FrameId"].max()) for df in dataframes] + count_frames = 0 + dfs = [] + for j, df in enumerate(dataframes): + df["FrameId"] += count_frames + count_frames += int(n_frames[j]) + dfs.append(df) + return pd.concat(dfs).set_index(["FrameId", "Id"]) + + +def evaluate_mtmc( + data_paths: List[Union[str, pathlib.Path]], + prediction_path: Union[str, pathlib.Path], + scene_name: str, + hota_mode=False, + bev_mode=False, +): + seqinfos = [read_seqinfo(os.path.join(path, "seqinfo.ini")) for path in data_paths] + ground_truths = [read_txt(os.path.join(path, "gt", "gt.txt")) for path in data_paths] + prediction_paths = [os.path.join(prediction_path, seqinfo["name"] + ".txt") for seqinfo in seqinfos] + predictions = [read_txt(path) for path in prediction_paths] + ground_truth_df = combine_dataframes(ground_truths, [seqinfo["seqlength"] for seqinfo in seqinfos]) + prediction_df = combine_dataframes(predictions, [seqinfo["seqlength"] for seqinfo in seqinfos]) + + ground_truths = {scene_name: ground_truth_df} + predictions = {scene_name: prediction_df} + + +def evaluate_synthehicle_json(prediction, ground_truth): + preds_to_eval = [] + truths_to_eval = [] + names = [] + for scene in ground_truth.keys(): + if scene in prediction.keys(): + gcams = ground_truth[scene] + pcams = prediction[scene] + preds_to_combine = [] + truths_to_combine = [] + for cam in gcams.keys(): + if cam not in pcams.keys(): + prediction[scene][cam] = [[1, 1, 0, 0, 0, 0, 1, -1, -1, -1]] + preds_to_combine.append(mot_to_mm(pd.DataFrame(prediction[scene][cam], columns=GT_COLUMNS))) + truths_to_combine.append(mot_to_mm(pd.DataFrame(ground_truth[scene][cam], columns=GT_COLUMNS))) + names.append(scene) + preds_to_eval.append(combine_dataframes(preds_to_combine, n_frames=[1800] * len(preds_to_combine))) + truths_to_eval.append(combine_dataframes(truths_to_combine, n_frames=[1800] * len(truths_to_combine))) + return evaluate_multi_scene(preds_to_eval, truths_to_eval, names) + + +def clustering_performance(y_true, y_pred): + y_t, y_p = y_true.cpu().numpy(), y_pred.cpu().numpy() + return { + "ARI": metrics.adjusted_rand_score(y_t, y_p), + "AMI": metrics.adjusted_mutual_info_score(y_t, y_p), + } diff --git a/src/utils/iotools.py b/src/utils/iotools.py new file mode 100644 index 0000000..5f472b8 --- /dev/null +++ b/src/utils/iotools.py @@ -0,0 +1,113 @@ +import os + +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter + +from .utils import expand_boxes, remove_border_boxes, size_filter + + +class ResultsWriter: + def __init__(self, output_path, cfg, normalization=None, camera_names=None): + self._results = [] + + self.cfg = cfg + self.output_path = output_path + self._norm_factors = normalization + + self.rows = cfg.visuals.grid_rows + self.plot_results = cfg.visuals.plot_results + self.plot_every = cfg.visuals.plot_interval + + self.camera_names = camera_names + + self.writer = None + + if cfg.logging.tensorboard.enable: + self.writer = SummaryWriter() + + self.store_files = cfg.visuals.store_files + self.results_file = os.path.join(output_path, "results.txt") + + self.offsets = cfg.dataset.offsets if hasattr(cfg.dataset, "offsets") else [0] * len(camera_names) + + self.on_bev = True if cfg.dataset.name == "WildTrack" else False + + self._save_function = self.get_save_function(cfg) + + if os.path.exists(self.results_file): + os.remove(self.results_file) + + os.makedirs(output_path, exist_ok=True) + + @property + def results(self): + results = 
torch.cat(self._results, dim=0) + for i, offset in enumerate(self.offsets): + results[results[:, 0] == i, 2] -= offset + # multiply camera column by (-1) + results[:, 0] *= -1 + for i, name in enumerate(self.camera_names): + # this is a bit hacky if camera does not start with letter + try: + name_int = int(name[1:]) + except ValueError: + # fallback to index of camera + name_int = i + results[results[:, 0] == -i, 0] = name_int + if self.cfg.postprocess.expand_boxes.enable: + factor = self.cfg.postprocess.expand_boxes.factor + results[:, 3:7] = expand_boxes(results[:, 3:7], factor) + if self.cfg.postprocess.remove_borders.enable: + boxes = results[:, 3:7] + border = self.cfg.postprocess.remove_borders.border_size + keep = remove_border_boxes(boxes, border) + results = results[keep] + if self.cfg.postprocess.size_filter.enable: + boxes = results[:, 3:7] + keep = size_filter( + boxes, self.cfg.postprocess.size_filter.min_size, self.cfg.postprocess.size_filter.max_size + ) + results = results[keep] + return results + + def add(self, result): + _result = result.clone() + if self._norm_factors is not None: + _result = self.denormalize_bev(_result[:, 7:9]) + self._results.append(result) + + def save(self): + if self._results: + self._save_function(self.results.cpu().numpy()) + + def _to_aicity19(self, result): + # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 + np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") + + def _to_aicity24(self, result): + # CAMERA_ID OBJ_ID FRAME X Y W H 1 X_BEV Y_BEV -1 + np.savetxt(self.results_file, result, fmt="%d %d %d %d %d %d %d %f %f") + + def _to_synthehicle(self, result): + # CAMERA, FRAME, ID, X, Y, W, H, SCORE, X_BEV, Y_BEV + np.savetxt(self.results_file, result[:, [2, 1]], fmt="%d", delimiter=",") + + def get_save_function(self, cfg): + if "WildTrack" in cfg.dataset.name: + return self._to_wildtrack + elif "AICITY24" in cfg.dataset.name: + return self._to_aicity19 + elif "AICITY" in cfg.dataset.name or "CityFlow" in cfg.dataset.name: + return self._to_aicity24 + else: + return self._to_synthehicle + + def denormalize_bev(self, positions): + min_x, min_y, max_x, max_y = self._norm_factors + return positions * torch.tensor([max_x - min_x, max_y - min_y]) + torch.tensor([min_x, min_y]) + + def squeeze_batch(self, x: torch.Tensor): + if x.dim() == 4 and x.size(0) == 1: + return x.squeeze(0) + return x diff --git a/src/utils/utils.py b/src/utils/utils.py new file mode 100644 index 0000000..c363063 --- /dev/null +++ b/src/utils/utils.py @@ -0,0 +1,208 @@ +import math +import random +from typing import List, Optional, Tuple + +import matplotlib.pyplot as plt +import torch +from torch.utils.tensorboard import SummaryWriter +from torchvision import transforms +from torchvision.io import write_jpeg +from torchvision.utils import draw_bounding_boxes, make_grid + + +def resize_transform(img, size=(256, 128)): + """ + Resize a torch image to the specified size. + Used before passing the image to reid model. + """ + transform = transforms.Compose( + [ + transforms.ToPILImage(), + transforms.Resize((size[0], size[1])), + transforms.ToTensor(), + ] + ) + return transform(img) + + +def compute_centers(boxes, bottom=True, box_projection_centers=None): + """ + Compute the 2D centers of a torch tensor of bounding boxes. 
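+
+    Boxes are expected in (top-left x, top-left y, width, height) format. With
+    `bottom=True` the bottom-center of each box is returned; when
+    `box_projection_centers=(alpha_w, alpha_h)` is given, the vertical coordinate
+    is placed at y + alpha_h * h instead. Passing `bottom=True` together with
+    `box_projection_centers` raises a ValueError.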
+ """ + if bottom is True and box_projection_centers is not None: + raise ValueError("Cannot project boxes to bottom and use box_projection_centers simultaneously.") + centers = torch.zeros((boxes.shape[0], 2)) + centers[:, 0] = boxes[:, 0] + boxes[:, 2] / 2 + if box_projection_centers is not None: + alpha_w, alpha_h = box_projection_centers + centers[:, 1] = boxes[:, 1] + alpha_h * boxes[:, 3] + elif bottom: + centers[:, 1] = boxes[:, 1] + boxes[:, 3] + else: + centers[:, 1] = boxes[:, 1] + boxes[:, 3] / 2 + return centers + + +def tlwh_to_xyah(tlwh): + """ + Convert bounding box to format `(center x, center y, aspect ratio, + height)`, where the aspect ratio is `width / height`. + """ + ret = tlwh.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, :2] += ret[:, 2:] / 2 + ret[:, 2] /= ret[:, 3] + return ret + + +def xyah_to_tlwh(xyah): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + """ + ret = xyah.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, 2] *= ret[:, 3] + ret[:, :2] -= ret[:, 2:] / 2 + return ret + + +def tlwh_to_tlbr(tlwh): + """Convert bounding box to format `(top left x, top left y, bottom right + x, bottom right y)`. + """ + ret = tlwh.clone() + if ret.dim() == 1: + ret = ret.unsqueeze(0) + ret[:, 2:] += ret[:, :2] + return ret + + +def expand_boxes(in_boxes, factor): + boxes = in_boxes.clone() + cx, cy = boxes[:, 0] + boxes[:, 2] / 2, boxes[:, 1] + boxes[:, 3] / 2 + w, h = boxes[:, 2] * factor, boxes[:, 3] * factor + boxes[:, 0] = cx - w / 2 + boxes[:, 1] = cy - h / 2 + boxes[:, 2] = w + boxes[:, 3] = h + return boxes + + +def remove_border_boxes(boxes, border): + xy1x2y2 = tlwh_to_tlbr(boxes) + keep = ( + (xy1x2y2[:, 0] > border) + & (xy1x2y2[:, 1] > border) + & (xy1x2y2[:, 2] < (1920 - border)) + & (xy1x2y2[:, 3] < (1080 - border)) + ) + return keep + + +def size_filter(boxes, size_min, size_max): + sizes = boxes[:, 2] * boxes[:, 3] + keep = (sizes >= size_min) & (sizes <= size_max) + return keep + + +def mpl_cmap_to_rgb(cmap_name: str, seed: int = 0) -> List[Tuple[int, int, int]]: + """Returns a list of RGB values from a matplotlib colormap.""" + cmap = plt.get_cmap(cmap_name) + colors = [] + for i in range(cmap.N): + rgb = cmap(i)[:3] + colors.append(tuple(int(255 * c) for c in rgb)) + random.seed(seed) + random.shuffle(colors) + return colors + + +def render_image_grid(images: List[torch.Tensor], *args, **kwargs) -> torch.Tensor: + """Renders a grid of images. + + Args: + images (List[torch.Tensor]): List of N images of shape (C, H, W). + *args: Additional arguments to pass to the make_grid function. + **kwargs: Additional keyword arguments to pass to the make_grid function. + + Returns: + torch.Tensor: Image grid of shape (C, H, W). + """ + images = torch.stack(images) + nrow = math.ceil(math.sqrt(len(images))) + return make_grid(images, nrow=nrow, *args, **kwargs) + + +def render_images_with_boxes( + image: torch.Tensor, + boxes: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + confs: Optional[torch.Tensor] = None, + colors: Optional[List[Tuple[int, int, int]]] = None, + *args, + **kwargs, +) -> List[torch.Tensor]: + """Render image with bounding boxes. Colors correspond to the label index. Boxes are + expected to be in MOT-format, i.e., (bb_left, bb_top, bb_widht, bb_height). + + Args: + images (torch.Tensor): Image of shape (C, H, W). + boxes (torch.Tensor): Boxes of shape (K, 4). + labels (torch.Tensor): Label of shape (K,). 
+ colors (Optional[List[Tuple[int, int, int]]]): List of RGB colors. Defaults to None. + *args: Additional arguments to pass to the draw_bounding_boxes function. + **kwargs: Additional keyword arguments to pass to the draw_bounding_boxes function. + + Returns: + torch.Tensor: Image with bounding boxes. + """ + if boxes is None: + return image + + if colors is None: + colors = mpl_cmap_to_rgb("rainbow") + + if labels is None: + labels = torch.zeros(boxes.size(0)) + + color_palette = [colors[label % len(colors)] for label in labels] + + _labels = [str(label.item()) for i, label in enumerate(labels)] + + if confs is not None: + _labels = [f"{label} ({conf.item():.2f})" for label, conf in zip(_labels, confs)] + + img = image.clone() + bxs = boxes.clone() + bxs[:, 2:] += bxs[:, :2] + + img = draw_bounding_boxes( + img, + bxs, + labels=_labels, + colors=color_palette, + *args, + **kwargs, + ) + return img + + +def normalize_features(x): + # shape of x: (C, N, F) + # normalize features per channelg + mean = x.mean(dim=2, keepdim=True) + std = x.std(dim=2, keepdim=True) + 1e-8 + return (x - mean) / std + + +def nanmax(x, dim=None): + """Function like torch.nanmean for max.""" + mask = torch.isnan(x) + x_masked = torch.where(mask, torch.tensor(float("-inf")).to(x.device), x) + max_vals, _ = torch.max(x_masked, dim=dim) + + # Restore NaN values if max is -inf (because all were NaN along dimension) + max_vals = torch.where(max_vals == float("-inf"), torch.tensor(float("nan")).to(x.device), max_vals) + return max_vals diff --git a/tools/track.py b/tools/track.py new file mode 100644 index 0000000..6bc5ad0 --- /dev/null +++ b/tools/track.py @@ -0,0 +1,90 @@ +import json +import os +from subprocess import PIPE, run + +import hydra +import torch +from loguru import logger +from omegaconf import DictConfig, OmegaConf +from qqdm import format_str, qqdm + +import wandb +from src.datasets.dataset import create_dataloader +from src.tracker.encoder import create_encoder +from src.tracker.solver import create_solver +from src.tracker.tracker import create_tracker +from src.utils.evaluate import evaluate_tracker +from src.utils.iotools import ResultsWriter + + +@hydra.main(version_base=None, config_path="../conf", config_name="config") +def main(cfg: DictConfig) -> None: + if cfg.device == "cpu" or not torch.cuda.is_available(): + raise ValueError("This code runs on CUDA only. 
Please set device to 'cuda'.") + else: + device = torch.device(cfg.device) + logger.info(f"🚀 Using device: {device}") + + cfg.tracker.matching.distance_weight = 1 - cfg.tracker.matching.rescale_weight + + # create output directories + output_path = os.path.join(cfg.output_path) + os.makedirs(output_path, exist_ok=True) + output_path = os.path.join(output_path, cfg.dataset.name) + logger.info(f"📂 Writing to output path: {output_path}") + + # Initialize wandb and tensorboard + if cfg.logging.wandb.enable: + wandb.init(project=cfg.logging.wandb.project) + wandb.config.update(OmegaConf.to_container(cfg)) + if cfg.logging.wandb.tags is not None: + wandb.run.tags = cfg.logging.wandb.tags + + # Initialize solver + solver_opts = create_solver(cfg.solver.backend) + logger.info(f"✨ Initialized solver, using backend: {cfg.solver.backend}") + + # Initialize dataset and dataloader + dataloader = create_dataloader(cfg) + logger.info("✨ Created dataloader.") + + # Initialize encoder + encoder = create_encoder(cfg.encoder, device) + logger.info("✨ Created encoder.") + + tracker = create_tracker(cfg, solver_opts, encoder, len(dataloader.dataset.camera_names), device) + logger.info("✨ Initialized tracker.") + + results_writer = ResultsWriter( + output_path=output_path, + cfg=cfg, + normalization=dataloader.dataset._norm_factors, + camera_names=dataloader.dataset.camera_names, + ) + + tw = qqdm(range(len(dataloader)), desc=format_str("bold", "Description")) + for i, batch in enumerate(dataloader): + results, _ = tracker.step(batch) + results_writer.add(results) + stats = tracker.stats + tw.set_infos(stats) + tw.update() + + if cfg.logging.wandb.enable: + _stats_str_to_float = {k: float(v) for k, v in stats.items()} + wandb.log(_stats_str_to_float, step=i) + + logger.info(f"🕒 Cumulative execution time of tracker {tracker.cumulative_execution_time * 10}") + logger.info(f"🕒 Average time per frame {tracker.cumulative_execution_time / tracker.frame}") + + results_writer.save() + + logger.info("🚀 Tracking completed.") + logger.info( + f"📈 Results saved to {results_writer.results_file}. " + "Use the official evaluation script of the dataset for evaluation." + ) + + +if __name__ == "__main__": + main()
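+
+
+# Example invocation (a sketch only; any key in the Hydra config can be overridden
+# on the command line, and the override values below are illustrative assumptions,
+# not shipped defaults):
+#
+#   python tools/track.py device=cuda solver.backend=PD \
+#       tracker.matching.rescale_weight=0.5 logging.wandb.enable=false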