train_ace.py

#!/usr/bin/env python3
# Copyright © Niantic, Inc. 2022.

import argparse
import logging
from distutils.util import strtobool
from pathlib import Path

from ace_trainer import TrainerACE
from ace_trainer_with_mask import TrainerACE as TrainerACE_mask


def _strtobool(x):
    return bool(strtobool(x))


if __name__ == "__main__":
    # Setup logging levels.
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Fast training of a scene coordinate regression network.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    parser.add_argument(
        "scene",
        type=Path,
        help='path to a scene in the dataset folder, e.g. "datasets/Cambridge_GreatCourt"',
    )

    parser.add_argument(
        "output_map_file", type=Path, help="target file for the trained network"
    )

    parser.add_argument(
        "--constraint_mask",
        type=int,
        default=1,
        help="file containing pre-trained encoder weights",
    )

    parser.add_argument(
        "--sampling_radius",
        type=int,
        default=5,
        help="file containing pre-trained encoder weights",
    )

    parser.add_argument(
        "--use_mask",
        type=bool,
        default=True,
        help="file containing pre-trained encoder weights",
    )

    parser.add_argument(
        "--mask_dir",
        type=Path,
        default=False,
        help="file containing pre-trained encoder weights",
    )

    parser.add_argument(
        "--encoder_path",
        type=Path,
        default=Path(__file__).parent / "ace_encoder_pretrained.pt",
        help="file containing pre-trained encoder weights",
    )

    parser.add_argument(
        "--num_head_blocks",
        type=int,
        default=1,
        help="depth of the regression head, defines the map size",
    )

    parser.add_argument(
        "--learning_rate_min",
        type=float,
        default=0.0005,
        help="lowest learning rate of 1 cycle scheduler",
    )

    parser.add_argument(
        "--learning_rate_max",
        type=float,
        default=0.005,
        help="highest learning rate of 1 cycle scheduler",
    )

    parser.add_argument(
        "--training_buffer_size",
        type=int,
        default=8000000,
        help="number of patches in the training buffer",
    )

    parser.add_argument(
        "--samples_per_image",
        type=int,
        default=1024,
        help="number of patches drawn from each image when creating the buffer",
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=5120,
        help="number of patches for each parameter update (has to be a multiple of 512)",
    )

    parser.add_argument(
        "--epochs",
        type=int,
        default=16,
        help="number of runs through the training buffer",
    )

    parser.add_argument(
        "--repro_loss_hard_clamp",
        type=int,
        default=1000,
        help="hard clamping threshold for the reprojection losses",
    )

    parser.add_argument(
        "--repro_loss_soft_clamp",
        type=int,
        default=50,
        help="soft clamping threshold for the reprojection losses",
    )

    parser.add_argument(
        "--repro_loss_soft_clamp_min",
        type=int,
        default=1,
        help="minimum value of the soft clamping threshold when using a schedule",
    )

    parser.add_argument(
        "--use_half", type=_strtobool, default=True, help="train with half precision"
    )

    parser.add_argument(
        "--use_homogeneous",
        type=_strtobool,
        default=True,
        help="train with half precision",
    )

    parser.add_argument(
        "--use_aug", type=_strtobool, default=True, help="Use any augmentation."
    )

    parser.add_argument(
        "--aug_rotation", type=int, default=15, help="max inplane rotation angle"
    )

    parser.add_argument("--aug_scale", type=float, default=1.5, help="max scale factor")

    parser.add_argument(
        "--image_resolution", type=int, default=480, help="base image resolution"
    )

    parser.add_argument(
        "--repro_loss_type",
        type=str,
        default="dyntanh",
        choices=["l1", "l1+sqrt", "l1+log", "tanh", "dyntanh"],
        help="Loss function on the reprojection error. Dyn varies the soft clamping threshold",
    )

    parser.add_argument(
        "--repro_loss_schedule",
        type=str,
        default="circle",
        choices=["circle", "linear"],
        help="How to decrease the softclamp threshold during training, circle is slower first",
    )

    parser.add_argument(
        "--depth_min",
        type=float,
        default=0.1,
        help="enforce minimum depth of network predictions",
    )

    parser.add_argument(
        "--depth_target",
        type=float,
        default=10,
        help="default depth to regularize training",
    )

    parser.add_argument(
        "--depth_max",
        type=float,
        default=1000,
        help="enforce maximum depth of network predictions",
    )

    # Clustering params, for the ensemble training used in the Cambridge experiments. Disabled by default.
    parser.add_argument(
        "--num_clusters",
        type=int,
        default=None,
        help="split the training sequence in this number of clusters. disabled by default",
    )

    parser.add_argument(
        "--cluster_idx",
        type=int,
        default=None,
        help="train on images part of this cluster. required only if --num_clusters is set.",
    )

    # Params for the visualization. If enabled, it will slow down training considerably. But you get a nice video :)
    parser.add_argument(
        "--render_visualization",
        type=_strtobool,
        default=False,
        help="create a video of the mapping process",
    )

    parser.add_argument(
        "--render_target_path",
        type=Path,
        default="renderings",
        help="target folder for renderings, visualizer will create a subfolder with the map name",
    )

    parser.add_argument(
        "--render_flipped_portrait",
        type=_strtobool,
        default=False,
        help="flag for wayspots dataset where images are sideways portrait",
    )

    parser.add_argument(
        "--render_map_error_threshold",
        type=int,
        default=10,
        help="reprojection error threshold for the visualisation in px",
    )

    parser.add_argument(
        "--render_map_depth_filter",
        type=int,
        default=10,
        help="to clean up the ACE point cloud remove points too far away",
    )

    parser.add_argument(
        "--render_camera_z_offset",
        type=int,
        default=4,
        help="zoom out of the scene by moving render camera backwards, in meters",
    )

    options = parser.parse_args()
    if options.constraint_mask == 1:
        trainer = TrainerACE_mask(options)
    else:
        trainer = TrainerACE(options)
    trainer.train()