From 9b03299d409a8c6dcbca6ad2e61ef5fe6663ba6d Mon Sep 17 00:00:00 2001
From: Frederik Warburg <32867328+FrederikWarburg@users.noreply.github.com>
Date: Fri, 4 Aug 2023 16:11:55 +0200
Subject: [PATCH] Train and eval data splits (Nerfbusters) (#2207)

* nerfbusters dataparser

* nerfbuster dataparser and visualization of eval views

* resolved some errors from  pyright

* enable ns-process-data to take two videos as input and use one for eval and the other for train

* resolve some pyright issues

* instead of data being a tuple, I added eval_data as another argument

* implemented train eval split for all cases

* implemented train eval split for all cases

* reverting equi changes

* reverting reality capture changes

* raise valueerror

* revert comment

* added split override code back to dataparser

* reverted one change

* using image border now

* adding docs

---------

Co-authored-by: Ethan Weber <eweb0124@gmail.com>
---
 docs/quickstart/custom_dataset.md             |   4 +
 .../data/dataparsers/nerfstudio_dataparser.py |  59 +++++++---
 nerfstudio/data/utils/dataparsers_utils.py    | 101 ++++++++++++++++++
 nerfstudio/engine/trainer.py                  |   3 +-
 .../base_converter_to_nerfstudio_dataset.py   |   5 +-
 .../colmap_converter_to_nerfstudio_dataset.py |   4 +-
 .../images_to_nerfstudio_dataset.py           |  23 +++-
 nerfstudio/process_data/process_data_utils.py |  57 ++++++----
 .../video_to_nerfstudio_dataset.py            |  27 ++++-
 nerfstudio/scripts/process_data.py            |  12 +--
 nerfstudio/scripts/viewer/run_viewer.py       |   4 +-
 nerfstudio/viewer/server/viewer_state.py      |  49 ++++++---
 nerfstudio/viewer_beta/viewer.py              |  13 ++-
 tests/process_data/test_process_images.py     |  10 +-
 14 files changed, 288 insertions(+), 83 deletions(-)
 create mode 100644 nerfstudio/data/utils/dataparsers_utils.py

diff --git a/docs/quickstart/custom_dataset.md b/docs/quickstart/custom_dataset.md
index 1b6c240717..c826979360 100644
--- a/docs/quickstart/custom_dataset.md
+++ b/docs/quickstart/custom_dataset.md
@@ -46,6 +46,10 @@ ns-process-data {images, video} --data {DATA_PATH} --output-dir {PROCESSED_DATA_
 ns-train nerfacto --data {PROCESSED_DATA_DIR}
 ```
 
+### Training and evaluation on separate data
+
+For `ns-process-data {images, video}`, you can optionally provide separate image directories or videos for training and evaluation, as suggested in [Nerfbusters](https://ethanweber.me/nerfbusters/). To do this, run `ns-process-data {images, video} --data {DATA_PATH} --eval-data {EVAL_DATA_PATH} --output-dir {PROCESSED_DATA_DIR}`. Then, when training nerfacto, run `ns-train nerfacto --data {PROCESSED_DATA_DIR} nerfstudio-data --eval-mode filename` so that the split is determined by the `train`/`eval` tag in each image filename.
+
 ### Installing COLMAP
 
 There are many ways to install COLMAP, unfortunately it can sometimes be a bit finicky. If the following commands do not work, please refer to the [COLMAP installation guide](https://colmap.github.io/install.html) for additional installation methods. COLMAP install issues are common! Feel free to ask for help in on our [Discord](https://discord.gg/uMbNqcraFc).
diff --git a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
index 008fedfa2d..eaeda0bce6 100644
--- a/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
+++ b/nerfstudio/data/dataparsers/nerfstudio_dataparser.py
@@ -15,7 +15,6 @@
 
 from __future__ import annotations
 
-import math
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Literal, Optional, Type
@@ -26,12 +25,14 @@
 
 from nerfstudio.cameras import camera_utils
 from nerfstudio.cameras.cameras import CAMERA_MODEL_TO_TYPE, Cameras, CameraType
-from nerfstudio.data.dataparsers.base_dataparser import (
-    DataParser,
-    DataParserConfig,
-    DataparserOutputs,
-)
+from nerfstudio.data.dataparsers.base_dataparser import DataParser, DataParserConfig, DataparserOutputs
 from nerfstudio.data.scene_box import SceneBox
+from nerfstudio.data.utils.dataparsers_utils import (
+    get_train_eval_split_filename,
+    get_train_eval_split_fraction,
+    get_train_eval_split_interval,
+    get_train_eval_split_all,
+)
 from nerfstudio.utils.io import load_from_json
 from nerfstudio.utils.rich_utils import CONSOLE
 
@@ -58,8 +59,18 @@ class NerfstudioDataParserConfig(DataParserConfig):
     """The method to use to center the poses."""
     auto_scale_poses: bool = True
     """Whether to automatically scale the poses to fit in +/- 1 bounding box."""
+    eval_mode: Literal["fraction", "filename", "interval", "all"] = "fraction"
+    """
+    The method to use for splitting the dataset into train and eval. 
+    Fraction splits based on a percentage for train and the remaining for eval.
+    Filename splits based on filenames containing train/eval.
+    Interval uses every nth frame for eval.
+    All uses all the images for any split.
+    """
     train_split_fraction: float = 0.9
-    """The fraction of images to use for training. The remaining images are for eval."""
+    """The fraction of the dataset to use for training. Only used when eval_mode is fraction."""
+    eval_interval: int = 8
+    """The interval between frames to use for eval. Only used when eval_mode is interval."""
     depth_unit_scale_factor: float = 1e-3
     """Scales the depth values to meters. Default value is 0.001 for a millimeter to meter conversion."""
 
@@ -105,9 +116,18 @@ def _generate_dataparser_outputs(self, split="train"):
         width = []
         distort = []
 
+        # sort the frames by fname
+        fnames = []
         for frame in meta["frames"]:
             filepath = Path(frame["file_path"])
             fname = self._get_fname(filepath, data_dir)
+            fnames.append(fname)
+        inds = np.argsort(fnames)
+        frames = [meta["frames"][ind] for ind in inds]
+
+        for frame in frames:
+            filepath = Path(frame["file_path"])
+            fname = self._get_fname(filepath, data_dir)
 
             if not fx_fixed:
                 assert "fl_x" in frame, "fx not specified in frame"
@@ -182,16 +202,21 @@ def _generate_dataparser_outputs(self, split="train"):
         elif has_split_files_spec:
             raise RuntimeError(f"The dataset's list of filenames for split {split} is missing.")
         else:
-            # filter image_filenames and poses based on train/eval split percentage
-            num_images = len(image_filenames)
-            num_train_images = math.ceil(num_images * self.config.train_split_fraction)
-            num_eval_images = num_images - num_train_images
-            i_all = np.arange(num_images)
-            i_train = np.linspace(
-                0, num_images - 1, num_train_images, dtype=int
-            )  # equally spaced training images starting and ending at 0 and num_images-1
-            i_eval = np.setdiff1d(i_all, i_train)  # eval images are the remaining images
-            assert len(i_eval) == num_eval_images
+            # find train and eval indices based on the eval_mode specified
+            if self.config.eval_mode == "fraction":
+                i_train, i_eval = get_train_eval_split_fraction(image_filenames, self.config.train_split_fraction)
+            elif self.config.eval_mode == "filename":
+                i_train, i_eval = get_train_eval_split_filename(image_filenames)
+            elif self.config.eval_mode == "interval":
+                i_train, i_eval = get_train_eval_split_interval(image_filenames, self.config.eval_interval)
+            elif self.config.eval_mode == "all":
+                CONSOLE.log(
+                    "[yellow] Be careful with '--eval-mode=all'. If using camera optimization, the cameras may diverge in the current implementation, giving unpredictable results."
+                )
+                i_train, i_eval = get_train_eval_split_all(image_filenames)
+            else:
+                raise ValueError(f"Unknown eval mode {self.config.eval_mode}")
+
             if split == "train":
                 indices = i_train
             elif split in ["val", "test"]:
diff --git a/nerfstudio/data/utils/dataparsers_utils.py b/nerfstudio/data/utils/dataparsers_utils.py
new file mode 100644
index 0000000000..b48323f21e
--- /dev/null
+++ b/nerfstudio/data/utils/dataparsers_utils.py
@@ -0,0 +1,101 @@
+# Copyright 2022 the Regents of the University of California, Nerfstudio Team and contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Data parser utils for nerfstudio datasets. """
+
+import math
+import os
+from typing import List, Tuple
+
+import numpy as np
+
+
+def get_train_eval_split_fraction(image_filenames: List, train_split_fraction: float) -> Tuple[np.ndarray, np.ndarray]:
+    """Split image indices into evenly spaced train indices and the remaining eval indices.
+
+    Args:
+        image_filenames: list of image filenames
+        train_split_fraction: fraction of images to use for training, must be in [0, 1]
+
+    Returns:
+        A tuple (i_train, i_eval) of index arrays into image_filenames.
+
+    Raises:
+        ValueError: if train_split_fraction is outside [0, 1].
+    """
+    if not 0.0 <= train_split_fraction <= 1.0:
+        raise ValueError(f"train_split_fraction must be in [0, 1], got {train_split_fraction}")
+    num_images = len(image_filenames)
+    num_train_images = math.ceil(num_images * train_split_fraction)
+    i_train = np.linspace(0, num_images - 1, num_train_images, dtype=int)  # evenly spaced over [0, num_images - 1]
+    i_eval = np.setdiff1d(np.arange(num_images), i_train)  # eval images are the remaining images
+    assert len(i_eval) == num_images - num_train_images
+    return i_train, i_eval
+
+
+def get_train_eval_split_filename(image_filenames: List) -> Tuple[np.ndarray, np.ndarray]:
+    """Split image indices into train and eval based on the basename of each image file.
+
+    Args:
+        image_filenames: list of image filenames
+
+    Returns:
+        A tuple (i_train, i_eval) of integer index arrays into image_filenames.
+
+    Raises:
+        ValueError: if a basename contains neither "train" nor "eval".
+    """
+    i_train = []
+    i_eval = []
+    for idx, image_filename in enumerate(image_filenames):
+        basename = os.path.basename(image_filename)
+        if "train" in basename:  # checked first, so a name containing both counts as train
+            i_train.append(idx)
+        elif "eval" in basename:
+            i_eval.append(idx)
+        else:
+            raise ValueError(f"{basename} should contain train/eval in its name to use the filename eval mode")
+    return np.array(i_train, dtype=int), np.array(i_eval, dtype=int)
+
+
+def get_train_eval_split_interval(image_filenames: List, eval_interval: int) -> Tuple[np.ndarray, np.ndarray]:
+    """Split image indices so that every eval_interval-th image, starting at index 0, is used for eval.
+
+    Args:
+        image_filenames: list of image filenames
+        eval_interval: interval of images to use for eval, must be positive
+
+    Returns:
+        A tuple (i_train, i_eval) of index arrays into image_filenames.
+
+    Raises:
+        ValueError: if eval_interval is not positive.
+    """
+    if eval_interval <= 0:
+        raise ValueError(f"eval_interval must be a positive integer, got {eval_interval}")
+    indices = np.arange(len(image_filenames))
+    return indices[indices % eval_interval != 0], indices[indices % eval_interval == 0]
+
+
+def get_train_eval_split_all(image_filenames: List) -> Tuple[np.ndarray, np.ndarray]:
+    """Use every image for both train and eval.
+
+    Args:
+        image_filenames: list of image filenames
+
+    Returns:
+        A tuple (i_train, i_eval); both entries are the same array holding
+        every index into image_filenames.
+    """
+    every_index = np.arange(len(image_filenames))
+    return every_index, every_index
diff --git a/nerfstudio/engine/trainer.py b/nerfstudio/engine/trainer.py
index 489543f0a3..94256809cf 100644
--- a/nerfstudio/engine/trainer.py
+++ b/nerfstudio/engine/trainer.py
@@ -338,8 +338,9 @@ def _init_viewer_state(self) -> None:
         """Initializes viewer scene with given train dataset"""
         assert self.viewer_state and self.pipeline.datamanager.train_dataset
         self.viewer_state.init_scene(
-            dataset=self.pipeline.datamanager.train_dataset,
+            train_dataset=self.pipeline.datamanager.train_dataset,
             train_state="training",
+            eval_dataset=self.pipeline.datamanager.eval_dataset,
         )
 
     @check_viewer_enabled
diff --git a/nerfstudio/process_data/base_converter_to_nerfstudio_dataset.py b/nerfstudio/process_data/base_converter_to_nerfstudio_dataset.py
index e46601479d..1c7f8cfbdd 100644
--- a/nerfstudio/process_data/base_converter_to_nerfstudio_dataset.py
+++ b/nerfstudio/process_data/base_converter_to_nerfstudio_dataset.py
@@ -19,16 +19,19 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Optional
 
 
 @dataclass
 class BaseConverterToNerfstudioDataset(ABC):
-    """Base class to process images or video into a nerfstudio dataset"""
+    """Base class to process images or video into a nerfstudio dataset."""
 
     data: Path
     """Path the data, either a video file or a directory of images."""
     output_dir: Path
     """Path to the output directory."""
+    eval_data: Optional[Path] = None
+    """Path to the eval data, either a video file or a directory of images. If None, `data` is used for both training and eval."""
     verbose: bool = False
     """If True, print extra logging."""
 
diff --git a/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py b/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py
index 13ed7da66f..7ab711b46e 100644
--- a/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py
+++ b/nerfstudio/process_data/colmap_converter_to_nerfstudio_dataset.py
@@ -19,9 +19,7 @@
 from typing import Dict, List, Literal, Optional, Tuple
 
 from nerfstudio.process_data import colmap_utils, hloc_utils, process_data_utils
-from nerfstudio.process_data.base_converter_to_nerfstudio_dataset import (
-    BaseConverterToNerfstudioDataset,
-)
+from nerfstudio.process_data.base_converter_to_nerfstudio_dataset import BaseConverterToNerfstudioDataset
 from nerfstudio.process_data.process_data_utils import CAMERA_MODELS
 from nerfstudio.utils import install_checks
 from nerfstudio.utils.rich_utils import CONSOLE
diff --git a/nerfstudio/process_data/images_to_nerfstudio_dataset.py b/nerfstudio/process_data/images_to_nerfstudio_dataset.py
index 3f47b70ac3..e6cf501bfb 100644
--- a/nerfstudio/process_data/images_to_nerfstudio_dataset.py
+++ b/nerfstudio/process_data/images_to_nerfstudio_dataset.py
@@ -18,9 +18,7 @@
 from typing import Optional
 
 from nerfstudio.process_data import equirect_utils, process_data_utils
-from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import (
-    ColmapConverterToNerfstudioDataset,
-)
+from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import ColmapConverterToNerfstudioDataset
 from nerfstudio.utils.rich_utils import CONSOLE
 
 
@@ -47,11 +45,15 @@ def main(self) -> None:
 
         # Generate planar projections if equirectangular
         if self.camera_type == "equirectangular":
+            if self.eval_data is not None:
+                raise ValueError("Cannot use eval_data with camera_type equirectangular.")
+
             pers_size = equirect_utils.compute_resolution_from_equirect(self.data, self.images_per_equirect)
             CONSOLE.log(f"Generating {self.images_per_equirect} {pers_size} sized images per equirectangular image")
             self.data = equirect_utils.generate_planar_projections_from_equirectangular(
                 self.data, pers_size, self.images_per_equirect, crop_factor=self.crop_factor
             )
+
             self.camera_type = "perspective"
 
         summary_log = []
@@ -63,10 +65,25 @@ def main(self) -> None:
                 self.data,
                 image_dir=self.image_dir,
                 crop_factor=self.crop_factor,
+                image_prefix="frame_train_" if self.eval_data is not None else "frame_",
                 verbose=self.verbose,
                 num_downscales=self.num_downscales,
                 same_dimensions=self.same_dimensions,
+                keep_image_dir=False,
             )
+            if self.eval_data is not None:
+                eval_image_rename_map_paths = process_data_utils.copy_images(
+                    self.eval_data,
+                    image_dir=self.image_dir,
+                    crop_factor=self.crop_factor,
+                    image_prefix="frame_eval_",
+                    verbose=self.verbose,
+                    num_downscales=self.num_downscales,
+                    same_dimensions=self.same_dimensions,
+                    keep_image_dir=True,
+                )
+                image_rename_map_paths.update(eval_image_rename_map_paths)
+
             image_rename_map = dict((a.name, b.name) for a, b in image_rename_map_paths.items())
             num_frames = len(image_rename_map)
             summary_log.append(f"Starting with {num_frames} images")
diff --git a/nerfstudio/process_data/process_data_utils.py b/nerfstudio/process_data/process_data_utils.py
index 6a10153e8d..6db731e4fb 100644
--- a/nerfstudio/process_data/process_data_utils.py
+++ b/nerfstudio/process_data/process_data_utils.py
@@ -15,17 +15,16 @@
 """Helper utils for processing data into the nerfstudio format."""
 
 import math
+import re
 import shutil
 import sys
-import re
 from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional, OrderedDict, Tuple, Union
 
 import cv2
-import numpy as np
-
 import imageio
+import numpy as np
 import rawpy
 
 from nerfstudio.utils.rich_utils import CONSOLE, status
@@ -114,6 +113,8 @@ def convert_video_to_images(
     num_downscales: int,
     crop_factor: Tuple[float, float, float, float] = (0.0, 0.0, 0.0, 0.0),
     verbose: bool = False,
+    image_prefix: str = "frame_",
+    keep_image_dir: bool = False,
 ) -> Tuple[List[str], int]:
     """Converts a video into a sequence of images.
 
@@ -124,10 +125,19 @@ def convert_video_to_images(
         num_downscales: Number of times to downscale the images. Downscales by 2 each time.
         crop_factor: Portion of the image to crop. Should be in [0,1] (top, bottom, left, right)
         verbose: If True, logs the output of the command.
+        image_prefix: Prefix to use for the image filenames.
+        keep_image_dir: If True, don't delete the output directory if it already exists.
     Returns:
         A tuple containing summary of the conversion and the number of extracted frames.
     """
 
+    # If keep_image_dir is False, then remove the output image directory and its downscaled versions
+    if not keep_image_dir:
+        for i in range(num_downscales + 1):
+            dir_to_remove = image_dir if i == 0 else f"{image_dir}_{2**i}"
+            shutil.rmtree(dir_to_remove, ignore_errors=True)
+    image_dir.mkdir(exist_ok=True, parents=True)
+
     for i in crop_factor:
         if i < 0 or i > 1:
             CONSOLE.print("[bold red]Error: Invalid crop factor. All crops must be in [0,1].")
@@ -141,12 +151,6 @@ def convert_video_to_images(
         sys.exit(1)
 
     with status(msg="Converting video to images...", spinner="bouncingBall", verbose=verbose):
-        # delete existing images in folder
-        for img in image_dir.glob("*.png"):
-            if verbose:
-                CONSOLE.log(f"Deleting {img}")
-            img.unlink()
-
         num_frames = get_num_frames_in_video(video_path)
         if num_frames == 0:
             CONSOLE.print(f"[bold red]Error: Video has no frames: {video_path}")
@@ -168,7 +172,7 @@ def convert_video_to_images(
 
         downscale_chains = [f"[t{i}]scale=iw/{2**i}:ih/{2**i}[out{i}]" for i in range(num_downscales + 1)]
         downscale_dirs = [Path(str(image_dir) + (f"_{2**i}" if i > 0 else "")) for i in range(num_downscales + 1)]
-        downscale_paths = [downscale_dirs[i] / "frame_%05d.png" for i in range(num_downscales + 1)]
+        downscale_paths = [downscale_dirs[i] / f"{image_prefix}%05d.png" for i in range(num_downscales + 1)]
 
         for dir in downscale_dirs:
             dir.mkdir(parents=True, exist_ok=True)
@@ -200,7 +204,7 @@ def convert_video_to_images(
         num_final_frames = len(list(image_dir.glob("*.png")))
         summary_log = []
         summary_log.append(f"Starting with {num_frames} video frames")
-        summary_log.append(f"We extracted {num_final_frames} images")
+        summary_log.append(f"We extracted {num_final_frames} images with prefix '{image_prefix}'")
         CONSOLE.log("[bold green]:tada: Done converting video to images.")
 
         return summary_log, num_final_frames
@@ -210,9 +214,11 @@ def copy_images_list(
     image_paths: List[Path],
     image_dir: Path,
     num_downscales: int,
+    image_prefix: str = "frame_",
     crop_border_pixels: Optional[int] = None,
     crop_factor: Tuple[float, float, float, float] = (0.0, 0.0, 0.0, 0.0),
     verbose: bool = False,
+    keep_image_dir: bool = False,
     upscale_factor: Optional[int] = None,
     nearest_neighbor: bool = False,
     same_dimensions: bool = True,
@@ -222,19 +228,24 @@ def copy_images_list(
         image_paths: List of Paths of images to copy to a new directory.
         image_dir: Path to the output directory.
         num_downscales: Number of times to downscale the images. Downscales by 2 each time.
+        image_prefix: Prefix for the image filenames.
         crop_border_pixels: If not None, crops each edge by the specified number of pixels.
         crop_factor: Portion of the image to crop. Should be in [0,1] (top, bottom, left, right)
         verbose: If True, print extra logging.
+        keep_image_dir: If True, don't delete the output directory if it already exists.
     Returns:
         A list of the copied image Paths.
     """
 
-    # Remove original directory only if we provide a proper image folder path
-    if image_dir.is_dir() and len(image_paths):
+    # Remove original directory and its downscaled versions
+    # only if we provide a proper image folder path and keep_image_dir is False
+    if image_dir.is_dir() and len(image_paths) and not keep_image_dir:
         # check that output directory is not the same as input directory
         if image_dir != image_paths[0].parent:
-            shutil.rmtree(image_dir, ignore_errors=True)
-        image_dir.mkdir(exist_ok=True, parents=True)
+            for i in range(num_downscales + 1):
+                dir_to_remove = image_dir if i == 0 else f"{image_dir}_{2**i}"
+                shutil.rmtree(dir_to_remove, ignore_errors=True)
+    image_dir.mkdir(exist_ok=True, parents=True)
 
     copied_image_paths = []
 
@@ -242,11 +253,11 @@ def copy_images_list(
     for idx, image_path in enumerate(image_paths):
         if verbose:
             CONSOLE.log(f"Copying image {idx + 1} of {len(image_paths)}...")
-        copied_image_path = image_dir / f"frame_{idx + 1:05d}{image_path.suffix}"
+        copied_image_path = image_dir / f"{image_prefix}{idx + 1:05d}{image_path.suffix}"
         try:
             # if CR2 raw, we want to read raw and write RAW_CONVERTED_SUFFIX, and change the file suffix for downstream processing
             if image_path.suffix.lower() in ALLOWED_RAW_EXTS:
-                copied_image_path = image_dir / f"frame_{idx + 1:05d}{RAW_CONVERTED_SUFFIX}"
+                copied_image_path = image_dir / f"{image_prefix}{idx + 1:05d}{RAW_CONVERTED_SUFFIX}"
                 with rawpy.imread(str(image_path)) as raw:
                     rgb = raw.postprocess()
                 imageio.imsave(copied_image_path, rgb)
@@ -283,7 +294,7 @@ def copy_images_list(
     # When this is not the case (e.g. mixed portrait and landscape images), we need to do individually.
     # (Unfortunately, that is much slower.)
     for framenum in range(1, (1 if same_dimensions else num_frames) + 1):
-        framename = "frame_%05d" if same_dimensions else f"frame_{framenum:05d}"
+        framename = f"{image_prefix}%05d" if same_dimensions else f"{image_prefix}{framenum:05d}"
         ffmpeg_cmd = f'ffmpeg -y -noautorotate -i "{image_dir / f"{framename}{copied_image_paths[0].suffix}"}" -q:v 2 '
 
         crop_cmd = ""
@@ -315,7 +326,7 @@ def copy_images_list(
     if num_frames == 0:
         CONSOLE.log("[bold red]:skull: No usable images in the data folder.")
     else:
-        CONSOLE.log("[bold green]:tada: Done copying images.")
+        CONSOLE.log(f"[bold green]:tada: Done copying images with prefix '{image_prefix}'.")
 
     return copied_image_paths
 
@@ -363,7 +374,9 @@ def copy_and_upscale_polycam_depth_maps_list(
 def copy_images(
     data: Path,
     image_dir: Path,
+    image_prefix: str = "frame_",
     verbose: bool = False,
+    keep_image_dir: bool = False,
     crop_factor: Tuple[float, float, float, float] = (0.0, 0.0, 0.0, 0.0),
     num_downscales: int = 0,
     same_dimensions: bool = True,
@@ -373,8 +386,10 @@ def copy_images(
     Args:
         data: Path to the directory of images.
         image_dir: Path to the output directory.
+        image_prefix: Prefix for the image filenames.
         verbose: If True, print extra logging.
         crop_factor: Portion of the image to crop. Should be in [0,1] (top, bottom, left, right)
+        keep_image_dir: If True, don't delete the output directory if it already exists.
     Returns:
         The mapping from the original filenames to the new ones.
     """
@@ -390,6 +405,8 @@ def copy_images(
             image_dir=image_dir,
             crop_factor=crop_factor,
             verbose=verbose,
+            image_prefix=image_prefix,
+            keep_image_dir=keep_image_dir,
             num_downscales=num_downscales,
             same_dimensions=same_dimensions,
         )
@@ -406,8 +423,6 @@ def downscale_images(
     """(Now deprecated; much faster integrated into copy_images.)
     Downscales the images in the directory. Uses FFMPEG.
 
-    Assumes images are named frame_00001.png, frame_00002.png, etc.
-
     Args:
         image_dir: Path to the directory containing the images.
         num_downscales: Number of times to downscale the images. Downscales by 2 each time.
diff --git a/nerfstudio/process_data/video_to_nerfstudio_dataset.py b/nerfstudio/process_data/video_to_nerfstudio_dataset.py
index b1eec97eab..928752e432 100644
--- a/nerfstudio/process_data/video_to_nerfstudio_dataset.py
+++ b/nerfstudio/process_data/video_to_nerfstudio_dataset.py
@@ -18,9 +18,7 @@
 from dataclasses import dataclass
 
 from nerfstudio.process_data import equirect_utils, process_data_utils
-from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import (
-    ColmapConverterToNerfstudioDataset,
-)
+from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import ColmapConverterToNerfstudioDataset
 from nerfstudio.utils.rich_utils import CONSOLE
 
 
@@ -35,7 +33,7 @@ class VideoToNerfstudioDataset(ColmapConverterToNerfstudioDataset):
     """
 
     num_frames_target: int = 300
-    """Target number of frames to use for the dataset, results may not be exact."""
+    """Target number of frames to use per video, results may not be exact."""
     percent_radius_crop: float = 1.0
     """Create circle crop mask. The radius is the percent of the image diagonal."""
 
@@ -43,6 +41,7 @@ def main(self) -> None:
         """Process video into a nerfstudio dataset."""
 
         summary_log = []
+        summary_log_eval = []
         # Convert video to images
         if self.camera_type == "equirectangular":
             # create temp images folder to store the equirect and perspective images
@@ -65,10 +64,28 @@ def main(self) -> None:
                 num_downscales=self.num_downscales,
                 crop_factor=self.crop_factor,
                 verbose=self.verbose,
+                image_prefix="frame_train_" if self.eval_data is not None else "frame_",
+                keep_image_dir=False,
             )
+            if self.eval_data is not None:
+                summary_log_eval, num_extracted_frames_eval = process_data_utils.convert_video_to_images(
+                    self.eval_data,
+                    image_dir=self.image_dir,
+                    num_frames_target=self.num_frames_target,
+                    num_downscales=self.num_downscales,
+                    crop_factor=self.crop_factor,
+                    verbose=self.verbose,
+                    image_prefix="frame_eval_",
+                    keep_image_dir=True,
+                )
+                summary_log += summary_log_eval
+                num_extracted_frames += num_extracted_frames_eval
 
         # Generate planar projections if equirectangular
         if self.camera_type == "equirectangular":
+            if self.eval_data is not None:
+                raise ValueError("Cannot use eval_data with camera_type equirectangular.")
+
             perspective_image_size = equirect_utils.compute_resolution_from_equirect(
                 self.output_dir / "temp_images", self.images_per_equirect
             )
@@ -97,7 +114,7 @@ def main(self) -> None:
                 process_data_utils.downscale_images(self.image_dir, self.num_downscales, verbose=self.verbose)
             )
 
-        # # Create mask
+        # Create mask
         mask_path = process_data_utils.save_mask(
             image_dir=self.image_dir,
             num_downscales=self.num_downscales,
diff --git a/nerfstudio/scripts/process_data.py b/nerfstudio/scripts/process_data.py
index 201eb94fc9..8b4151f757 100644
--- a/nerfstudio/scripts/process_data.py
+++ b/nerfstudio/scripts/process_data.py
@@ -33,12 +33,8 @@
     realitycapture_utils,
     record3d_utils,
 )
-from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import (
-    BaseConverterToNerfstudioDataset,
-)
-from nerfstudio.process_data.images_to_nerfstudio_dataset import (
-    ImagesToNerfstudioDataset,
-)
+from nerfstudio.process_data.colmap_converter_to_nerfstudio_dataset import BaseConverterToNerfstudioDataset
+from nerfstudio.process_data.images_to_nerfstudio_dataset import ImagesToNerfstudioDataset
 from nerfstudio.process_data.video_to_nerfstudio_dataset import VideoToNerfstudioDataset
 from nerfstudio.utils.rich_utils import CONSOLE
 
@@ -253,6 +249,8 @@ def main(self) -> None:
             raise ValueError(f"XML file {self.xml} must have a .xml extension")
         if not self.xml.exists:
             raise ValueError(f"XML file {self.xml} doesn't exist")
+        if self.eval_data is not None:
+            raise ValueError("Cannot use eval_data since cameras were already aligned with Metashape.")
 
         self.output_dir.mkdir(parents=True, exist_ok=True)
         image_dir = self.output_dir / "images"
@@ -338,6 +336,8 @@ def main(self) -> None:
             raise ValueError(f"CSV file {self.csv} must have a .csv extension")
         if not self.csv.exists:
             raise ValueError(f"CSV file {self.csv} doesn't exist")
+        if self.eval_data is not None:
+            raise ValueError("Cannot use eval_data since cameras were already aligned with RealityCapture.")
 
         self.output_dir.mkdir(parents=True, exist_ok=True)
         image_dir = self.output_dir / "images"
diff --git a/nerfstudio/scripts/viewer/run_viewer.py b/nerfstudio/scripts/viewer/run_viewer.py
index c4ebefacb7..56a712026f 100644
--- a/nerfstudio/scripts/viewer/run_viewer.py
+++ b/nerfstudio/scripts/viewer/run_viewer.py
@@ -21,7 +21,6 @@
 import time
 from dataclasses import dataclass, field, fields
 from pathlib import Path
-
 from typing import Literal
 
 import tyro
@@ -113,8 +112,9 @@ def _start_viewer(config: TrainerConfig, pipeline: Pipeline, step: int):
 
     assert viewer_state and pipeline.datamanager.train_dataset
     viewer_state.init_scene(
-        dataset=pipeline.datamanager.train_dataset,
+        train_dataset=pipeline.datamanager.train_dataset,
         train_state="completed",
+        eval_dataset=pipeline.datamanager.eval_dataset,
     )
     if isinstance(viewer_state, ViewerState):
         viewer_state.viser_server.set_training_state("completed")
diff --git a/nerfstudio/viewer/server/viewer_state.py b/nerfstudio/viewer/server/viewer_state.py
index 0e948d2ccc..ad80822ce8 100644
--- a/nerfstudio/viewer/server/viewer_state.py
+++ b/nerfstudio/viewer/server/viewer_state.py
@@ -38,10 +38,7 @@
 from nerfstudio.viewer.server import viewer_utils
 from nerfstudio.viewer.server.control_panel import ControlPanel
 from nerfstudio.viewer.server.gui_utils import parse_object
-from nerfstudio.viewer.server.render_state_machine import (
-    RenderAction,
-    RenderStateMachine,
-)
+from nerfstudio.viewer.server.render_state_machine import RenderAction, RenderStateMachine
 from nerfstudio.viewer.server.utils import get_intrinsics_matrix_and_camera_to_world_h
 from nerfstudio.viewer.server.viewer_elements import ViewerControl, ViewerElement
 from nerfstudio.viewer.viser import ViserServer
@@ -331,7 +328,7 @@ def get_camera(self, image_height: int, image_width: int) -> Optional[Cameras]:
         camera = camera.to(self.get_model().device)
         return camera
 
-    def _pick_drawn_image_idxs(self, total_num: int) -> list[int]:
+    def _pick_drawn_image_idxs(self, total_num: int) -> np.ndarray:
         """Determine indices of images to display in viewer.
 
         Args:
@@ -345,9 +342,14 @@ def _pick_drawn_image_idxs(self, total_num: int) -> list[int]:
         else:
             num_display_images = min(self.config.max_num_display_images, total_num)
         # draw indices, roughly evenly spaced
-        return np.linspace(0, total_num - 1, num_display_images, dtype=np.int32).tolist()
+        return np.linspace(0, total_num - 1, num_display_images, dtype=np.int32)
 
-    def init_scene(self, dataset: InputDataset, train_state: Literal["training", "paused", "completed"]) -> None:
+    def init_scene(
+        self,
+        train_dataset: InputDataset,
+        train_state: Literal["training", "paused", "completed"],
+        eval_dataset: Optional[InputDataset] = None,
+    ) -> None:
         """Draw some images and the scene aabb in the viewer.
 
         Args:
@@ -360,16 +362,39 @@ def init_scene(self, dataset: InputDataset, train_state: Literal["training", "pa
             export_path_name=self.log_filename.parent.stem,
         )
 
+        # total number of images (train + eval), used to pick evenly spaced indices to display
+        num_images = len(train_dataset)
+        if eval_dataset is not None:
+            num_images += len(eval_dataset)
+
         # draw the training cameras and images
-        image_indices = self._pick_drawn_image_idxs(len(dataset))
-        for idx in image_indices:
-            image = dataset[idx]["image"]
+        image_indices = self._pick_drawn_image_idxs(num_images)
+        for idx in image_indices[image_indices < len(train_dataset)].tolist():
+            image = train_dataset[idx]["image"]
             bgr = image[..., [2, 1, 0]]
-            camera_json = dataset.cameras.to_json(camera_idx=idx, image=bgr, max_size=100)
+            camera_json = train_dataset.cameras.to_json(camera_idx=idx, image=bgr, max_size=100)
             self.viser_server.add_dataset_image(idx=f"{idx:06d}", json=camera_json)
 
+        # draw the eval cameras and images
+        if eval_dataset is not None:
+            image_indices = image_indices[image_indices >= len(train_dataset)] - len(train_dataset)
+            for idx in image_indices.tolist():
+                image = eval_dataset[idx]["image"]
+                bgr = image[..., [2, 1, 0]]
+                # color the eval image borders red; the image is in BGR order here, so red is (0, 0, 1)
+                # TODO: color the threejs frustum instead of changing the image itself like we are doing here
+                t = int(min(image.shape[:2]) * 0.1)  # border thickness as 10% of min height or width resolution
+                bc = torch.tensor((0, 0, 1.0))
+                bgr[:t, :, :] = bc
+                bgr[-t:, :, :] = bc
+                bgr[:, -t:, :] = bc
+                bgr[:, :t, :] = bc
+
+                camera_json = eval_dataset.cameras.to_json(camera_idx=idx, image=bgr, max_size=100)
+                self.viser_server.add_dataset_image(idx=f"{idx+len(train_dataset):06d}", json=camera_json)
+
         # draw the scene box (i.e., the bounding box)
-        self.viser_server.update_scene_box(dataset.scene_box)
+        self.viser_server.update_scene_box(train_dataset.scene_box)
 
         # set the initial state whether to train or not
         self.train_btn_state = train_state
diff --git a/nerfstudio/viewer_beta/viewer.py b/nerfstudio/viewer_beta/viewer.py
index 4c007f082f..a5271571dd 100644
--- a/nerfstudio/viewer_beta/viewer.py
+++ b/nerfstudio/viewer_beta/viewer.py
@@ -169,7 +169,12 @@ def _pick_drawn_image_idxs(self, total_num: int) -> list[int]:
         # draw indices, roughly evenly spaced
         return np.linspace(0, total_num - 1, num_display_images, dtype=np.int32).tolist()
 
-    def init_scene(self, dataset: InputDataset, train_state: Literal["training", "paused", "completed"]) -> None:
+    def init_scene(
+        self,
+        train_dataset: InputDataset,
+        train_state: Literal["training", "paused", "completed"],
+        eval_dataset: Optional[InputDataset] = None,
+    ) -> None:
         """Draw some images and the scene aabb in the viewer.
 
         Args:
@@ -178,10 +183,10 @@ def init_scene(self, dataset: InputDataset, train_state: Literal["training", "pa
         """
 
         # draw the training cameras and images
-        image_indices = self._pick_drawn_image_idxs(len(dataset))
+        image_indices = self._pick_drawn_image_idxs(len(train_dataset))
         for idx in image_indices:
-            image = dataset[idx]["image"]
-            camera = dataset.cameras[idx]
+            image = train_dataset[idx]["image"]
+            camera = train_dataset.cameras[idx]
             image_uint8 = (image * 255).detach().type(torch.uint8)
             image_uint8 = image_uint8.permute(2, 0, 1)
             image_uint8 = torchvision.transforms.functional.resize(image_uint8, 100)  # type: ignore
diff --git a/tests/process_data/test_process_images.py b/tests/process_data/test_process_images.py
index f143f2b1fa..a3d0323551 100644
--- a/tests/process_data/test_process_images.py
+++ b/tests/process_data/test_process_images.py
@@ -11,14 +11,8 @@
 from nerfstudio.data.dataparsers.nerfstudio_dataparser import NerfstudioDataParserConfig
 from nerfstudio.data.utils.colmap_parsing_utils import Camera
 from nerfstudio.data.utils.colmap_parsing_utils import Image as ColmapImage
-from nerfstudio.data.utils.colmap_parsing_utils import (
-    qvec2rotmat,
-    write_cameras_binary,
-    write_images_binary,
-)
-from nerfstudio.process_data.images_to_nerfstudio_dataset import (
-    ImagesToNerfstudioDataset,
-)
+from nerfstudio.data.utils.colmap_parsing_utils import qvec2rotmat, write_cameras_binary, write_images_binary
+from nerfstudio.process_data.images_to_nerfstudio_dataset import ImagesToNerfstudioDataset
 
 
 def random_quaternion(num_poses: int):