Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Replicate demo and API #39

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ This software project accompanies the research paper:
**[Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073)**,
*Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, and Vladlen Koltun*.

[![Replicate](https://replicate.com/chenxwh/ml-depth-pro/badge)](https://replicate.com/chenxwh/ml-depth-pro)

![](data/depth-pro-teaser.jpg)

We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image.
Expand Down
31 changes: 31 additions & 0 deletions cog.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  # (predict.py creates the model on device "cuda:0", so this must stay true)
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.9"

  # a list of packages in the format <package-name>==<version>
  # NOTE(review): only numpy carries a version constraint here; the other
  # packages are unpinned, so two builds may resolve different versions.
  # Consider pinning once known-good versions are confirmed.
  python_packages:
    - torch
    - torchvision
    - timm
    - numpy<2
    - pillow_heif
    - matplotlib

  # commands run after the environment is set up
  # pget is the binary that predict.py's download_weights() invokes to fetch
  # the model weights archive.
  run:
    - pip install ipython
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.8.2/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

# predict.py defines how predictions are run on your model
predict: "predict.py:Predictor"
110 changes: 110 additions & 0 deletions predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import subprocess
import time
import numpy as np
import PIL.Image
import torch
from matplotlib import pyplot as plt
from cog import BasePredictor, Input, Path, BaseModel

from src.depth_pro import create_model_and_transforms, load_rgb
from src.depth_pro.depth_pro import DepthProConfig


# Local directory that holds the model weights. The same name is used for the
# remote tarball, so the URL is derived from it.
MODEL_CACHE = "checkpoints"
MODEL_URL = f"https://weights.replicate.delivery/default/apple/ml-depth-pro/{MODEL_CACHE}.tar"

# Offline switches, set to "1" (presumably so the Hugging Face libraries never
# reach out to the network at prediction time -- confirm against their docs).
os.environ.update(
    dict.fromkeys(("HF_DATASETS_OFFLINE", "TRANSFORMERS_OFFLINE"), "1")
)

# Every cache-location variable points at the weights directory.
os.environ.update(
    dict.fromkeys(
        (
            "HF_HOME",
            "TORCH_HOME",
            "HF_DATASETS_CACHE",
            "TRANSFORMERS_CACHE",
            "HUGGINGFACE_HUB_CACHE",
        ),
        MODEL_CACHE,
    )
)


class ModelOutput(BaseModel):
    # Result object returned by Predictor.predict.
    # (Documented with `#` comments rather than a docstring so the model's
    # generated schema is left untouched.)

    # Compressed NumPy archive holding the depth array under the key "depth".
    npz: Path
    # JPEG image of the normalized inverse depth rendered with the "turbo" colormap.
    color_map: Path


def download_weights(url, dest):
    """Run ``pget -x url dest`` and print how long the call took.

    Args:
        url: Location of the archive to fetch.
        dest: Destination path handed to ``pget``.

    Raises:
        subprocess.CalledProcessError: If ``pget`` exits with a non-zero status.
    """
    began_at = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)

    pget_command = ["pget", "-x", url, dest]
    subprocess.check_call(pget_command, close_fds=False)

    elapsed = time.time() - began_at
    print("downloading took: ", elapsed)


class Predictor(BasePredictor):
    """Cog predictor that runs the Depth Pro model on a single input image."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        # Fetch the weights only when the cache directory is not already present.
        if not os.path.exists(MODEL_CACHE):
            download_weights(MODEL_URL, MODEL_CACHE)

        self.model, self.transform = create_model_and_transforms(
            config=DepthProConfig(
                patch_encoder_preset="dinov2l16_384",
                image_encoder_preset="dinov2l16_384",
                checkpoint_uri=f"./{MODEL_CACHE}/depth_pro.pt",
                decoder_features=256,
                use_fov_head=True,
                fov_encoder_preset="dinov2l16_384",
            ),
            device=torch.device("cuda:0"),
            precision=torch.half,
        )
        self.model.eval()

    def predict(
        self,
        image_path: Path = Input(description="Input image"),
    ) -> ModelOutput:
        """Run a single prediction on the model

        Args:
            image_path: Path of the image to estimate depth for.

        Returns:
            A ``ModelOutput`` whose ``npz`` points at ``/tmp/out.npz`` (the
            depth array stored under the key ``depth``) and whose
            ``color_map`` points at ``/tmp/out.jpg`` (a "turbo" rendering of
            the normalized inverse depth). Both paths are fixed, so each call
            overwrites the files written by the previous one.
        """

        image, _, f_px = load_rgb(image_path)

        # Run prediction. If `f_px` is provided, it is passed to the model and
        # used for the final metric depth; otherwise the model estimates the
        # focal length itself.
        prediction = self.model.infer(self.transform(image), f_px=f_px)

        # Extract the depth and focal length.
        depth = prediction["depth"].detach().cpu().numpy().squeeze()
        if f_px is not None:
            print(f"Focal length (from exif): {f_px:0.2f}")
        elif prediction["focallength_px"] is not None:
            focallength_px = prediction["focallength_px"].detach().cpu().item()
            print(f"Estimated focal length: {focallength_px}")

        inverse_depth = 1 / depth
        # Visualize inverse depth instead of depth, clipped to [0.1m;250m] range
        # for better visualization.
        max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
        min_invdepth_vizu = max(1 / 250, inverse_depth.min())
        invdepth_span = max_invdepth_vizu - min_invdepth_vizu
        if invdepth_span > 0:
            # Clip explicitly so the [0, 1] range promised above does not rely
            # on how the colormap treats out-of-range values.
            inverse_depth_normalized = np.clip(
                (inverse_depth - min_invdepth_vizu) / invdepth_span, 0.0, 1.0
            )
        else:
            # Degenerate range (constant depth, or the whole scene beyond
            # 250m): dividing would yield NaN or an inverted scale, so render
            # a uniform map at the far end of the colormap instead.
            inverse_depth_normalized = np.zeros_like(inverse_depth)

        # Save Depth as npz file.
        out_npz = "/tmp/out.npz"
        np.savez_compressed(out_npz, depth=depth)

        # Save as color-mapped "turbo" jpg image.
        cmap = plt.get_cmap("turbo")
        color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(np.uint8)
        out_color_map = "/tmp/out.jpg"
        PIL.Image.fromarray(color_depth).save(out_color_map, format="JPEG", quality=90)

        return ModelOutput(npz=Path(out_npz), color_map=Path(out_color_map))