diff --git a/README.md b/README.md
index 82ae8fe..305482c 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@ Pytorch implementation of the CVPR 2021 oral paper: SliceNet: deep dense depth e
 
 Paper at: https://openaccess.thecvf.com/content/CVPR2021/papers/Pintore_SliceNet_Deep_Dense_Depth_Estimation_From_a_Single_Indoor_Panorama_CVPR_2021_paper.pdf
 
-![](assets/intro.png)
 
 We present a novel deep neural network to estimate a depth map from a single monocular indoor panorama, called SliceNet. The network directly works on the equirectangular projection, exploiting the properties of indoor 360 images.
 
@@ -18,7 +17,9 @@ This repo is a **python** implementation where you can test **depth inference**
 ![](assets/overview.png)
 
 ## Updates
-* 2020-07-21: Network source code and demo released
+* 2021-08-13: IMPORTANT: Fixed bug in weights init; model and pre-trained weights updated
+    - REPLACE PREVIOUS MODEL AND WEIGHTS
+* 2021-07-21: Network source code and demo released
 
 ## Requirements
 - Python >=3.6
@@ -36,12 +37,13 @@ Copy to your local ./ckpt directory.
     - As in previous comparable works, we resize the equirectangular image and depth map to 512 × 1024.
    - Stitching the original 18-pose format to equirectangular has been performed following the official procedure from https://github.com/niessner/Matterport/blob/master/data_organization.md, based on the methods/tools of Zhang: https://github.com/yindaz/PanoBasic.
-- [resnet50_stanford.pth](https://vicserver.crs4.it/slicenet/resnet50_stanford.pth)
+- [resnet50_stanford.pth]
     - Trained with ResNet50 using the Stanford-2D-3D-S dataset: http://buildingparser.stanford.edu/dataset.html.
     - As in previous comparable works, we adopt the official Fold#1 split, taking the fifth area (area 5) for testing and the others for training.
     - As in previous comparable works, we resize the equirectangular image and depth map to 512 × 1024.
     - Invalid measures are masked as 0.
-- [resnet50_s3d.pth](https://vicserver.crs4.it/slicenet/resnet50_s3d.pth)
+    - COMING SOON
+- [resnet50_s3d.pth]
     - Trained with ResNet50 using the Structured3D dataset and its official split.
 - [resnet50_360D.pth] - COMING SOON
diff --git a/assets/intro.png b/assets/intro.png
deleted file mode 100644
index 14996e1..0000000
Binary files a/assets/intro.png and /dev/null differ
diff --git a/slice_model.py b/slice_model.py
index 0b16d9d..3229da9 100644
--- a/slice_model.py
+++ b/slice_model.py
@@ -2,10 +2,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
 import torchvision.models as models
-
 import functools
+import time
 
 ENCODER_RESNET = [
     'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
@@ -16,7 +15,6 @@ def lr_pad(x, padding=1):
     ''' Pad left/right-most to each other instead of zero padding '''
     return torch.cat([x[..., -padding:], x, x[..., :padding]], dim=3)
 
-
 class LR_PAD(nn.Module):
     ''' Pad left/right-most to each other instead of zero padding '''
     def __init__(self, padding=1):
@@ -25,7 +23,6 @@ def __init__(self, padding=1):
     def forward(self, x):
         return lr_pad(x, self.padding)
 
-
 def wrap_lr_pad(net):
     for name, m in net.named_modules():
         if not isinstance(m, nn.Conv2d):
@@ -108,8 +105,11 @@ def forward(self, x, out_w):
         #####HorizonNet-style upsampling
         x = torch.cat([x[..., -1:], x, x[..., :1]], 3)  ## plus 2 on W
-        x = F.interpolate(x, size=(x.shape[2], out_w + 2 * factor), mode='bilinear', align_corners=False)
+        x = F.interpolate(x, size=(x.shape[2], out_w + 2 * factor), mode='bilinear', align_corners=False)  ####NB interpolating only W
         x = x[..., factor:-factor]  ##minus 2 on W
+
+        ##SIMPLEST
+        ##x = F.interpolate(x, size=(x.shape[2], out_w), mode='bilinear', align_corners=False)
 
         return x
@@ -150,10 +150,10 @@ def __init__(self, backbone, full_size = False):
         self.full_size = full_size
 
-        self.out_w_size = 512
+        ##self.out_w_size = 512
 
-        if(self.full_size):
-            self.out_w_size = 1024
+        ##if(self.full_size):
+            ##self.out_w_size = 1024
 
         self.feature_extractor = Resnet(backbone, pretrained=True)
@@ -165,7 +165,9 @@ def __init__(self, backbone, full_size = False):
         if(self.full_size):
             c_last *= 2
-
+
+        ##print('c_last',c_last)
+
         self.slicing_module = MultiSlicing(c1, c2, c3, c4, self.ch_scale)
 
         self.bi_rnn = nn.LSTM(input_size=c_last,
@@ -204,7 +206,7 @@ def __init__(self, backbone, full_size = False):
         ''' Pad left/right-most to each other instead of zero padding '''
         wrap_lr_pad(self)
 
-        self.apply(xavier_init)
+        ##self.apply(xavier_init)
 
     def _prepare_x(self, x):
         if self.x_mean.device != x.device:
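
For readers skimming the padding code above: `lr_pad` keeps the horizontal wrap-around of equirectangular panoramas, whose left and right edges are physically adjacent, by copying columns across the seam instead of zero-padding. A minimal, self-contained sketch of its behavior (the input values are arbitrary, chosen only to make the wrap visible):

```python
import torch

def lr_pad(x, padding=1):
    ''' Pad left/right-most to each other instead of zero padding '''
    return torch.cat([x[..., -padding:], x, x[..., :padding]], dim=3)

x = torch.arange(8.0).view(1, 1, 2, 4)  # N, C, H, W
y = lr_pad(x, padding=1)                # W grows from 4 to 6
print(y[0, 0])
# tensor([[3., 0., 1., 2., 3., 0.],
#         [7., 4., 5., 6., 7., 4.]])
```

`wrap_lr_pad` then walks the network and installs this padding in front of every `nn.Conv2d`, taking over the convolution's horizontal zero padding.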
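The HorizonNet-style upsampling retained in `forward` wraps one column from each side, bilinearly interpolates only the width (the height is passed through unchanged), and then crops `factor` columns per side, so the interpolation sees the panorama seam as continuous; the commented `##SIMPLEST` variant would interpolate straight to `out_w` without the wrap. A sketch under the assumption that `factor` is the integer width-upsampling ratio (the standalone helper `up_w` is named here for illustration only):

```python
import torch
import torch.nn.functional as F

def up_w(x, out_w):
    factor = out_w // x.shape[3]                    # assumed: integer W ratio
    x = torch.cat([x[..., -1:], x, x[..., :1]], 3)  # wrap: plus 2 on W
    x = F.interpolate(x, size=(x.shape[2], out_w + 2 * factor),
                      mode='bilinear', align_corners=False)  # only W grows
    return x[..., factor:-factor]                   # crop the wrapped margin

x = torch.randn(1, 8, 16, 32)
print(up_w(x, 128).shape)  # torch.Size([1, 8, 16, 128])
```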
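On the headline change: `nn.Module.apply` runs a function on every submodule, so calling `self.apply(xavier_init)` after construction would also re-initialize the ImageNet-pretrained ResNet encoder, which is consistent with the 2021-08-13 "fixed bug in weights init" note in the README. A hypothetical reproduction of the hazard (`xavier_init` itself is not shown in the diff; this body mirrors the common pattern):

```python
import torch
import torch.nn as nn
import torchvision.models as models

def xavier_init(m):  # assumed shape of the original helper
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.xavier_normal_(m.weight)

encoder = models.resnet50(pretrained=True)
w_before = encoder.conv1.weight.clone()
encoder.apply(xavier_init)  # blanket init clobbers the pretrained weights
print(torch.equal(w_before, encoder.conv1.weight))  # False
```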