TRI-ML · stellarpower · Aug 13, 2021 · Aug 25, 2021 · Aug 25, 2021
diff --git a/Makefile b/Makefile
@@ -1,6 +1,7 @@
 # Handy commands:
-# - `make docker-build`: builds DOCKERIMAGE (default: `packnet-sfm:latest`)
+# - `make docker-build`: builds DOCKER_IMAGE (default: `stellarpower/packnet-sfm:latest`)
+# PROJECT stays un-namespaced because WORKSPACE is derived from it; only the image tag carries the registry namespace.
 PROJECT ?= packnet-sfm
 WORKSPACE ?= /workspace/$(PROJECT)
-DOCKER_IMAGE ?= ${PROJECT}:latest
+DOCKER_IMAGE ?= stellarpower/${PROJECT}:latest
 
@@ -79,4 +79,4 @@ docker-run: docker-build
 
 docker-run-mpi: docker-build
 	nvidia-docker run ${DOCKER_OPTS} ${DOCKER_IMAGE} \
-		bash -c "${MPI_CMD} ${COMMAND}"
+		bash -c "${MPI_CMD} ${COMMAND}"
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -1,12 +1,48 @@
 # Copyright 2020 Toyota Research Institute.  All rights reserved.
 
-FROM nvidia/cuda:10.2-devel-ubuntu18.04
+# CUDA_VERSION is already defined by the base image, so these build args use a PSFM_ prefix to avoid clashing with it
+ARG PSFM_CUDA_MAJOR_VERSION=11
+ARG PSFM_CUDA_MINOR_VERSION=1
+ARG PSFM_CUDA_VERSION=${PSFM_CUDA_MAJOR_VERSION}.${PSFM_CUDA_MINOR_VERSION}
+
+ARG SOURCE_IMAGE=nvidia/cuda:${PSFM_CUDA_VERSION}-devel-ubuntu18.04
+########################################
+FROM $SOURCE_IMAGE as base
+# Re-declare the build args: ARGs defined before FROM are out of scope inside the build stage
+ARG PSFM_CUDA_MAJOR_VERSION
+ARG PSFM_CUDA_MINOR_VERSION
+ARG PSFM_CUDA_VERSION
+ARG SOURCE_IMAGE
 
 ENV PROJECT=packnet-sfm
-ENV PYTORCH_VERSION=1.8.1
-ENV TORCHVISION_VERSION=0.9.1
-ENV CUDNN_VERSION=7.6.5.32-1+cuda10.2
-ENV NCCL_VERSION=2.7.8-1+cuda10.2
+
+# https://lambdalabs.com/blog/install-tensorflow-and-pytorch-on-rtx-30-series/
+# 3090 => CUDA 11.1 OK
+#    => cuDNN 8.2.1 (NOTE(review): CUDNN_VERSION below pins 8.0.5.39 - confirm which is intended)
+
+# From https://github.com/pytorch/vision:
+# Torch 1.9.0 <=> Torchvision 0.10.0
+
+# From https://pytorch.org/get-started/locally/
+# Torch 1.9.0 => CUDA 11.1 || CUDA 10.2
+
+#ENV CUDNN_VERSION=7.6.5.32-1+cuda10.2
+#ENV NCCL_VERSION=2.7.8-1+cuda10.2
+
+ENV CUDNN_VERSION=8.0.5.39-1+cuda${PSFM_CUDA_VERSION} 
+# The CUDA version is embedded in the package pins, so the install fails fast if no matching cuDNN/NCCL build exists
+ARG CUDNN_PACKAGE=libcudnn8=${CUDNN_VERSION}
+ENV NCCL_VERSION=2.7.8-1+cuda${PSFM_CUDA_VERSION}
+ARG NCCL_PACKAGE=libnccl2=${NCCL_VERSION}
+
+ENV PYTORCH_VERSION=1.9.0
+ENV TORCHVISION_VERSION=0.10.0
+
+
+RUN echo "Using " ${SOURCE_IMAGE} " with CUDA version " ${PSFM_CUDA_VERSION} \
+    " and Pytorch/torchvision " ${PYTORCH_VERSION}/${TORCHVISION_VERSION} >&2
+
+
 ENV HOROVOD_VERSION=65de4c961d1e5ad2828f2f6c4329072834f27661
 ENV TRT_VERSION=6.0.1.5
 ENV LC_ALL=C.UTF-8
@@ -29,8 +65,8 @@ RUN apt-get update && apt-get install -y --allow-downgrades --allow-change-held-
     vim \
     wget \
     ca-certificates \
-    libcudnn7=${CUDNN_VERSION} \
-    libnccl2=${NCCL_VERSION} \
+    ${CUDNN_PACKAGE} \
+    ${NCCL_PACKAGE} \
     libnccl-dev=${NCCL_VERSION} \
     libjpeg-dev \
     libpng-dev \
@@ -85,8 +121,14 @@ RUN pip install future typing numpy pandas matplotlib jupyter h5py \
     mpi4py onnx onnxruntime pycuda yacs cython==0.29.10
 
 # Install PyTorch
-RUN pip install torch==${PYTORCH_VERSION} \
-    torchvision==${TORCHVISION_VERSION} && ldconfig
+#RUN pip install torch==${PYTORCH_VERSION} \
+#    torchvision==${TORCHVISION_VERSION} && ldconfig
+RUN pip3 install \
+    torch==${PYTORCH_VERSION}+cu${PSFM_CUDA_MAJOR_VERSION}${PSFM_CUDA_MINOR_VERSION} \
+    torchvision==${TORCHVISION_VERSION}+cu${PSFM_CUDA_MAJOR_VERSION}${PSFM_CUDA_MINOR_VERSION} \
+    -f https://download.pytorch.org/whl/torch_stable.html \
+    && ldconfig
+
 
 # Install apex
 RUN mkdir /workspace
@@ -141,4 +183,4 @@ WORKDIR /workspace/${PROJECT}
 # Copy project source last (to avoid cache busting)
 WORKDIR /workspace/${PROJECT}
 COPY . /workspace/${PROJECT}
-ENV PYTHONPATH="/workspace/${PROJECT}:$PYTHONPATH"
+ENV PYTHONPATH="/workspace/${PROJECT}:$PYTHONPATH"
diff --git a/packnet_sfm/models/model_wrapper.py b/packnet_sfm/models/model_wrapper.py
@@ -292,14 +292,14 @@ def evaluate_depth(self, batch):
         """Evaluate batch to produce depth metrics."""
         # Get predicted depth
         inv_depths = self.model(batch)['inv_depths']
-        depth = inv2depth(inv_depths[0])
+        depth = inv2depth(inv_depths)
         # Post-process predicted depth
         batch['rgb'] = flip_lr(batch['rgb'])
         if 'input_depth' in batch:
             batch['input_depth'] = flip_lr(batch['input_depth'])
         inv_depths_flipped = self.model(batch)['inv_depths']
         inv_depth_pp = post_process_inv_depth(
-            inv_depths[0], inv_depths_flipped[0], method='mean')
+            inv_depths, inv_depths_flipped, method='mean')
         depth_pp = inv2depth(inv_depth_pp)
         batch['rgb'] = flip_lr(batch['rgb'])
         # Calculate predicted metrics