From e212340e4519fd0dbccaceab67a1556ca206b503 Mon Sep 17 00:00:00 2001
From: LAURENDEAU Matthieu <matthieu.laurendeau@thalesgroup.com>
Date: Wed, 14 Feb 2024 09:02:15 +0100
Subject: [PATCH] BUG: Remove CudaContextManager class and use cudaSetDevice

Use the primary context with cudaSetDevice() introduced by Cuda 7 (https://developer.download.nvidia.com/compute/cuda/7_0/Prod/doc/CUDA_Toolkit_Release_Notes.pdf) instead of a new one. cudaSetDevice is called before every memory transfer between the CPU and GPU to be sure that the context is set for the current thread, see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs/.
---
 include/itkCudaContextManager.h     |  66 -----------------
 include/itkCudaDataManager.h        |   5 +-
 include/itkCudaImageDataManager.h   |   1 -
 include/itkCudaImageDataManager.hxx |   8 +-
 src/CMakeLists.txt                  |   1 -
 src/itkCudaContextManager.cxx       | 109 ----------------------------
 src/itkCudaDataManager.cxx          |  28 ++-----
 7 files changed, 9 insertions(+), 209 deletions(-)
 delete mode 100644 include/itkCudaContextManager.h
 delete mode 100644 src/itkCudaContextManager.cxx
diff --git a/include/itkCudaContextManager.h b/include/itkCudaContextManager.h
deleted file mode 100644
index fa3b7d3..0000000
--- a/include/itkCudaContextManager.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*=========================================================================
- *
- *  Copyright NumFOCUS
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *         https://www.apache.org/licenses/LICENSE-2.0.txt
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *=========================================================================*/
-#ifndef itkCudaContextManager_h
-#define itkCudaContextManager_h
-
-#include "itkCudaUtil.h"
-#include <itkLightObject.h>
-#include "CudaCommonExport.h"
-
-//
-// Singleton class for CudaContextManager
-//
-
-/** \class CudaContextManager
- *
- * \brief Class to store the Cuda context.
- *
- * \ingroup ITKCudaCommon
- */
-namespace itk
-{
-class CudaCommon_EXPORT CudaContextManager : public LightObject
-{
-public:
-  static CudaContextManager *
-  GetInstance();
-
-  static void
-  DestroyInstance();
-
-  CUcontext *
-  GetCurrentContext();
-
-  int
-  GetCurrentDevice();
-
-private:
-  CudaContextManager();
-  ~CudaContextManager() override;
-
-  CUcontext m_Context;
-  int       m_Device;
-  int       m_DeviceIdx;
-  int       m_NumberOfDevices;
-
-  static CudaContextManager * m_Instance;
-  static bool                 m_Initialized;
-};
-} // namespace itk
-
-#endif
diff --git a/include/itkCudaDataManager.h b/include/itkCudaDataManager.h
index e6f4649..c5cca65 100644
--- a/include/itkCudaDataManager.h
+++ b/include/itkCudaDataManager.h
@@ -22,7 +22,6 @@
 #include "itkDataObject.h"
 #include "itkObjectFactory.h"
 #include "itkCudaUtil.h"
-#include "itkCudaContextManager.h"
 #include "CudaCommonExport.h"
 
 #include <mutex>
@@ -232,6 +231,8 @@ class CudaCommon_EXPORT CudaDataManager : public Object
   void
   PrintSelf(std::ostream & os, Indent indent) const override;
 
+  int m_Device;
+
 private:
   CudaDataManager(const Self &) = delete; // purposely not implemented
   void
@@ -240,8 +241,6 @@ class CudaCommon_EXPORT CudaDataManager : public Object
 protected:
   size_t m_BufferSize; // # of bytes
 
-  CudaContextManager * m_ContextManager;
-
   /** buffer type */
   int m_MemFlags;
 
diff --git a/include/itkCudaImageDataManager.h b/include/itkCudaImageDataManager.h
index eb62557..c6c817f 100644
--- a/include/itkCudaImageDataManager.h
+++ b/include/itkCudaImageDataManager.h
@@ -23,7 +23,6 @@
 #include <itkObjectFactory.h>
 #include "itkCudaUtil.h"
 #include "itkCudaDataManager.h"
-#include "itkCudaContextManager.h"
 
 namespace itk
 {
diff --git a/include/itkCudaImageDataManager.hxx b/include/itkCudaImageDataManager.hxx
index 57fecec..670f39b 100644
--- a/include/itkCudaImageDataManager.hxx
+++ b/include/itkCudaImageDataManager.hxx
@@ -76,9 +76,7 @@ CudaImageDataManager<ImageType>::MakeCPUBufferUpToDate()
       std::cout << this << ": GPU->CPU data copy" << std::endl;
 #endif
 
-      CUDA_CHECK(cuCtxSetCurrent(
-        *(this->m_ContextManager->GetCurrentContext()))); // This is necessary when running multithread to bind the host
-                                                          // CPU thread to the right context
+      CUDA_CHECK(cudaSetDevice(m_Device));
       errid = cudaMemcpy(m_CPUBuffer, m_GPUBuffer->GetPointer(), m_BufferSize, cudaMemcpyDeviceToHost);
       CudaCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
 
@@ -117,9 +115,7 @@ CudaImageDataManager<ImageType>::MakeGPUBufferUpToDate()
       std::cout << "CPU->GPU data copy" << std::endl;
 #endif
 
-      CUDA_CHECK(cuCtxSetCurrent(
-        *(this->m_ContextManager->GetCurrentContext()))); // This is necessary when running multithread to bind the host
-                                                          // CPU thread to the right context
+      CUDA_CHECK(cudaSetDevice(m_Device));
       errid = cudaMemcpy(m_GPUBuffer->GetPointer(), m_CPUBuffer, m_BufferSize, cudaMemcpyHostToDevice);
       CudaCheckError(errid, __FILE__, __LINE__, ITK_LOCATION);
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b936a79..416ff0f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(CudaCommon_SRCS
-  itkCudaContextManager.cxx
   itkCudaDataManager.cxx
   itkCudaUtil.cxx
   itkCudaMemoryProbe.cxx
diff --git a/src/itkCudaContextManager.cxx b/src/itkCudaContextManager.cxx
deleted file mode 100644
index 4c80479..0000000
--- a/src/itkCudaContextManager.cxx
+++ /dev/null
@@ -1,109 +0,0 @@
-/*=========================================================================
- *
- *  Copyright NumFOCUS
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *         https://www.apache.org/licenses/LICENSE-2.0.txt
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- *
- *=========================================================================*/
-#include <cassert>
-#include "itkCudaContextManager.h"
-#include "cuda.h"
-#include "cuda_runtime_api.h"
-
-namespace itk
-{
-// static variable initialization
-CudaContextManager * CudaContextManager::m_Instance = nullptr;
-bool                 CudaContextManager::m_Initialized = false;
-
-
-CudaContextManager *
-CudaContextManager::GetInstance()
-{
-  if (m_Instance == nullptr)
-  {
-    m_Instance = new CudaContextManager();
-  }
-  m_Instance->Register();
-  return m_Instance;
-}
-
-void
-CudaContextManager::DestroyInstance()
-{
-  m_Instance->UnRegister();
-  if (m_Instance->GetReferenceCount() == 1)
-  {
-    m_Instance->Delete();
-    m_Instance = nullptr;
-  }
-}
-
-CudaContextManager::CudaContextManager()
-{
-  m_DeviceIdx = -1;
-  m_Device = 0;
-
-  if (!m_Initialized)
-  {
-    cuInit(0);
-    m_Initialized = true;
-  }
-
-  std::vector<cudaDeviceProp> devices;
-  m_NumberOfDevices = itk::CudaGetAvailableDevices(devices);
-
-  if (m_NumberOfDevices)
-  {
-    CUdevice device = 0;
-    m_DeviceIdx = itk::CudaGetMaxFlopsDev();
-    CUDA_CHECK(cuDeviceGet(&device, m_DeviceIdx));
-
-    CUDA_CHECK(cuCtxCreate(&m_Context, CU_CTX_SCHED_AUTO, device));
-
-    CUDA_CHECK(cuCtxSetCurrent(m_Context));
-
-    m_Device = device;
-  }
-  else
-  {
-    m_Context = nullptr;
-    m_Device = 0;
-    m_DeviceIdx = 0;
-  }
-}
-
-CudaContextManager::~CudaContextManager()
-{
-  if (m_Context)
-  {
-    CUDA_CHECK(cuCtxDestroy(m_Context));
-  }
-  cudaDeviceReset();
-}
-
-int
-CudaContextManager::GetCurrentDevice()
-{
-  int device = -1;
-  CUDA_CHECK(cudaGetDevice(&device));
-  return device;
-}
-
-CUcontext *
-CudaContextManager::GetCurrentContext()
-{
-  return &m_Context;
-}
-
-} // namespace itk
diff --git a/src/itkCudaDataManager.cxx b/src/itkCudaDataManager.cxx
index 6db4870..1d83101 100644
--- a/src/itkCudaDataManager.cxx
+++ b/src/itkCudaDataManager.cxx
@@ -24,18 +24,8 @@ namespace itk
 // constructor
 CudaDataManager::CudaDataManager()
 {
-  m_ContextManager = CudaContextManager::GetInstance();
-
-  // Creating the context in the constructor allows avoiding a memory leak.
-  // However, the cuda data manager is created even if there is no use of CUDA
-  // software and sometimes one compiles RTK with CUDA but wants to use it
-  // without CUDA. So if the context pointer is nullptr, which indicates that there
-  // is no CUDA device available, we just do not set the context (SR). This fixes
-  // the problem reported here:
-  // https://www.creatis.insa-lyon.fr/pipermail/rtk-users/2015-July/000570.html
-  CUcontext * ctx = m_ContextManager->GetCurrentContext();
-  if (ctx)
-    CUDA_CHECK(cuCtxSetCurrent(*ctx));
+  m_Device = 0;
+  CUDA_CHECK(cudaSetDevice(m_Device));
 
   m_CPUBuffer = nullptr;
   m_GPUBuffer = GPUMemPointer::New();
@@ -56,7 +46,6 @@ CudaDataManager::CudaDataManager()
 CudaDataManager::~CudaDataManager()
 {
   m_GPUBuffer = nullptr;
-  CudaContextManager::DestroyInstance();
 }
 
 void
@@ -91,9 +80,7 @@ CudaDataManager::Free()
   {
     try
     {
-      CUDA_CHECK(cuCtxSetCurrent(
-        *(this->m_ContextManager->GetCurrentContext()))); // This is necessary when running multithread to bind the host
-                                                          // CPU thread to the right context
+      CUDA_CHECK(cudaSetDevice(m_Device));
       m_GPUBuffer->Free();
     }
     catch (itk::ExceptionObject & e)
@@ -171,9 +158,7 @@ CudaDataManager::UpdateCPUBuffer()
       std::cout << this << "::UpdateCPUBuffer GPU->CPU data copy " << m_GPUBuffer->GetPointer() << "->" << m_CPUBuffer
                 << " : " << m_BufferSize << std::endl;
 #endif
-      CUDA_CHECK(cuCtxSetCurrent(
-        *(this->m_ContextManager->GetCurrentContext()))); // This is necessary when running multithread to bind the host
-                                                          // CPU thread to the right context
+      CUDA_CHECK(cudaSetDevice(m_Device));
       CUDA_CHECK(cudaMemcpy(m_CPUBuffer, m_GPUBuffer->GetPointer(), m_BufferSize, cudaMemcpyDeviceToHost));
       m_IsCPUBufferDirty = false;
     }
@@ -212,9 +197,7 @@ CudaDataManager::UpdateGPUBuffer()
       std::cout << this << "::UpdateGPUBuffer CPU->GPU data copy " << m_CPUBuffer << "->" << m_GPUBuffer->GetPointer()
                 << " : " << m_BufferSize << std::endl;
 #endif
-      CUDA_CHECK(cuCtxSetCurrent(
-        *(this->m_ContextManager->GetCurrentContext()))); // This is necessary when running multithread to bind the host
-                                                          // CPU thread to the right context
+      CUDA_CHECK(cudaSetDevice(m_Device));
       CUDA_CHECK(cudaMemcpy(m_GPUBuffer->GetPointer(), m_CPUBuffer, m_BufferSize, cudaMemcpyHostToDevice));
     }
     m_IsGPUBufferDirty = false;
@@ -259,7 +242,6 @@ CudaDataManager::Graft(const CudaDataManager * data)
   if (data)
   {
     m_BufferSize = data->m_BufferSize;
-    m_ContextManager = data->m_ContextManager;
     m_GPUBuffer = data->m_GPUBuffer;
     m_CPUBuffer = data->m_CPUBuffer;
     m_IsCPUBufferDirty = data->m_IsCPUBufferDirty;