
Commit 0911c94

q10 authored and facebook-github-bot committed
Use CUDAStream instead of cudaStream_t in kernel_launcher (#4071)
Summary:
Pull Request resolved: #4071

Use CUDAStream, which provides the stream ID and device_idx without the use of special APIs. at::cuda::getCurrentCUDAStream() at callsites returns a CUDAStream that is then implicitly converted to cudaStream_t, but there is no reason to do that.

Reviewed By: ngimel

Differential Revision: D74051709

fbshipit-source-id: 52ce540d1e86e019aa369e9b8910bd71f4316c51
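As an illustration of the point above (a minimal sketch, not part of this commit; the function name stream_metadata_example is hypothetical), a CUDAStream already carries both pieces of metadata and still converts to a raw handle wherever one is required:

#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAException.h>
#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>

inline void stream_metadata_example() {
  // Returns a c10::cuda::CUDAStream, not a raw cudaStream_t.
  const c10::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();

  // Stream ID and device index are plain member reads; no special
  // CUDA runtime queries or memoization are needed.
  const int64_t stream_id = stream.id();
  const auto device_idx = stream.device_index();

  // The implicit conversion to cudaStream_t still works wherever a raw
  // handle is expected.
  const cudaStream_t raw = stream;
  C10_CUDA_CHECK(cudaStreamSynchronize(raw));

  (void)stream_id;
  (void)device_idx;
}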
1 parent a3bd709 commit 0911c94

3 files changed (+4, −76 lines)

fbgemm_gpu/include/fbgemm_gpu/utils/device_properties.cuh

-72
@@ -52,76 +52,4 @@ inline auto get_device_properties(const int device) {
   }
 }

-////////////////////////////////////////////////////////////////////////////////
-// Get CUDA Device From Stream
-//
-// Given a CUDA stream, fetch the device ID that the stream is associated with.
-// This function is memoized since the operation may be expensive
-////////////////////////////////////////////////////////////////////////////////
-
-inline auto get_device_for_stream(const cudaStream_t& stream) {
-  // Keep as thread local to avoid race conditions
-  static thread_local std::unordered_map<cudaStream_t, int> table;
-
-  if (const auto search = table.find(stream); search != table.end()) {
-    return search->second;
-
-  } else {
-    int device = 0;
-
-    // CUDA 12.8+ introduced cudaStreamGetDevice() to straightforwardly fetch
-    // the device from a given stream, but since the runtime drivers may not be
-    // at the latest, it will not support the API. As such, the device ID is
-    // fetched by context capture instead.
-
-    // Save the current device
-    int current_device;
-    C10_CUDA_CHECK(cudaGetDevice(&current_device));
-
-    // Force stream association by capturing dummy work
-    cudaStreamCaptureStatus status;
-    C10_CUDA_CHECK(cudaStreamIsCapturing(stream, &status));
-
-    // Save the device associated with the stream, and revert back to the
-    // current device
-    C10_CUDA_CHECK(cudaGetDevice(&device));
-    C10_CUDA_CHECK(cudaSetDevice(current_device));
-
-    table.insert({stream, device});
-    return device;
-  }
-}
-
-inline auto get_stream_id(const cudaStream_t& stream) {
-#if defined(__HIPCC__) || (defined(CUDA_VERSION) && (CUDA_VERSION < 12060))
-  // cudaStreamGetId is not available in HIP, and is only available in
-  // CUDA 12.6+. Since streams are unique, we use its pointer value as the
-  // effective stream ID here.
-  return reinterpret_cast<unsigned long long>(stream);
-
-#else
-  // Keep as thread local to avoid race conditions
-  static thread_local std::unordered_map<cudaStream_t, unsigned long long>
-      table;
-
-  if (const auto search = table.find(stream); search != table.end()) {
-    return search->second;
-
-  } else {
-    unsigned long long streamId = 0;
-
-    if (auto [_, driver_version] = get_compute_versions();
-        driver_version <= 12060) {
-      streamId = reinterpret_cast<unsigned long long>(stream);
-
-    } else {
-      C10_CUDA_CHECK(cudaStreamGetId(stream, &streamId));
-    }
-
-    table.insert({stream, streamId});
-    return streamId;
-  }
-#endif
-}
-
} // namespace fbgemm_gpu::utils
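For context on what the removed helpers were working around (a hedged sketch, not part of this change; it assumes a CUDA 12.8+ toolkit and driver, and assumes the runtime signatures cudaStreamGetDevice(cudaStream_t, int*) and cudaStreamGetId(cudaStream_t, unsigned long long*); raw_stream_metadata is a hypothetical helper name):

#include <c10/cuda/CUDAException.h>
#include <cuda_runtime.h>

inline void raw_stream_metadata(const cudaStream_t stream) {
  // On new enough toolkits, these two runtime calls would replace the
  // context-capture and memoization tricks deleted above.
  // Note: the cudaStreamGetDevice() signature here is an assumption.
  int device = 0;
  unsigned long long stream_id = 0;
  C10_CUDA_CHECK(cudaStreamGetDevice(stream, &device));
  C10_CUDA_CHECK(cudaStreamGetId(stream, &stream_id));

  // When the stream comes from PyTorch, c10::cuda::CUDAStream::device_index()
  // and ::id() make even these queries unnecessary.
  (void)device;
  (void)stream_id;
}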

fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh

+3 −3
@@ -212,12 +212,12 @@ struct KernelLauncher {
       const dim3 grid,
       const dim3 block,
       const size_t shared_mem_per_block,
-      const cudaStream_t stream,
+      const c10::cuda::CUDAStream stream,
       Args&&... args) const {
     // Fetch device properties from the stream information
-    const auto device = get_device_for_stream(stream);
+    const auto device = stream.device_index();
     const auto properties = get_device_properties(device);
-    const auto streamId = get_stream_id(stream);
+    const auto streamId = stream.id();

     // Check that the grid sizes are within the range per the device associated
     // with the compute stream
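To make the kernel_launcher.cuh change concrete (a sketch of the assumed shape only, not the actual KernelLauncher implementation; launch_on_stream and its parameters are hypothetical), a launch path that accepts a c10::cuda::CUDAStream can read its metadata directly and still hand the raw handle to the <<<>>> launch:

#include <c10/cuda/CUDAStream.h>
#include <cuda_runtime.h>
#include <utility>

template <typename KernelFunc, typename... Args>
void launch_on_stream(
    KernelFunc kernel,
    const dim3 grid,
    const dim3 block,
    const size_t shared_mem_per_block,
    const c10::cuda::CUDAStream stream,
    Args&&... args) {
  // Metadata comes straight from the wrapper, replacing
  // get_device_for_stream(stream) and get_stream_id(stream).
  const auto device = stream.device_index();
  const auto stream_id = stream.id();
  (void)device;
  (void)stream_id;

  // stream.stream() (or the implicit conversion) yields the cudaStream_t
  // needed for the actual launch.
  kernel<<<grid, block, shared_mem_per_block, stream.stream()>>>(
      std::forward<Args>(args)...);
}

Existing callsites that already pass at::cuda::getCurrentCUDAStream() keep compiling unchanged, since the returned CUDAStream now binds directly to the parameter instead of being implicitly narrowed to cudaStream_t.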

fbgemm_gpu/test/utils/kernel_launcher_test.cu

+1 −1
@@ -233,7 +233,7 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
   at::Tensor A, B, C;
   std::tie(A, B, C) = sample_tensors(size);

-  const auto device = get_device_for_stream(at::cuda::getCurrentCUDAStream());
+  const auto device = at::cuda::getCurrentCUDAStream().device_index();
   const auto properties = get_device_properties(device);
   const auto grid_max = properties.maxGridSize;
   const auto block_max = properties.maxThreadsDim;

0 commit comments