Commit 2a3e87e

emlin authored and facebook-github-bot committed
Add keep_orig_idx_per_feature parameter to block_bucketize_sparse_features kernel
Summary:
X-link: facebookresearch/FBGEMM#1112

**Context**

Enhance the block_bucketize_sparse_features and block_bucketize_sparse_features_inference kernels to support mixed-format embedding tables. Previously, the keep_orig_idx parameter was a single boolean flag applied uniformly across all features, determining whether to retain the original index. With the introduction of [the Flexible Collision-Free Embedding Table](https://github.com/pytorch/torchrec/blob/main/rfc/RFC-0002-Flexible-Collision-Free-Embedding-Table.md), one embedding collection may contain both collision-free and collision tables. This update lets the kernels handle mixed formats by supporting feature-wise control over index retention. For collision-free tables, a large table size of 2^50 is set, keeping parameters as id-value pairs and preserving the original global id. This change makes mixed-style embedding tables practical to use.

Spec:
- keep_orig_idx_per_feature is an optional parameter carrying per-feature settings.
- If keep_orig_idx_per_feature is not None, its per-feature values override the global keep_orig_idx flag, regardless of whether that flag is true or false.
- If keep_orig_idx_per_feature is None, the kernels fall back to the keep_orig_idx flag.

Note: Adding a separate keep_orig_idx_per_feature parameter, instead of changing keep_orig_idx directly, avoids backward-compatibility issues.

Differential Revision: D73606958
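A minimal usage sketch of the new parameter, following the operator schema registered in this commit. It is not part of the diff: it assumes fbgemm_gpu is installed so the operator is available under torch.ops.fbgemm, and the tensor values are purely illustrative.

```python
import torch
import fbgemm_gpu  # noqa: F401  # loads/registers the fbgemm operators

# Two features, batch size 2 per feature, bucketized across my_size = 2 ranks.
lengths = torch.tensor([2, 1, 1, 2], dtype=torch.int32)
indices = torch.tensor([3, 17, 9, 4, 0, 21], dtype=torch.int64)
# Feature 1 uses a very large block size to model a collision-free table.
block_sizes = torch.tensor([8, 2**50], dtype=torch.int64)

# Per-feature override: keep the original global id only for feature 1.
keep_orig_idx_per_feature = torch.tensor([False, True])

new_lengths, new_indices, _, _, _ = torch.ops.fbgemm.block_bucketize_sparse_features(
    lengths,
    indices,
    False,  # bucketize_pos
    False,  # sequence
    block_sizes,
    2,      # my_size
    keep_orig_idx=False,  # global flag; overridden per feature by the tensor below
    keep_orig_idx_per_feature=keep_orig_idx_per_feature,
)
```

When keep_orig_idx_per_feature is omitted (or None), the call behaves exactly as before and the global keep_orig_idx flag applies to every feature.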
1 parent 0f00a8a commit 2a3e87e

File tree

4 files changed: +281 −31 lines

fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h

+4

@@ -186,6 +186,7 @@ block_bucketize_sparse_features_cuda(
     const int64_t max_batch_size,
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<at::Tensor>& total_num_blocks);

 std::tuple<
@@ -208,6 +209,7 @@ block_bucketize_sparse_features_cpu(
     const int64_t max_batch_size,
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<at::Tensor>& total_num_blocks);

 std::tuple<
@@ -231,6 +233,7 @@ block_bucketize_sparse_features_inference_cuda(
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<at::Tensor>& total_num_blocks);

 ///@ingroup sparse-data-cuda
@@ -261,6 +264,7 @@ block_bucketize_sparse_features_inference_cpu(
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<at::Tensor>& total_num_blocks);

 ///@ingroup sparse-data-cpu

fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu

+53 −19

@@ -191,7 +191,8 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_pooled_sparse_features_cuda
     const index_t* const __restrict__ block_bucketize_pos_concat,
     const index_t* const __restrict__ block_bucketize_pos_offsets,
     const index_t* const __restrict__ indices_to_lb,
-    const bool keep_orig_idx) {
+    const bool keep_orig_idx,
+    const bool* const __restrict__ keep_orig_idx_per_feature) {
   using uindex_t = std::make_unsigned_t<index_t>;
   const auto bt_start = blockIdx.x * blockDim.y + threadIdx.y;
   const auto stride = gridDim.x * blockDim.y;
@@ -220,6 +221,12 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_pooled_sparse_features_cuda
         total_num_blocks == nullptr ? my_size : total_num_blocks[t];
     const index_t global_idx_size = blk_size * global_num_blks;
     const index_t local_idx_size = blk_size * local_num_blks;
+    auto keep_idx = keep_orig_idx;
+    if (keep_orig_idx_per_feature != nullptr) {
+      // When keep_orig_idx_per_feature is set, override global
+      // keep_orig_idx settings
+      keep_idx = keep_orig_idx_per_feature[t];
+    }
     for (auto i = rowstart + threadIdx.x; i < rowend; i += blockDim.x) {
       // We have use cases using none-hashed raw indices that can be either
       // negative or larger than embedding table hash_size (blk_size *
@@ -233,7 +240,7 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_pooled_sparse_features_cuda
       if (!use_block_bucketize_pos) { // uniform bucket sizes
         p = idx < global_idx_size ? idx / local_idx_size
                                   : (idx % global_num_blks) / local_num_blks;
-        if (keep_orig_idx) {
+        if (keep_idx) {
           new_idx = idx;
         } else if (idx < global_idx_size) {
           new_idx = idx % local_idx_size;
@@ -243,7 +250,7 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_pooled_sparse_features_cuda
       } else { // variable bucket sizes
         uindex_t lb = indices_to_lb[i];
         p = lb < my_size ? lb : idx % my_size;
-        if (keep_orig_idx) {
+        if (keep_idx) {
           new_idx = idx;
         } else if (blk_size == 0) {
           new_idx = idx / global_num_blks;
@@ -307,7 +314,8 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sequence_sparse_features_cu
     const index_t* const __restrict__ block_bucketize_pos_concat,
     const index_t* const __restrict__ block_bucketize_pos_offsets,
     const index_t* const __restrict__ indices_to_lb,
-    const bool keep_orig_idx) {
+    const bool keep_orig_idx,
+    const bool* const __restrict__ keep_orig_idx_per_feature) {
   using uindex_t = std::make_unsigned_t<index_t>;
   using uoffset_t = std::make_unsigned_t<offset_t>;
   CUDA_KERNEL_LOOP(b_t, lengths_size) {
@@ -324,6 +332,12 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sequence_sparse_features_cu
     offset_t rowend = offsets_data[b_t];
     const auto use_block_bucketize_pos =
         (block_bucketize_pos_concat != nullptr);
+    auto keep_idx = keep_orig_idx;
+    if (keep_orig_idx_per_feature != nullptr) {
+      // When keep_orig_idx_per_feature is set, override global
+      // keep_orig_idx settings
+      keep_idx = keep_orig_idx_per_feature[t];
+    }
     for (index_t i = rowstart; i < rowend; ++i) {
       // We have use cases using none-hashed raw indices that can be either
       // negative or larger than embedding table hash_size (blk_size *
@@ -337,7 +351,7 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sequence_sparse_features_cu
       if (!use_block_bucketize_pos) {
         p = idx < global_idx_size ? idx / local_idx_size
                                   : (idx % global_num_blks) / local_num_blks;
-        if (keep_orig_idx) {
+        if (keep_idx) {
          new_idx = idx;
        } else if (idx < global_idx_size) {
          new_idx = idx % local_idx_size;
@@ -347,7 +361,7 @@ __launch_bounds__(kMaxThreads) void _block_bucketize_sequence_sparse_features_cu
       } else {
         uindex_t lb = indices_to_lb[i];
         p = lb < my_size ? lb : idx % my_size;
-        if (keep_orig_idx) {
+        if (keep_idx) {
          new_idx = idx;
        } else if (blk_size == 0) {
          new_idx = idx / global_num_blks;
@@ -455,7 +469,10 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           block_bucketize_pos.has_value() \
               ? indices_to_lb.data_ptr<index_t>() \
               : static_cast<index_t*>(nullptr), \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature.has_value() \
+              ? keep_orig_idx_per_feature->data_ptr<bool>() \
+              : static_cast<bool*>(nullptr)); \
       C10_CUDA_KERNEL_LAUNCH_CHECK(); \
     }); \
   }); \
@@ -514,7 +531,10 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           block_bucketize_pos.has_value() \
               ? indices_to_lb.data_ptr<index_t>() \
               : static_cast<index_t*>(nullptr), \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature.has_value() \
+              ? keep_orig_idx_per_feature->data_ptr<bool>() \
+              : static_cast<bool*>(nullptr)); \
      C10_CUDA_KERNEL_LAUNCH_CHECK(); \
    }); \
  });
@@ -577,7 +597,10 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           block_bucketize_pos.has_value() \
               ? indices_to_lb.data_ptr<index_t>() \
               : static_cast<index_t*>(nullptr), \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature.has_value() \
+              ? keep_orig_idx_per_feature->data_ptr<bool>() \
+              : static_cast<bool*>(nullptr)); \
       C10_CUDA_KERNEL_LAUNCH_CHECK(); \
     }); \
   }); \
@@ -637,13 +660,17 @@ __launch_bounds__(kMaxThreads) void _populate_bucketized_permute_cuda_kernel(
           block_bucketize_pos.has_value() \
               ? indices_to_lb.data_ptr<index_t>() \
               : static_cast<index_t*>(nullptr), \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature.has_value() \
+              ? keep_orig_idx_per_feature->data_ptr<bool>() \
+              : static_cast<bool*>(nullptr)); \
       C10_CUDA_KERNEL_LAUNCH_CHECK(); \
     }); \
   });

 // This function partitions sparse features
-// continuously along the sparse dimension into my_size blocks
+// continuously along the sparse dimension into
+// my_size blocks
 std::tuple<
     Tensor,
     Tensor,
@@ -664,7 +691,8 @@ _block_bucketize_sparse_features_cuda(
     const int64_t max_B,
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
-    const bool keep_orig_idx) {
+    const bool keep_orig_idx,
+    const std::optional<Tensor>& keep_orig_idx_per_feature) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(lengths, indices);

   CUDA_DEVICE_GUARD(lengths);
@@ -740,8 +768,9 @@ _block_bucketize_sparse_features_cuda(
     at::Tensor sizes_vec =
         at::tensor(sizes_, at::TensorOptions().dtype(indices_contig.dtype()));
     block_bucketize_pos_offsets = asynchronous_exclusive_cumsum_cpu(
-        sizes_vec); // expect sizes_vec to be a small tensor, using cpu instead
-                    // of gpu for cumsum
+        sizes_vec); // expect sizes_vec to be a
+                    // small tensor, using cpu
+                    // instead of gpu for cumsum
     block_bucketize_pos_offsets = block_bucketize_pos_offsets.to(
         block_bucketize_pos_concat.device(), true);
   }
@@ -896,8 +925,8 @@ _block_bucketize_sparse_features_cuda(
 #undef LAUNCH_BLOCK_BUCKETIZE_POOLED_SPARSE_FEATURES_CUDA_KERNEL_2_WITHOUT_WEIGHT

 // This function partitions sparse features
-// continuously along the sparse dimension into my_size
-// blocks
+// continuously along the sparse dimension into
+// my_size blocks
 DLL_PUBLIC std::tuple<
     Tensor,
     Tensor,
@@ -916,6 +945,7 @@ block_bucketize_sparse_features_cuda(
     const int64_t max_B,
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<Tensor>& total_num_blocks) {
   Tensor new_lengths;
   Tensor new_indices;
@@ -942,12 +972,14 @@ block_bucketize_sparse_features_cuda(
       max_B,
       block_bucketize_pos,
       false,
-      keep_orig_idx);
+      keep_orig_idx,
+      keep_orig_idx_per_feature);
   return {new_lengths, new_indices, new_weights, new_pos, unbucketize_permute};
 }

 // This function partitions sparse features
-// continuously along the sparse dimension into my_size blocks
+// continuously along the sparse dimension into
+// my_size blocks
 DLL_PUBLIC std::tuple<
     Tensor,
     Tensor,
@@ -968,6 +1000,7 @@ block_bucketize_sparse_features_inference_cuda(
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
     const bool keep_orig_idx,
+    const std::optional<at::Tensor>& keep_orig_idx_per_feature,
     const std::optional<Tensor>& total_num_blocks) {
   return _block_bucketize_sparse_features_cuda(
       lengths,
@@ -982,7 +1015,8 @@ block_bucketize_sparse_features_inference_cuda(
       max_B,
       block_bucketize_pos,
       return_bucket_mapping,
-      keep_orig_idx);
+      keep_orig_idx,
+      keep_orig_idx_per_feature);
 }

 DLL_PUBLIC Tensor populate_bucketized_permute_cuda(

fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp

+27 −12

@@ -365,7 +365,8 @@ void _block_bucketize_sparse_features_cpu_kernel(
     const std::optional<Tensor>& batch_size_per_feature,
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const std::optional<Tensor>& bucket_mapping,
-    const bool keep_orig_idx) {
+    const bool keep_orig_idx,
+    const std::optional<Tensor>& keep_orig_idx_per_feature = std::nullopt) {
   // allocate tensors and buffers
   const auto lengths_size = lengths.numel();
   const auto new_lengths_size = lengths_size * my_size;
@@ -487,6 +488,13 @@ void _block_bucketize_sparse_features_cpu_kernel(
         : my_size;
     const index_t global_idx_size = blk_size * global_num_blks;
     const index_t local_idx_size = blk_size * local_num_blks;
+    auto keep_idx = keep_orig_idx;
+    if (keep_orig_idx_per_feature.has_value()) {
+      // When keep_orig_idx_per_feature is set, override global
+      // keep_orig_idx settings
+      keep_idx = keep_orig_idx_per_feature.value().data_ptr<bool>()[t];
+    }
+
     for (const auto b : c10::irange(cur_batch_size)) {
       const auto b_t = (variable_batch_size ? cur_offset : t * B) + b;
       const offset_t rowstart = offsets_data[b_t];
@@ -503,7 +511,7 @@ void _block_bucketize_sparse_features_cpu_kernel(
         if (variable_bucket_sizes) {
           int64_t lb = lower_bounds[i];
           p = lb < my_size ? lb : idx % my_size;
-          if (keep_orig_idx) {
+          if (keep_idx) {
            new_idx = idx;
          } else if (blk_size == 0) {
            new_idx = idx / global_num_blks;
@@ -517,7 +525,7 @@ void _block_bucketize_sparse_features_cpu_kernel(
           const uindex_t ub = static_cast<uindex_t>(global_idx_size);
           p = idx < ub ? idx / local_idx_size
                        : (idx % global_num_blks) / local_num_blks;
-          if (keep_orig_idx) {
+          if (keep_idx) {
            new_idx = idx;
          } else if (idx < ub) {
            new_idx = idx % local_idx_size;
@@ -1134,7 +1142,8 @@ _block_bucketize_sparse_features_cpu(
     const int64_t /* max_batch_size */, // Only used in GPU variant
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
-    const bool keep_orig_idx) {
+    const bool keep_orig_idx,
+    const std::optional<Tensor>& keep_orig_idx_per_feature = std::nullopt) {
   const auto lengths_size = lengths.numel();
   const auto new_lengths_size = lengths_size * my_size;
   auto new_lengths = at::zeros({new_lengths_size}, lengths.options());
@@ -1183,7 +1192,8 @@ _block_bucketize_sparse_features_cpu(
           batch_size_per_feature, \
           block_bucketize_pos, \
           bucket_mapping, \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature); \
     }); \
   }); \
 });
@@ -1219,7 +1229,8 @@ _block_bucketize_sparse_features_cpu(
           batch_size_per_feature, \
           block_bucketize_pos, \
           bucket_mapping, \
-          keep_orig_idx); \
+          keep_orig_idx, \
+          keep_orig_idx_per_feature); \
     }); \
   });
   const auto lengths_sum = indices.numel();
@@ -1289,7 +1300,8 @@ block_bucketize_sparse_features_cpu(
     const int64_t /* max_batch_size */, // Only used in GPU variant
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool keep_orig_idx,
-    const std::optional<Tensor>& total_num_blocks) {
+    const std::optional<Tensor>& keep_orig_idx_per_feature = std::nullopt,
+    const std::optional<Tensor>& total_num_blocks = std::nullopt) {
   Tensor new_lengths;
   Tensor new_indices;
   std::optional<Tensor> new_weights;
@@ -1315,7 +1327,8 @@ block_bucketize_sparse_features_cpu(
       -1, /* placeholder for max_batch_size */
       block_bucketize_pos,
       false,
-      keep_orig_idx);
+      keep_orig_idx,
+      keep_orig_idx_per_feature);
   return {new_lengths, new_indices, new_weights, new_pos, unbucketize_permute};
 }

@@ -1339,7 +1352,8 @@ block_bucketize_sparse_features_inference_cpu(
     const std::optional<std::vector<at::Tensor>>& block_bucketize_pos,
     const bool return_bucket_mapping,
     const bool keep_orig_idx,
-    const std::optional<Tensor>& total_num_blocks) {
+    const std::optional<Tensor>& keep_orig_idx_per_feature = std::nullopt,
+    const std::optional<Tensor>& total_num_blocks = std::nullopt) {
   return _block_bucketize_sparse_features_cpu(
       lengths,
       indices,
@@ -1353,7 +1367,8 @@ block_bucketize_sparse_features_inference_cpu(
       -1, /* placeholder for max_batch_size */
       block_bucketize_pos,
       return_bucket_mapping,
-      keep_orig_idx);
+      keep_orig_idx,
+      keep_orig_idx_per_feature);
 }

 // This function partitions sparse features
@@ -3401,9 +3416,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def(
       "populate_bucketized_permute(Tensor lengths, Tensor bucketized_lengths, Tensor bucket_mapping) -> Tensor");
   m.def(
-      "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None, bool keep_orig_idx=False, Tensor? total_num_blocks=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)");
+      "block_bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None, bool keep_orig_idx=False, Tensor? keep_orig_idx_per_feature=None, Tensor? total_num_blocks=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?)");
   m.def(
-      "block_bucketize_sparse_features_inference(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None, bool return_bucket_mapping=False, bool keep_orig_idx=False, Tensor? total_num_blocks=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?, Tensor?)");
+      "block_bucketize_sparse_features_inference(Tensor lengths, Tensor indices, bool bucketize_pos, bool sequence, Tensor block_sizes, SymInt my_size, Tensor? weights=None, Tensor? batch_size_per_feature=None, SymInt max_B= -1, Tensor[]? block_bucketize_pos=None, bool return_bucket_mapping=False, bool keep_orig_idx=False, Tensor? keep_orig_idx_per_feature=None, Tensor? total_num_blocks=None) -> (Tensor, Tensor, Tensor?, Tensor?, Tensor?, Tensor?)");
   m.def(
       "bucketize_sparse_features(Tensor lengths, Tensor indices, bool bucketize_pos, SymInt my_size, Tensor? weights=None) -> (Tensor, Tensor, Tensor?, Tensor?)");
   m.def(

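For completeness, a similarly hedged sketch of the inference-side call, following the block_bucketize_sparse_features_inference schema updated above. It reuses the tensors from the earlier example, requests the bucket mapping, and is illustrative only.

```python
# Returns (new_lengths, new_indices, weights?, pos?, unbucketize_permute?, bucket_mapping?)
# per the registered schema; the per-feature override behaves the same as in training.
outputs = torch.ops.fbgemm.block_bucketize_sparse_features_inference(
    lengths,
    indices,
    False,  # bucketize_pos
    False,  # sequence
    block_sizes,
    2,      # my_size
    return_bucket_mapping=True,
    keep_orig_idx=False,
    keep_orig_idx_per_feature=keep_orig_idx_per_feature,
)
new_lengths, new_indices, _, _, _, bucket_mapping = outputs
```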