
Gen modes: Remove -Wno-mismatched-tags #4011

Open · wants to merge 1 commit into base: main
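The hunks below all apply the same change: local variables initialized from `Tensor::numel()` (or values derived from it) switch from explicit 32-bit integer types to `auto`. Since `at::Tensor::numel()` returns `int64_t`, `auto` deduces the full 64-bit type instead of silently narrowing the element count. A minimal sketch of the before/after pattern (hypothetical helper function, assuming only the standard ATen API):

```cpp
#include <ATen/ATen.h>

#include <cstdint>
#include <type_traits>

// Hypothetical helper, not part of this PR: illustrates the pattern applied
// throughout the diff. at::Tensor::numel() returns int64_t, so binding it to
// int32_t narrows the count (and can raise conversion warnings under stricter
// flags), while auto deduces int64_t and keeps the full width.
int64_t numel_width_example(const at::Tensor& D_offsets) {
  // Before: explicit 32-bit type narrows the 64-bit element count.
  const int32_t T_narrowed = D_offsets.numel() - 1;
  (void)T_narrowed;

  // After: auto deduces int64_t.
  const auto T = D_offsets.numel() - 1;
  static_assert(std::is_same_v<decltype(T), const int64_t>);
  return T;
}
```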
@@ -175,10 +175,10 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
{% endif %}

{% if not nobag %}
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
{% else %}
- const int32_t total_L = indices.numel();
- const int32_t T = weights_offsets.numel();
+ const auto total_L = indices.numel();
+ const auto T = weights_offsets.numel();
{% endif %}
TORCH_CHECK(T > 0);
// offsets = [B x T + 1]
@@ -115,10 +115,10 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
indices = indices.contiguous();

{%- if not nobag %}
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
{%- else %}
- const int32_t total_L = indices.numel();
- const int32_t T = weights_offsets.numel();
+ const auto total_L = indices.numel();
+ const auto T = weights_offsets.numel();
{%- endif %}

TORCH_CHECK(T > 0);
@@ -690,9 +690,9 @@ Tensor {{ embedding_cuda_op }}(
}

{%- if not nobag %}
- int32_t T = D_offsets.numel() - 1;
+ auto T = D_offsets.numel() - 1;
{%- else %}
- int32_t T = weights_offsets.numel();
+ auto T = weights_offsets.numel();
{%- endif %}

TORCH_CHECK_GT(T, 0);
@@ -453,10 +453,10 @@ batch_index_select_dim0_codegen_forward_cuda(
CUDA_DEVICE_GUARD(dev_weights);

{%- if not nobag %}
- int32_t T = D_offsets.numel() - 1;
+ auto T = D_offsets.numel() - 1;
{%- else %}
- int32_t total_L = indices.numel();
- int32_t T = weights_offsets.numel();
+ auto total_L = indices.numel();
+ auto T = weights_offsets.numel();
{%- endif %}
TORCH_CHECK_GT(T, 0);
// offsets = [B x T + 1]
@@ -164,7 +164,7 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
.device(host_weights.options().device());
const int64_t vbe_output_size_ = vbe_output_size.guard_int(__FILE__, __LINE__);
Tensor output_new = at::empty({vbe_output_size_}, options);
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
const int32_t R = vbe_B_offsets_rank_per_feature.size(1) - 1;

for (int32_t r = 0; r < R; r++){
2 changes: 1 addition & 1 deletion fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
@@ -670,7 +670,7 @@ at::Tensor get_fp8_per_tensor_scale(
auto _st = input.scalar_type();
TORCH_CHECK(_st == torch::kBFloat16, "Invalid datatype. input must be BF16");

- int out_size = input.numel() == 0 ? 0 : 1;
+ auto out_size = input.numel() == 0 ? 0 : 1;

at::Tensor scale = torch::empty(
{out_size},
@@ -769,7 +769,7 @@ Tensor remap_indices_update_utils_cuda(
const auto feature_offsets_a = feature_offsets.accessor<int64_t, 1>();

const auto use_gdt = full_values_list.has_value();
- const int32_t num_features = feature_lengths.numel();
+ const auto num_features = feature_lengths.numel();
const bool update_util_value = update_util.has_value()
? update_util.value()
: ((iter < 10) || (iter < 100 && (iter + 1) % 19 == 0) ||
@@ -93,7 +93,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(grad_output, a_values, a_offsets, v);
CUDA_DEVICE_GUARD(grad_output);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
const int D = grad_output.size(-1);

Tensor a_values_grad = at::zeros_like(a_values);
@@ -58,7 +58,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(v, a_values, a_offsets);
CUDA_DEVICE_GUARD(v);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
TORCH_CHECK(
B == 0 || v.size(0) % B == 0,
"B, ",
@@ -158,7 +158,7 @@ Tensor jagged_dense_bmm_forward_cuda(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, x_offsets, y);
CUDA_DEVICE_GUARD(x_values);

- const int B = x_offsets.numel() - 1;
+ const auto B = x_offsets.numel() - 1;
const int M = x_values.size(-1);
const int N = y.size(-1);
const int total_L = x_values.size(0);
@@ -164,7 +164,7 @@ Tensor jagged_jagged_bmm_forward_cuda(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, y_values, offsets);
CUDA_DEVICE_GUARD(x_values);

- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int M = x_values.size(-1);
const int N = y_values.size(-1);
auto output = at::zeros({B, M, N}, x_values.options());
8 changes: 4 additions & 4 deletions fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
@@ -797,7 +797,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
TENSOR_ON_CPU(a_values);
TENSOR_ON_CPU(a_offsets);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
TORCH_CHECK(
B == 0 || v.size(0) % B == 0,
"B, ",
@@ -834,7 +834,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
Tensor a_values_grad = at::zeros_like(a_values);
Tensor v_grad = at::empty_like(v);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
const int D = grad_output.size(-1);

if (B > 0 && D > 0) {
@@ -1316,7 +1316,7 @@ Tensor jagged_softmax_forward(
const int64_t max_L) {
TENSOR_ON_CPU(values);
TENSOR_ON_CPU(offsets);
- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int D = values.size(1);
auto output = at::empty_like(values);

@@ -1374,7 +1374,7 @@ Tensor jagged_softmax_backward(
TENSOR_ON_CPU(grad_output);
TENSOR_ON_CPU(output);
TENSOR_ON_CPU(offsets);
- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int D = grad_output.size(1);
auto grad_input = at::empty_like(grad_output);

@@ -180,8 +180,8 @@ class KeyedJaggedIndexSelectDim1GPUOp
device_guard.set_index(values.get_device());

const auto batch_size = _batch_size.guard_int(__FILE__, __LINE__);
- const int num_batches = lengths.numel() / batch_size;
- const int num_output_lengths = num_batches * indices.numel();
+ const auto num_batches = lengths.numel() / batch_size;
+ const auto num_output_lengths = num_batches * indices.numel();
const int MAX_CUMSUM_ENTRIES_PER_BLOCK = 256;
auto grid_size = cuda_calc_xblock_count(
num_output_lengths, MAX_CUMSUM_ENTRIES_PER_BLOCK);
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -437,7 +437,7 @@ void copy_to_shared(const Tensor& t) {
int total_shared_mem = prop.sharedMemPerBlock;
int num_sms = prop.multiProcessorCount;
// Make sure that input tensor can fit on shared memory.
- int input_size = t.numel() * t.element_size();
+ auto input_size = t.numel() * t.element_size();
TORCH_CHECK(
input_size <= total_shared_mem,
"Input tensor is too large to fit on shared memory");
4 changes: 2 additions & 2 deletions fbgemm_gpu/src/quantize_ops/quantize_mx.cu
@@ -119,7 +119,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(
TORCH_CHECK(!flush_fp32_subnorms, "flush_fp32_subnorms is not yet supported");
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input);

- const uint32_t total_elems = input.numel();
+ const auto total_elems = input.numel();
if (total_elems == 0) {
return at::empty(0, input.options().dtype(at::kByte));
}
@@ -145,7 +145,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(

at::Device device = input.device();
const at::cuda::CUDAGuard device_guard{device};
- const uint32_t total_num_groups = input.numel() / mx_group_size;
+ const auto total_num_groups = input.numel() / mx_group_size;

RoundingMode rd = static_cast<RoundingMode>(rounding_mode);

6 changes: 3 additions & 3 deletions fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
@@ -58,8 +58,8 @@ Tensor batched_unary_embeddings_forward_cuda(

// N: number of tasks, T: number of tables, B: batch size
const int32_t N = weight.size(0);
- const int32_t T = table_offsets.numel() - 1;
- const int32_t B = (offsets.numel() - 1) / T;
+ const auto T = table_offsets.numel() - 1;
+ const auto B = (offsets.numel() - 1) / T;
TORCH_CHECK(N > 0);
TORCH_CHECK(B > 0);
TORCH_CHECK(T > 0);
@@ -215,7 +215,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(
info_B_num_bits,
info_B_mask);

- int threads = std::min<int32_t>(sorted_linear_indices_run.numel(), 512);
+ auto threads = std::min<int32_t>(sorted_linear_indices_run.numel(), 512);
dim3 blocks(
cuda_calc_xblock_count(sorted_linear_indices_run.numel(), threads), N);
auto grad_weight = at::zeros_like(weight);
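One change in the hunk above behaves a little differently from the rest: because the comparison type is pinned by `std::min<int32_t>`, `auto` still deduces `int32_t`, so `threads` stays 32-bit and only the explicit `int` spelling goes away. A standalone sketch (illustrative values, not taken from the PR):

```cpp
#include <algorithm>
#include <cstdint>
#include <type_traits>

int main() {
  const int64_t run_count = 100000;  // stand-in for sorted_linear_indices_run.numel()
  // std::min<int32_t> fixes the comparison type, so auto deduces int32_t and
  // the 64-bit count is still converted to 32 bits at the call site.
  auto threads = std::min<int32_t>(run_count, 512);
  static_assert(std::is_same_v<decltype(threads), int32_t>);
  return threads == 512 ? 0 : 1;
}
```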
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu
@@ -157,7 +157,7 @@ bucketize_sparse_features_cuda(
CUDA_DEVICE_GUARD(lengths);

// allocate tensors and buffers
- const int lengths_size = lengths.numel();
+ const auto lengths_size = lengths.numel();
const int new_lengths_size = lengths_size * my_size;
auto offsets = at::empty({lengths_size}, lengths.options());
auto new_lengths = at::zeros({new_lengths_size}, lengths.options());
6 changes: 3 additions & 3 deletions fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -369,7 +369,7 @@ void _block_bucketize_sparse_features_cpu_kernel(
// allocate tensors and buffers
const auto lengths_size = lengths.numel();
const auto new_lengths_size = lengths_size * my_size;
- const int32_t T = block_sizes.numel();
+ const auto T = block_sizes.numel();
const int32_t B = lengths_size / T;
auto offsets = at::empty({lengths_size + 1}, lengths.options());
auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options());
@@ -1945,8 +1945,8 @@ Tensor batched_unary_embeddings_forward_cpu(

// N: number of tasks, T: number of tables, B: batch size
const int32_t N = weight.sizes()[0];
- const int32_t T = table_offsets.numel() - 1;
- const int32_t B = (offsets.numel() - 1) / T;
+ const auto T = table_offsets.numel() - 1;
+ const auto B = (offsets.numel() - 1) / T;
TORCH_CHECK(N > 0);
TORCH_CHECK(T > 0);
TORCH_CHECK(B > 0);
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
@@ -66,7 +66,7 @@ DLL_PUBLIC Tensor segment_sum_csr_cuda(
}

constexpr uint32_t threads_per_block = 256;
- const uint32_t num_blocks = csr_seg.numel() - 1;
+ const auto num_blocks = csr_seg.numel() - 1;

FBGEMM_DISPATCH_ALL_TYPES(
values.scalar_type(), "_segment_sum_csr_cuda_1", [&] {
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu
@@ -121,7 +121,7 @@ std::pair<Tensor, Tensor> lfu_cache_find_uncached_cuda(
static_cast<int64_t>(
static_cast<uint64_t>(lxu_cache_state.size(0)) << kLFUCounterBits),
unique_indices.options().dtype(at::kLong));
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();
auto sorted_cache_sets = empty_like(cache_sets);
auto cache_set_sorted_unique_indices = empty_like(unique_indices);

@@ -182,7 +182,7 @@ void lfu_cache_insert_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

DISPATCH_EMB_CACHE_TYPES(
weights.scalar_type(),
@@ -167,7 +167,7 @@ void lfu_cache_insert_byte_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

AT_DISPATCH_INDEX_TYPES(
cache_set_sorted_unique_indices.scalar_type(),
@@ -212,7 +212,7 @@ get_unique_indices_cuda_impl(
CUDA_DEVICE_GUARD(linear_indices);

TORCH_CHECK(linear_indices.numel() < std::numeric_limits<int32_t>::max());
- const int32_t N = linear_indices.numel();
+ const auto N = linear_indices.numel();
auto sorted_indices = at::empty_like(linear_indices);
auto unique_indices = at::empty_like(linear_indices);
auto unique_indices_length =
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
@@ -181,7 +181,7 @@ lru_cache_find_uncached_cuda(
unique_indices,
lxu_cache_state.size(0),
unique_indices.options().dtype(at::kInt));
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();
auto sorted_cache_sets = empty_like(cache_sets);
auto cache_set_sorted_unique_indices = empty_like(unique_indices);

@@ -204,7 +204,7 @@ void lru_cache_insert_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();
DISPATCH_EMB_CACHE_TYPES(
weights.scalar_type(),
lxu_cache_weights.scalar_type(),
@@ -106,7 +106,7 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda(

CUDA_DEVICE_GUARD(linear_cache_indices);

- const int32_t N = linear_cache_indices.numel();
+ const auto N = linear_cache_indices.numel();

auto cache_sets = empty_like(
linear_cache_indices, linear_cache_indices.options().dtype(at::kInt));
@@ -392,7 +392,7 @@ void lru_cache_insert_byte_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

AT_DISPATCH_INDEX_TYPES(
cache_set_sorted_unique_indices.scalar_type(),
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -102,7 +102,7 @@ DLL_PUBLIC void lxu_cache_flush_cuda(

CUDA_DEVICE_GUARD(lxu_cache_weights);

- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
const int32_t S = lxu_cache_weights.size(0);
const int32_t tx = std::min<int32_t>(total_D / 4 / T, kMaxThreads);
const dim3 threads(tx, kMaxThreads / tx);
@@ -368,7 +368,7 @@ ssd_cache_populate_actions_cuda(
.to(at::kInt);

TORCH_CHECK_LT(unique_indices.numel(), std::numeric_limits<int32_t>::max());
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();

auto evicted_indices = empty_like(unique_indices);
const auto int_options = unique_indices.options().dtype(at::kInt);