diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
index 710ade449c..9566b27bbc 100644
--- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
+++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_cpu_template.cpp
@@ -175,10 +175,10 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
   {% endif %}

   {% if not nobag %}
-  const int32_t T = D_offsets.numel() - 1;
+  const auto T = D_offsets.numel() - 1;
   {% else %}
-  const int32_t total_L = indices.numel();
-  const int32_t T = weights_offsets.numel();
+  const auto total_L = indices.numel();
+  const auto T = weights_offsets.numel();
   {% endif %}
   TORCH_CHECK(T > 0);
   // offsets = [B x T + 1]
diff --git a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu
index e7ba777ae2..4fd1115ec9 100644
--- a/fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu
+++ b/fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_host_template.cu
@@ -115,10 +115,10 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
   indices = indices.contiguous();

   {%- if not nobag %}
-  const int32_t T = D_offsets.numel() - 1;
+  const auto T = D_offsets.numel() - 1;
   {%- else %}
-  const int32_t total_L = indices.numel();
-  const int32_t T = weights_offsets.numel();
+  const auto total_L = indices.numel();
+  const auto T = weights_offsets.numel();
   {%- endif %}
   TORCH_CHECK(T > 0);
diff --git a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
index 2579d59e9a..9b26cdbf44 100644
--- a/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu
@@ -690,9 +690,9 @@ Tensor {{ embedding_cuda_op }}(
   }

   {%- if not nobag %}
-  int32_t T = D_offsets.numel() - 1;
+  auto T = D_offsets.numel() - 1;
   {%- else %}
-  int32_t T = weights_offsets.numel();
+  auto T = weights_offsets.numel();
   {%- endif %}
   TORCH_CHECK_GT(T, 0);
diff --git a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
index d916ac02b2..dad9aa6791 100644
--- a/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
+++ b/fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu
@@ -453,10 +453,10 @@ batch_index_select_dim0_codegen_forward_cuda(
   CUDA_DEVICE_GUARD(dev_weights);

   {%- if not nobag %}
-  int32_t T = D_offsets.numel() - 1;
+  auto T = D_offsets.numel() - 1;
   {%- else %}
-  int32_t total_L = indices.numel();
-  int32_t T = weights_offsets.numel();
+  auto total_L = indices.numel();
+  auto T = weights_offsets.numel();
   {%- endif %}
   TORCH_CHECK_GT(T, 0);
   // offsets = [B x T + 1]
diff --git a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp
index 4826de8356..afb7523a48 100644
--- a/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp
+++ b/fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_cpu_wrapper_template.cpp
@@ -164,7 +164,7 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
            .device(host_weights.options().device());
    const int64_t vbe_output_size_ = vbe_output_size.guard_int(__FILE__, __LINE__);
    Tensor output_new = at::empty({vbe_output_size_}, options);
-   const int32_t T = D_offsets.numel() - 1;
+   const auto T = D_offsets.numel() - 1;
    const int32_t R = vbe_B_offsets_rank_per_feature.size(1) - 1;

    for (int32_t r = 0; r < R; r++){
diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
index f4b9e94272..a74f262d6b 100644
--- a/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
+++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
@@ -670,7 +670,7 @@ at::Tensor get_fp8_per_tensor_scale(
   auto _st = input.scalar_type();
   TORCH_CHECK(_st == torch::kBFloat16, "Invalid datatype. input must be BF16");

-  int out_size = input.numel() == 0 ? 0 : 1;
+  auto out_size = input.numel() == 0 ? 0 : 1;
   at::Tensor scale = torch::empty(
       {out_size},
diff --git a/fbgemm_gpu/src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu b/fbgemm_gpu/src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
index 46bf36da09..bb6f1f14b1 100644
--- a/fbgemm_gpu/src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
+++ b/fbgemm_gpu/src/intraining_embedding_pruning_ops/intraining_embedding_pruning.cu
@@ -769,7 +769,7 @@ Tensor remap_indices_update_utils_cuda(
   const auto feature_offsets_a = feature_offsets.accessor();
   const auto use_gdt = full_values_list.has_value();

-  const int32_t num_features = feature_lengths.numel();
+  const auto num_features = feature_lengths.numel();
   const bool update_util_value = update_util.has_value()
       ? update_util.value()
       : ((iter < 10) || (iter < 100 && (iter + 1) % 19 == 0) ||
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu b/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
index 08842f2f8b..f1e33b79e6 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu
@@ -93,7 +93,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(grad_output, a_values, a_offsets, v);
   CUDA_DEVICE_GUARD(grad_output);

-  const int B = a_offsets.numel() - 1;
+  const auto B = a_offsets.numel() - 1;
   const int D = grad_output.size(-1);

   Tensor a_values_grad = at::zeros_like(a_values);
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
index c13d945439..d6b467c4bf 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu
@@ -58,7 +58,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(v, a_values, a_offsets);
   CUDA_DEVICE_GUARD(v);

-  const int B = a_offsets.numel() - 1;
+  const auto B = a_offsets.numel() - 1;
   TORCH_CHECK(
       B == 0 || v.size(0) % B == 0,
       "B, ",
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_bmm_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
index 53dffedccf..1078a46c39 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_bmm_forward.cu
@@ -158,7 +158,7 @@ Tensor jagged_dense_bmm_forward_cuda(
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, x_offsets, y);
   CUDA_DEVICE_GUARD(x_values);

-  const int B = x_offsets.numel() - 1;
+  const auto B = x_offsets.numel() - 1;
   const int M = x_values.size(-1);
   const int N = y.size(-1);
   const int total_L = x_values.size(0);
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu b/fbgemm_gpu/src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
index 8fbd361a79..87acd7b732 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu
@@ -164,7 +164,7 @@ Tensor jagged_jagged_bmm_forward_cuda(
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, y_values, offsets);
   CUDA_DEVICE_GUARD(x_values);

-  const int B = offsets.numel() - 1;
+  const auto B = offsets.numel() - 1;
   const int M = x_values.size(-1);
   const int N = y_values.size(-1);
   auto output = at::zeros({B, M, N}, x_values.options());
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
index 7ce2c09ea0..b2c2b295f8 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
+++ b/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
@@ -797,7 +797,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
   TENSOR_ON_CPU(a_values);
   TENSOR_ON_CPU(a_offsets);

-  const int B = a_offsets.numel() - 1;
+  const auto B = a_offsets.numel() - 1;
   TORCH_CHECK(
       B == 0 || v.size(0) % B == 0,
       "B, ",
@@ -834,7 +834,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
   Tensor a_values_grad = at::zeros_like(a_values);
   Tensor v_grad = at::empty_like(v);

-  const int B = a_offsets.numel() - 1;
+  const auto B = a_offsets.numel() - 1;
   const int D = grad_output.size(-1);

   if (B > 0 && D > 0) {
@@ -1316,7 +1316,7 @@ Tensor jagged_softmax_forward(
     const int64_t max_L) {
   TENSOR_ON_CPU(values);
   TENSOR_ON_CPU(offsets);

-  const int B = offsets.numel() - 1;
+  const auto B = offsets.numel() - 1;
   const int D = values.size(1);
   auto output = at::empty_like(values);
@@ -1374,7 +1374,7 @@ Tensor jagged_softmax_backward(
   TENSOR_ON_CPU(grad_output);
   TENSOR_ON_CPU(output);
   TENSOR_ON_CPU(offsets);

-  const int B = offsets.numel() - 1;
+  const auto B = offsets.numel() - 1;
   const int D = grad_output.size(1);
   auto grad_input = at::empty_like(grad_output);
diff --git a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
index 22b1f56333..8f727ffe22 100644
--- a/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
+++ b/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu
@@ -180,8 +180,8 @@ class KeyedJaggedIndexSelectDim1GPUOp
     device_guard.set_index(values.get_device());

     const auto batch_size = _batch_size.guard_int(__FILE__, __LINE__);
-    const int num_batches = lengths.numel() / batch_size;
-    const int num_output_lengths = num_batches * indices.numel();
+    const auto num_batches = lengths.numel() / batch_size;
+    const auto num_output_lengths = num_batches * indices.numel();
     const int MAX_CUMSUM_ENTRIES_PER_BLOCK = 256;
     auto grid_size = cuda_calc_xblock_count(
         num_output_lengths, MAX_CUMSUM_ENTRIES_PER_BLOCK);
diff --git a/fbgemm_gpu/src/memory_utils/memory_utils.cu b/fbgemm_gpu/src/memory_utils/memory_utils.cu
index 7bf847784c..efcb53a78a 100644
--- a/fbgemm_gpu/src/memory_utils/memory_utils.cu
+++ b/fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -437,7 +437,7 @@ void copy_to_shared(const Tensor& t) {
   int total_shared_mem = prop.sharedMemPerBlock;
   int num_sms = prop.multiProcessorCount;
   // Make sure that input tensor can fit on shared memory.
-  int input_size = t.numel() * t.element_size();
+  auto input_size = t.numel() * t.element_size();
   TORCH_CHECK(
       input_size <= total_shared_mem,
       "Input tensor is too large to fit on shared memory");
diff --git a/fbgemm_gpu/src/quantize_ops/quantize_mx.cu b/fbgemm_gpu/src/quantize_ops/quantize_mx.cu
index e1fbaf2c07..6d8f29d057 100644
--- a/fbgemm_gpu/src/quantize_ops/quantize_mx.cu
+++ b/fbgemm_gpu/src/quantize_ops/quantize_mx.cu
@@ -119,7 +119,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(
   TORCH_CHECK(!flush_fp32_subnorms, "flush_fp32_subnorms is not yet supported");
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input);

-  const uint32_t total_elems = input.numel();
+  const auto total_elems = input.numel();
   if (total_elems == 0) {
     return at::empty(0, input.options().dtype(at::kByte));
   }
@@ -145,7 +145,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(
   at::Device device = input.device();
   const at::cuda::CUDAGuard device_guard{device};

-  const uint32_t total_num_groups = input.numel() / mx_group_size;
+  const auto total_num_groups = input.numel() / mx_group_size;
   RoundingMode rd = static_cast<RoundingMode>(rounding_mode);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
index f488ae8165..7f733019d7 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
@@ -58,8 +58,8 @@ Tensor batched_unary_embeddings_forward_cuda(
   // N: number of tasks, T: number of tables, B: batch size
   const int32_t N = weight.size(0);
-  const int32_t T = table_offsets.numel() - 1;
-  const int32_t B = (offsets.numel() - 1) / T;
+  const auto T = table_offsets.numel() - 1;
+  const auto B = (offsets.numel() - 1) / T;
   TORCH_CHECK(N > 0);
   TORCH_CHECK(B > 0);
   TORCH_CHECK(T > 0);
@@ -215,7 +215,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(
       info_B_num_bits,
       info_B_mask);

-  int threads = std::min(sorted_linear_indices_run.numel(), 512);
+  auto threads = std::min(sorted_linear_indices_run.numel(), 512);
   dim3 blocks(
       cuda_calc_xblock_count(sorted_linear_indices_run.numel(), threads), N);
   auto grad_weight = at::zeros_like(weight);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu
index 3d87d6fe40..d010d6cf4e 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu
@@ -157,7 +157,7 @@ bucketize_sparse_features_cuda(
   CUDA_DEVICE_GUARD(lengths);

   // allocate tensors and buffers
-  const int lengths_size = lengths.numel();
+  const auto lengths_size = lengths.numel();
   const int new_lengths_size = lengths_size * my_size;
   auto offsets = at::empty({lengths_size}, lengths.options());
   auto new_lengths = at::zeros({new_lengths_size}, lengths.options());
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
index 2b6e6e4122..66d9c9240d 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
+++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -369,7 +369,7 @@ void _block_bucketize_sparse_features_cpu_kernel(
   // allocate tensors and buffers
   const auto lengths_size = lengths.numel();
   const auto new_lengths_size = lengths_size * my_size;
-  const int32_t T = block_sizes.numel();
+  const auto T = block_sizes.numel();
   const int32_t B = lengths_size / T;
   auto offsets = at::empty({lengths_size + 1}, lengths.options());
   auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options());
@@ -1945,8 +1945,8 @@ Tensor batched_unary_embeddings_forward_cpu(
   // N: number of tasks, T: number of tables, B: batch size
   const int32_t N = weight.sizes()[0];
-  const int32_t T = table_offsets.numel() - 1;
-  const int32_t B = (offsets.numel() - 1) / T;
+  const auto T = table_offsets.numel() - 1;
+  const auto B = (offsets.numel() - 1) / T;
   TORCH_CHECK(N > 0);
   TORCH_CHECK(T > 0);
   TORCH_CHECK(B > 0);
diff --git a/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu b/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
index a5edae15f7..de560cf884 100644
--- a/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
+++ b/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
@@ -66,7 +66,7 @@ DLL_PUBLIC Tensor segment_sum_csr_cuda(
   }

   constexpr uint32_t threads_per_block = 256;
-  const uint32_t num_blocks = csr_seg.numel() - 1;
+  const auto num_blocks = csr_seg.numel() - 1;
   FBGEMM_DISPATCH_ALL_TYPES(
       values.scalar_type(), "_segment_sum_csr_cuda_1", [&] {
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu
index 7fe594d1a9..42a2a55fcd 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu
@@ -121,7 +121,7 @@ std::pair<Tensor, Tensor> lfu_cache_find_uncached_cuda(
       static_cast(
          static_cast(lxu_cache_state.size(0)) << kLFUCounterBits),
       unique_indices.options().dtype(at::kLong));

-  const int32_t N = unique_indices.numel();
+  const auto N = unique_indices.numel();
   auto sorted_cache_sets = empty_like(cache_sets);
   auto cache_set_sorted_unique_indices = empty_like(unique_indices);
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu
index 370f30cb3f..4084afd6cf 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu
@@ -182,7 +182,7 @@ void lfu_cache_insert_cuda(
   CUDA_DEVICE_GUARD(weights);

-  const int32_t N = cache_set_sorted_unique_indices.numel();
+  const auto N = cache_set_sorted_unique_indices.numel();

   DISPATCH_EMB_CACHE_TYPES(
       weights.scalar_type(),
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu
index 1fa91c519e..4efab588ad 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu
@@ -167,7 +167,7 @@ void lfu_cache_insert_byte_cuda(
   CUDA_DEVICE_GUARD(weights);

-  const int32_t N = cache_set_sorted_unique_indices.numel();
+  const auto N = cache_set_sorted_unique_indices.numel();

   AT_DISPATCH_INDEX_TYPES(
       cache_set_sorted_unique_indices.scalar_type(),
diff --git a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
index a96b254bfd..5a99f09a6a 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu
@@ -212,7 +212,7 @@ get_unique_indices_cuda_impl(
   CUDA_DEVICE_GUARD(linear_indices);

   TORCH_CHECK(linear_indices.numel() < std::numeric_limits<int32_t>::max());
-  const int32_t N = linear_indices.numel();
+  const auto N = linear_indices.numel();
   auto sorted_indices = at::empty_like(linear_indices);
   auto unique_indices = at::empty_like(linear_indices);
   auto unique_indices_length =
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
index 517d1b373b..653147c67c 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
@@ -181,7 +181,7 @@ lru_cache_find_uncached_cuda(
       unique_indices,
       lxu_cache_state.size(0),
       unique_indices.options().dtype(at::kInt));

-  const int32_t N = unique_indices.numel();
+  const auto N = unique_indices.numel();
   auto sorted_cache_sets = empty_like(cache_sets);
   auto cache_set_sorted_unique_indices = empty_like(unique_indices);
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
index fde27d8211..94f98a5023 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu
@@ -204,7 +204,7 @@ void lru_cache_insert_cuda(
   CUDA_DEVICE_GUARD(weights);

-  const int32_t N = cache_set_sorted_unique_indices.numel();
+  const auto N = cache_set_sorted_unique_indices.numel();

   DISPATCH_EMB_CACHE_TYPES(
       weights.scalar_type(),
       lxu_cache_weights.scalar_type(),
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
index e52af82bba..88d4b13574 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu
@@ -106,7 +106,7 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda(
   CUDA_DEVICE_GUARD(linear_cache_indices);

-  const int32_t N = linear_cache_indices.numel();
+  const auto N = linear_cache_indices.numel();

   auto cache_sets = empty_like(
       linear_cache_indices, linear_cache_indices.options().dtype(at::kInt));
@@ -392,7 +392,7 @@ void lru_cache_insert_byte_cuda(
   CUDA_DEVICE_GUARD(weights);

-  const int32_t N = cache_set_sorted_unique_indices.numel();
+  const auto N = cache_set_sorted_unique_indices.numel();

   AT_DISPATCH_INDEX_TYPES(
       cache_set_sorted_unique_indices.scalar_type(),
diff --git a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
index daeb6b0be4..e87e97ce1f 100644
--- a/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
+++ b/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -102,7 +102,7 @@ DLL_PUBLIC void lxu_cache_flush_cuda(
   CUDA_DEVICE_GUARD(lxu_cache_weights);

-  const int32_t T = D_offsets.numel() - 1;
+  const auto T = D_offsets.numel() - 1;
   const int32_t S = lxu_cache_weights.size(0);
   const int32_t tx = std::min(total_D / 4 / T, kMaxThreads);
   const dim3 threads(tx, kMaxThreads / tx);
diff --git a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu
index 340d616e6f..c9c29661b7 100644
--- a/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu
+++ b/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu
@@ -368,7 +368,7 @@ ssd_cache_populate_actions_cuda(
           .to(at::kInt);

   TORCH_CHECK_LT(unique_indices.numel(), std::numeric_limits<int32_t>::max());
-  const int32_t N = unique_indices.numel();
+  const auto N = unique_indices.numel();
   auto evicted_indices = empty_like(unique_indices);
   const auto int_options = unique_indices.options().dtype(at::kInt);
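Note (not part of the patch): the recurring change above replaces fixed-width declarations such as `int32_t T = D_offsets.numel() - 1;` with `auto`, because `at::Tensor::numel()` returns `int64_t`; assigning it to a 32-bit type silently truncates once the element count exceeds INT32_MAX, while `auto` deduces the full 64-bit type. The standalone sketch below illustrates that narrowing; `fake_numel` is a hypothetical stand-in for `Tensor::numel()` so the example compiles without ATen.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for at::Tensor::numel(), which returns int64_t.
int64_t fake_numel() {
  // An element count just above INT32_MAX, plausible for a very large table.
  return (int64_t{1} << 31) + 7;
}

int main() {
  // Old pattern: int64_t -> int32_t conversion silently truncates the count.
  const int32_t n_narrowed = fake_numel();
  // New pattern: auto deduces int64_t and preserves the full value.
  const auto n_auto = fake_numel();

  std::cout << "int32_t: " << n_narrowed << "\n"; // wraps to a negative value
  std::cout << "auto   : " << n_auto << "\n";     // prints 2147483655
  return 0;
}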