
Gen modes: Remove -Wno-mismatched-tags #4011

Open · wants to merge 1 commit into base: main
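The hunks below all apply the same change: local variables initialized from `Tensor::numel()` (or values derived from it) switch from explicit 32-bit integer types to `auto`. Since `at::Tensor::numel()` returns `int64_t`, `auto` deduces the full 64-bit type instead of silently narrowing the element count. A minimal sketch of the before/after pattern (hypothetical helper function, assuming only the standard ATen API):

```cpp
#include <ATen/ATen.h>

#include <cstdint>
#include <type_traits>

// Hypothetical helper, not part of this PR: illustrates the pattern applied
// throughout the diff. at::Tensor::numel() returns int64_t, so binding it to
// int32_t narrows the count (and can raise conversion warnings under stricter
// flags), while auto deduces int64_t and keeps the full width.
int64_t numel_width_example(const at::Tensor& D_offsets) {
  // Before: explicit 32-bit type narrows the 64-bit element count.
  const int32_t T_narrowed = D_offsets.numel() - 1;
  (void)T_narrowed;

  // After: auto deduces int64_t.
  const auto T = D_offsets.numel() - 1;
  static_assert(std::is_same_v<decltype(T), const int64_t>);
  return T;
}
```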
@@ -175,10 +175,10 @@ Tensor int_nbit_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{
{% endif %}

{% if not nobag %}
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
{% else %}
- const int32_t total_L = indices.numel();
- const int32_t T = weights_offsets.numel();
+ const auto total_L = indices.numel();
+ const auto T = weights_offsets.numel();
{% endif %}
TORCH_CHECK(T > 0);
// offsets = [B x T + 1]
@@ -115,10 +115,10 @@ __global__ void {{ type_map[emb_weight_type].enum_name }}_split_embedding{{ "_no
indices = indices.contiguous();

{%- if not nobag %}
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
{%- else %}
- const int32_t total_L = indices.numel();
- const int32_t T = weights_offsets.numel();
+ const auto total_L = indices.numel();
+ const auto T = weights_offsets.numel();
{%- endif %}

TORCH_CHECK(T > 0);
@@ -690,9 +690,9 @@ Tensor {{ embedding_cuda_op }}(
}

{%- if not nobag %}
- int32_t T = D_offsets.numel() - 1;
+ auto T = D_offsets.numel() - 1;
{%- else %}
- int32_t T = weights_offsets.numel();
+ auto T = weights_offsets.numel();
{%- endif %}

TORCH_CHECK_GT(T, 0);
@@ -453,10 +453,10 @@ batch_index_select_dim0_codegen_forward_cuda(
CUDA_DEVICE_GUARD(dev_weights);

{%- if not nobag %}
- int32_t T = D_offsets.numel() - 1;
+ auto T = D_offsets.numel() - 1;
{%- else %}
- int32_t total_L = indices.numel();
- int32_t T = weights_offsets.numel();
+ auto total_L = indices.numel();
+ auto T = weights_offsets.numel();
{%- endif %}
TORCH_CHECK_GT(T, 0);
// offsets = [B x T + 1]
@@ -164,7 +164,7 @@ Tensor split_embedding_codegen_forward_{{ wdesc }}{{ vdesc }}_pt2_cpu_wrapper(
.device(host_weights.options().device());
const int64_t vbe_output_size_ = vbe_output_size.guard_int(__FILE__, __LINE__);
Tensor output_new = at::empty({vbe_output_size_}, options);
- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
const int32_t R = vbe_B_offsets_rank_per_feature.size(1) - 1;

for (int32_t r = 0; r < R; r++){
2 changes: 1 addition & 1 deletion fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu
@@ -670,7 +670,7 @@ at::Tensor get_fp8_per_tensor_scale(
auto _st = input.scalar_type();
TORCH_CHECK(_st == torch::kBFloat16, "Invalid datatype. input must be BF16");

- int out_size = input.numel() == 0 ? 0 : 1;
+ auto out_size = input.numel() == 0 ? 0 : 1;

at::Tensor scale = torch::empty(
{out_size},
@@ -769,7 +769,7 @@ Tensor remap_indices_update_utils_cuda(
const auto feature_offsets_a = feature_offsets.accessor<int64_t, 1>();

const auto use_gdt = full_values_list.has_value();
- const int32_t num_features = feature_lengths.numel();
+ const auto num_features = feature_lengths.numel();
const bool update_util_value = update_util.has_value()
? update_util.value()
: ((iter < 10) || (iter < 100 && (iter + 1) % 19 == 0) ||
@@ -93,7 +93,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(grad_output, a_values, a_offsets, v);
CUDA_DEVICE_GUARD(grad_output);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
const int D = grad_output.size(-1);

Tensor a_values_grad = at::zeros_like(a_values);
@@ -58,7 +58,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(v, a_values, a_offsets);
CUDA_DEVICE_GUARD(v);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
TORCH_CHECK(
B == 0 || v.size(0) % B == 0,
"B, ",
@@ -158,7 +158,7 @@ Tensor jagged_dense_bmm_forward_cuda(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, x_offsets, y);
CUDA_DEVICE_GUARD(x_values);

- const int B = x_offsets.numel() - 1;
+ const auto B = x_offsets.numel() - 1;
const int M = x_values.size(-1);
const int N = y.size(-1);
const int total_L = x_values.size(0);
@@ -164,7 +164,7 @@ Tensor jagged_jagged_bmm_forward_cuda(
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(x_values, y_values, offsets);
CUDA_DEVICE_GUARD(x_values);

- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int M = x_values.size(-1);
const int N = y_values.size(-1);
auto output = at::zeros({B, M, N}, x_values.options());
8 changes: 4 additions & 4 deletions fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp
@@ -797,7 +797,7 @@ Tensor batched_dense_vec_jagged_2d_mul_forward(
TENSOR_ON_CPU(a_values);
TENSOR_ON_CPU(a_offsets);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
TORCH_CHECK(
B == 0 || v.size(0) % B == 0,
"B, ",
@@ -834,7 +834,7 @@ std::tuple<Tensor, Tensor> batched_dense_vec_jagged_2d_mul_backward(
Tensor a_values_grad = at::zeros_like(a_values);
Tensor v_grad = at::empty_like(v);

- const int B = a_offsets.numel() - 1;
+ const auto B = a_offsets.numel() - 1;
const int D = grad_output.size(-1);

if (B > 0 && D > 0) {
@@ -1316,7 +1316,7 @@ Tensor jagged_softmax_forward(
const int64_t max_L) {
TENSOR_ON_CPU(values);
TENSOR_ON_CPU(offsets);
- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int D = values.size(1);
auto output = at::empty_like(values);

@@ -1374,7 +1374,7 @@ Tensor jagged_softmax_backward(
TENSOR_ON_CPU(grad_output);
TENSOR_ON_CPU(output);
TENSOR_ON_CPU(offsets);
- const int B = offsets.numel() - 1;
+ const auto B = offsets.numel() - 1;
const int D = grad_output.size(1);
auto grad_input = at::empty_like(grad_output);

@@ -180,8 +180,8 @@ class KeyedJaggedIndexSelectDim1GPUOp
device_guard.set_index(values.get_device());

const auto batch_size = _batch_size.guard_int(__FILE__, __LINE__);
- const int num_batches = lengths.numel() / batch_size;
- const int num_output_lengths = num_batches * indices.numel();
+ const auto num_batches = lengths.numel() / batch_size;
+ const auto num_output_lengths = num_batches * indices.numel();
const int MAX_CUMSUM_ENTRIES_PER_BLOCK = 256;
auto grid_size = cuda_calc_xblock_count(
num_output_lengths, MAX_CUMSUM_ENTRIES_PER_BLOCK);
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/memory_utils/memory_utils.cu
@@ -437,7 +437,7 @@ void copy_to_shared(const Tensor& t) {
int total_shared_mem = prop.sharedMemPerBlock;
int num_sms = prop.multiProcessorCount;
// Make sure that input tensor can fit on shared memory.
- int input_size = t.numel() * t.element_size();
+ auto input_size = t.numel() * t.element_size();
TORCH_CHECK(
input_size <= total_shared_mem,
"Input tensor is too large to fit on shared memory");
4 changes: 2 additions & 2 deletions fbgemm_gpu/src/quantize_ops/quantize_mx.cu
@@ -119,7 +119,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(
TORCH_CHECK(!flush_fp32_subnorms, "flush_fp32_subnorms is not yet supported");
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(input);

- const uint32_t total_elems = input.numel();
+ const auto total_elems = input.numel();
if (total_elems == 0) {
return at::empty(0, input.options().dtype(at::kByte));
}
@@ -145,7 +145,7 @@ DLL_PUBLIC at::Tensor quantize_mx_cuda(

at::Device device = input.device();
const at::cuda::CUDAGuard device_guard{device};
- const uint32_t total_num_groups = input.numel() / mx_group_size;
+ const auto total_num_groups = input.numel() / mx_group_size;

RoundingMode rd = static_cast<RoundingMode>(rounding_mode);

6 changes: 3 additions & 3 deletions fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
@@ -58,8 +58,8 @@ Tensor batched_unary_embeddings_forward_cuda(

// N: number of tasks, T: number of tables, B: batch size
const int32_t N = weight.size(0);
- const int32_t T = table_offsets.numel() - 1;
- const int32_t B = (offsets.numel() - 1) / T;
+ const auto T = table_offsets.numel() - 1;
+ const auto B = (offsets.numel() - 1) / T;
TORCH_CHECK(N > 0);
TORCH_CHECK(B > 0);
TORCH_CHECK(T > 0);
@@ -215,7 +215,7 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(
info_B_num_bits,
info_B_mask);

- int threads = std::min<int32_t>(sorted_linear_indices_run.numel(), 512);
+ auto threads = std::min<int32_t>(sorted_linear_indices_run.numel(), 512);
dim3 blocks(
cuda_calc_xblock_count(sorted_linear_indices_run.numel(), threads), N);
auto grad_weight = at::zeros_like(weight);
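One change in the hunk above behaves a little differently from the rest: because the comparison type is pinned by `std::min<int32_t>`, `auto` still deduces `int32_t`, so `threads` stays 32-bit and only the explicit `int` spelling goes away. A standalone sketch (illustrative values, not taken from the PR):

```cpp
#include <algorithm>
#include <cstdint>
#include <type_traits>

int main() {
  const int64_t run_count = 100000;  // stand-in for sorted_linear_indices_run.numel()
  // std::min<int32_t> fixes the comparison type, so auto deduces int32_t and
  // the 64-bit count is still converted to 32 bits at the call site.
  auto threads = std::min<int32_t>(run_count, 512);
  static_assert(std::is_same_v<decltype(threads), int32_t>);
  return threads == 512 ? 0 : 1;
}
```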
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu
@@ -157,7 +157,7 @@ bucketize_sparse_features_cuda(
CUDA_DEVICE_GUARD(lengths);

// allocate tensors and buffers
- const int lengths_size = lengths.numel();
+ const auto lengths_size = lengths.numel();
const int new_lengths_size = lengths_size * my_size;
auto offsets = at::empty({lengths_size}, lengths.options());
auto new_lengths = at::zeros({new_lengths_size}, lengths.options());
6 changes: 3 additions & 3 deletions fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp
@@ -369,7 +369,7 @@ void _block_bucketize_sparse_features_cpu_kernel(
// allocate tensors and buffers
const auto lengths_size = lengths.numel();
const auto new_lengths_size = lengths_size * my_size;
- const int32_t T = block_sizes.numel();
+ const auto T = block_sizes.numel();
const int32_t B = lengths_size / T;
auto offsets = at::empty({lengths_size + 1}, lengths.options());
auto new_offsets = at::empty({new_lengths_size + 1}, lengths.options());
@@ -1945,8 +1945,8 @@ Tensor batched_unary_embeddings_forward_cpu(

// N: number of tasks, T: number of tables, B: batch size
const int32_t N = weight.sizes()[0];
- const int32_t T = table_offsets.numel() - 1;
- const int32_t B = (offsets.numel() - 1) / T;
+ const auto T = table_offsets.numel() - 1;
+ const auto B = (offsets.numel() - 1) / T;
TORCH_CHECK(N > 0);
TORCH_CHECK(T > 0);
TORCH_CHECK(B > 0);
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu
@@ -66,7 +66,7 @@ DLL_PUBLIC Tensor segment_sum_csr_cuda(
}

constexpr uint32_t threads_per_block = 256;
- const uint32_t num_blocks = csr_seg.numel() - 1;
+ const auto num_blocks = csr_seg.numel() - 1;

FBGEMM_DISPATCH_ALL_TYPES(
values.scalar_type(), "_segment_sum_csr_cuda_1", [&] {
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu
@@ -121,7 +121,7 @@ std::pair<Tensor, Tensor> lfu_cache_find_uncached_cuda(
static_cast<int64_t>(
static_cast<uint64_t>(lxu_cache_state.size(0)) << kLFUCounterBits),
unique_indices.options().dtype(at::kLong));
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();
auto sorted_cache_sets = empty_like(cache_sets);
auto cache_set_sorted_unique_indices = empty_like(unique_indices);

@@ -182,7 +182,7 @@ void lfu_cache_insert_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

DISPATCH_EMB_CACHE_TYPES(
weights.scalar_type(),
@@ -167,7 +167,7 @@ void lfu_cache_insert_byte_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

AT_DISPATCH_INDEX_TYPES(
cache_set_sorted_unique_indices.scalar_type(),
@@ -212,7 +212,7 @@ get_unique_indices_cuda_impl(
CUDA_DEVICE_GUARD(linear_indices);

TORCH_CHECK(linear_indices.numel() < std::numeric_limits<int32_t>::max());
- const int32_t N = linear_indices.numel();
+ const auto N = linear_indices.numel();
auto sorted_indices = at::empty_like(linear_indices);
auto unique_indices = at::empty_like(linear_indices);
auto unique_indices_length =
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu
@@ -181,7 +181,7 @@ lru_cache_find_uncached_cuda(
unique_indices,
lxu_cache_state.size(0),
unique_indices.options().dtype(at::kInt));
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();
auto sorted_cache_sets = empty_like(cache_sets);
auto cache_set_sorted_unique_indices = empty_like(unique_indices);

@@ -204,7 +204,7 @@ void lru_cache_insert_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();
DISPATCH_EMB_CACHE_TYPES(
weights.scalar_type(),
lxu_cache_weights.scalar_type(),
@@ -106,7 +106,7 @@ Tensor direct_mapped_lru_cache_find_uncached_cuda(

CUDA_DEVICE_GUARD(linear_cache_indices);

- const int32_t N = linear_cache_indices.numel();
+ const auto N = linear_cache_indices.numel();

auto cache_sets = empty_like(
linear_cache_indices, linear_cache_indices.options().dtype(at::kInt));
@@ -392,7 +392,7 @@ void lru_cache_insert_byte_cuda(

CUDA_DEVICE_GUARD(weights);

- const int32_t N = cache_set_sorted_unique_indices.numel();
+ const auto N = cache_set_sorted_unique_indices.numel();

AT_DISPATCH_INDEX_TYPES(
cache_set_sorted_unique_indices.scalar_type(),
2 changes: 1 addition & 1 deletion fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu
@@ -102,7 +102,7 @@ DLL_PUBLIC void lxu_cache_flush_cuda(

CUDA_DEVICE_GUARD(lxu_cache_weights);

- const int32_t T = D_offsets.numel() - 1;
+ const auto T = D_offsets.numel() - 1;
const int32_t S = lxu_cache_weights.size(0);
const int32_t tx = std::min<int32_t>(total_D / 4 / T, kMaxThreads);
const dim3 threads(tx, kMaxThreads / tx);
@@ -368,7 +368,7 @@ ssd_cache_populate_actions_cuda(
.to(at::kInt);

TORCH_CHECK_LT(unique_indices.numel(), std::numeric_limits<int32_t>::max());
- const int32_t N = unique_indices.numel();
+ const auto N = unique_indices.numel();

auto evicted_indices = empty_like(unique_indices);
const auto int_options = unique_indices.options().dtype(at::kInt);