Commit 48d3161

applied jinja is_rocm onto optimizations for backward and forward parameters
1 parent 04d590e commit 48d3161

5 files changed: +24 -15 lines changed

fbgemm_gpu/codegen/training/backward/embedding_backward_split_indice_weights_template.cu

Lines changed: 4 additions & 1 deletion
@@ -213,7 +213,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
       2, offset_idx + D_emb <= weights_numel, offset_idx
     )
     {%- endif %}
-
+    {%- if is_rocm %}
     int32_t j = 0;
     {%- if not ssd and not dense and not use_vec_blocking and not vbe %}
     // Currently for split_embedding_codegen_grad_indice_weights_kernel only
@@ -335,6 +335,9 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
     }
     {%- endif %}
     for (; j < kWarpSize && l_start + j < L; ++j) {
+    {%- else %} // if is_rocm
+    for (auto j = 0; j < kWarpSize && l_start + j < L; ++j) {
+    {%- endif %} // if is_rocm
       const auto offset_idx_j = shfl_sync(offset_idx, j);
       {%- if not dense %}
       const auto {{ locs_or_addrs_idx }}_j = shfl_sync({{ locs_or_addrs_idx }}, j);
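The hunk above gates the existing ROCm loop prologue behind {%- if is_rocm %} and restores the plain for (auto j = 0; ...) loop in the else branch, so the choice is made when the .cu source is generated rather than when it is compiled. A minimal sketch of that mechanism, rendering a trimmed-down fragment with the jinja2 package directly (illustrative only, not FBGEMM's actual codegen driver):

# Sketch only (not FBGEMM's codegen driver): rendering a trimmed fragment of the
# template with is_rocm=True vs. is_rocm=False shows that each generated .cu
# file contains exactly one of the two loop headers.
from jinja2 import Template

LOOP_FRAGMENT = """\
{%- if is_rocm %}
int32_t j = 0;
// ... ROCm-specific prologue may advance j before the main loop ...
for (; j < kWarpSize && l_start + j < L; ++j) {
{%- else %}
for (auto j = 0; j < kWarpSize && l_start + j < L; ++j) {
{%- endif %}
"""

print(Template(LOOP_FRAGMENT).render(is_rocm=True))   # ROCm render: prologue + continuation loop
print(Template(LOOP_FRAGMENT).render(is_rocm=False))  # non-ROCm render: self-contained loop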

fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 12 additions & 10 deletions
@@ -987,8 +987,11 @@ Tensor {{ embedding_cuda_op }}(
   auto num_long_run_ids = at::zeros({1}, indices.options().dtype(at::kInt));

   const bool use_deterministic_algorithms = at::globalContext().deterministicAlgorithms();
-  const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
-
+  {% if is_rocm %}
+  const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
+  {% else %}
+  const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 1024;
+  {%- endif %}
   Tensor long_run_id_to_really_long_run_ids;
   if (use_deterministic_algorithms) {
     long_run_id_to_really_long_run_ids =
@@ -1059,8 +1062,8 @@ Tensor {{ embedding_cuda_op }}(

   // Compute shared memory size for cta_per_row
   constexpr auto kCacheAccBytes = sizeof(at::acc_type<cache_t, true>);
-  int32_t total_L = indices.numel();
-#ifdef USE_ROCM
+  {% if is_rocm %}
+  int32_t total_L = indices.numel();
   int32_t num_cta_per_row_groups;
   int32_t work_group_size;
   if (total_L/total_B > 1){
@@ -1071,10 +1074,10 @@ Tensor {{ embedding_cuda_op }}(
     num_cta_per_row_groups = kMaxThreads / kWarpSize;
     work_group_size = kMaxThreads;
   }
-#else
+  {%- else %}
   int32_t num_cta_per_row_groups = kMaxThreads / kWarpSize;
   int32_t work_group_size = kMaxThreads;
-#endif
+  {%- endif %}
   const size_t cta_per_row_smem_bytes = compute_num_groups_and_dynamic_smem_bytes(
       &num_cta_per_row_groups,
       [&] (int num_groups) {
@@ -1091,7 +1094,6 @@ Tensor {{ embedding_cuda_op }}(
   FBGEMM_LAUNCH_KERNEL(
       backward_cta_per_row_kernel,
       cta_per_row_grid_size,
-      // (64, 2)
       dim3(kThreadGroupSize, num_cta_per_row_groups),
       cta_per_row_smem_bytes,
       at::cuda::getCurrentCUDAStream(),
@@ -1195,7 +1197,7 @@ Tensor {{ embedding_cuda_op }}(
       kUseVecBlocking>;

   // Compute shared memory size for warp_per_row
-#ifdef USE_ROCM
+  {%- if is_rocm %}
   int32_t num_warp_per_row_groups;

   if (total_L/total_B > 1){
@@ -1204,9 +1206,9 @@ Tensor {{ embedding_cuda_op }}(
   else{
     num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
   }
-#else
+  {%- else %}
   int32_t num_warp_per_row_groups = kBackwardMaxThreads / kThreadGroupSize;
-#endif
+  {%- endif %}
   int32_t warp_per_row_smem_bytes = 0;

   if constexpr (kUseVecBlocking) {
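Beyond swapping #ifdef USE_ROCM for {% if is_rocm %}, this file also makes max_segment_length_per_cta backend-dependent: 4096 on ROCm, 1024 otherwise, with INT_MAX still used whenever deterministic algorithms are requested. A quick sanity check of which constant each render picks up, again a sketch that runs jinja2 on a trimmed fragment rather than the full template:

# Sketch only: confirm which max_segment_length_per_cta cap each backend's
# generated source ends up with. The fragment mirrors the hunk above but is trimmed.
from jinja2 import Template

CTA_FRAGMENT = """\
{% if is_rocm %}
const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 4096;
{% else %}
const int max_segment_length_per_cta = use_deterministic_algorithms ? INT_MAX : 1024;
{%- endif %}
"""

rocm_src = Template(CTA_FRAGMENT).render(is_rocm=True)
cuda_src = Template(CTA_FRAGMENT).render(is_rocm=False)
assert "4096" in rocm_src and "4096" not in cuda_src
assert "1024" in cuda_src and "1024" not in rocm_src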

fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 2 additions & 2 deletions
@@ -458,15 +458,15 @@ batch_index_select_dim0_codegen_forward_cuda(

   CUDA_DEVICE_GUARD(dev_weights);

-#ifdef USE_ROCM
+  {% if is_rocm %}
   if (!rocm::is_supported_cdna()) {
     TORCH_WARN_ONCE("Running on non-CDNA architecture. Performance may be suboptimal.");
   }
   else {
     // Ensure we're running on a supported CDNA architecture (including MI350)
     TORCH_WARN_ONCE("Running on CDNA architecture");
   }
-#endif
+  {%- endif %}

   {%- if not nobag %}
   int32_t T = D_offsets.numel() - 1;

fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp

Lines changed: 2 additions & 2 deletions
@@ -341,7 +341,7 @@ class BatchIndexSelectDim0GPUOp
     Tensor grad_dev_weights;
     TORCH_CHECK_EQ(grad_outputs.size(), 1);

-    constexpr int32_t max_segment_length_per_warp = 16384;
+    constexpr int32_t max_segment_length_per_warp = 32;

     auto grad_output = grad_outputs[0];

@@ -656,7 +656,7 @@ class BatchIndexSelectDim0TensorGPUOp
     const auto permute_output_dim_0_1 =
         ctx->saved_data["permute_output_dim_0_1"].toBool();

-    constexpr int32_t max_segment_length_per_warp = 16384;
+    constexpr int32_t max_segment_length_per_warp = 32;

     auto grad_output = grad_outputs[0];


fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp

Lines changed: 4 additions & 0 deletions
@@ -698,8 +698,10 @@ class {{ autograd_func }} :
     TORCH_CHECK(aux_tensor[IDX_LXU_CACHE_LOCATIONS].has_value(), "lxu_cache_locations should have value.");
     const auto lxu_cache_locations = aux_tensor[IDX_LXU_CACHE_LOCATIONS].value();
     const auto is_experimental = aux_bool[IDX_IS_EXPERIMENTAL_TBE];
+    {% if is_rocm %}
     const auto mixed_D = aux_bool[IDX_MIXED_D];
     {%- endif %}
+    {%- endif %}

     // Default values for Dynamo tracing
     // SymInt does not support bitshifts operator
@@ -1009,7 +1011,9 @@ static torch::autograd::variable_list backward(
     int32_t max_segment_length_per_warp = 64;
     // Workaround. Should not be upstreamed in any way.
     // Redistribute all cta_per_row work to warp_per_row.
+    {% if is_rocm %}
     int32_t total_L = indices.numel();
+    {%- endif %}
     {%- if (not nobag) and
         (optimizer == "rowwise_adagrad") and
         (not vbe) and
