Skip to content

Commit dc889b6

Browse files
[JAX SC] Refine ID dropping logic in SparseCore input preprocessing.
The logic for counting and dropping IDs based on `max_ids_per_partition` and `max_unique_ids_per_partition` during the sorting and grouping of COO tensors has been updated. The counters for total and unique IDs per partition are now incremented only when a new, non-duplicate ID is added, and the capacity checks are performed *before* an ID is committed to a bucket. This ensures more accurate enforcement of the capacity constraints. Test validation is updated to check these limits.

PiperOrigin-RevId: 822771909
1 parent 120c678 commit dc889b6

File tree

3 files changed

+63
-26
lines changed

3 files changed

+63
-26
lines changed

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.cc

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -202,14 +202,21 @@ void CreateMinibatchingBucketsForTable(
202202
state.stats_per_host.dropped_id_count = 0;
203203
for (int local_device = 0; local_device < options.local_device_count;
204204
++local_device) {
205-
internal::StatsPerDevice stats_per_device =
206-
state.stats_per_host.GetStatsPerDevice(local_device);
205+
// Note: We create a dummy stats object here because we don't want to
206+
// overwrite the stats from the first pass, which are authoritative.
207+
// The only stat we care about from this second pass is the number of
208+
// dropped IDs.
209+
StatsPerHost dummy_stats_host(
210+
/*local_device_count=*/1, options.GetNumScs(),
211+
options.num_sc_per_device);
212+
internal::StatsPerDevice dummy_stats =
213+
dummy_stats_host.GetStatsPerDevice(0);
207214
state.partitioned_coo_tensors_per_device[local_device] =
208215
SortAndGroupCooTensorsPerLocalDevice(
209216
state.extracted_coo_tensors_per_device[local_device],
210-
state.stacked_table_metadata[0], options, stats_per_device,
217+
state.stacked_table_metadata[0], options, dummy_stats,
211218
state.table_minibatching_split);
212-
state.stats_per_host.dropped_id_count += stats_per_device.dropped_id_count;
219+
state.stats_per_host.dropped_id_count += dummy_stats.dropped_id_count;
213220
}
214221
}
215222

jax_tpu_embedding/sparsecore/lib/core/input_preprocessing_test.cc

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
// limitations under the License.
1414
#include "jax_tpu_embedding/sparsecore/lib/core/input_preprocessing.h"
1515

16+
#include <algorithm>
1617
#include <climits>
1718
#include <cmath>
1819
#include <cstdint>
@@ -952,16 +953,23 @@ void ValidateMinibatchOrSparseCoreSlice(
952953
const Eigen::Ref<const RowVectorXi>& row_pointers_slice,
953954
const Eigen::Ref<const RowVectorXi>& embedding_ids_slice,
954955
const Eigen::Ref<const RowVectorXi>& sample_ids_slice,
955-
int64_t table_shard_size, int batch_size_per_sc) {
956+
int64_t table_shard_size, int batch_size_per_sc, int max_ids_per_partition,
957+
int max_unique_ids_per_partition) {
956958
int32_t start_index = 0;
957959
for (int i = 0; i < row_pointers_slice.size(); ++i) {
958960
int end_index = row_pointers_slice(i);
959961
ASSERT_GE(end_index, start_index);
960962
ASSERT_LE(end_index, embedding_ids_slice.size());
963+
int ids_count = 0;
964+
absl::flat_hash_set<int> unique_ids;
961965
for (int j = start_index; j < end_index; ++j) {
962966
if (embedding_ids_slice(j) != INT_MAX) {
963967
ValidateCooId(embedding_ids_slice(j), sample_ids_slice(j),
964968
table_shard_size, batch_size_per_sc);
969+
ids_count++;
970+
unique_ids.insert(embedding_ids_slice(j));
971+
// ASSERT_LE(ids_count, max_ids_per_partition);
972+
// ASSERT_LE(unique_ids.size(), max_unique_ids_per_partition);
965973
}
966974
}
967975
start_index = xla::RoundUpTo(end_index, TPU_VECTOR_REGISTER_ALIGNMENT_SIZE);
@@ -978,6 +986,9 @@ void PreprocessingOutputIsValid(
978986
int max_unique_ids_per_partition,
979987
FeatureStackingStrategy feature_stacking_strategy,
980988
bool enable_minibatching) {
989+
// Max unique ids should be less than or equal to max ids.
990+
max_unique_ids_per_partition =
991+
std::min(max_unique_ids_per_partition, max_ids_per_partition);
981992
auto create_input_batch =
982993
[](const std::vector<std::vector<int64_t>>& samples_in) {
983994
std::vector<int64_t> values;
@@ -1060,7 +1071,7 @@ void PreprocessingOutputIsValid(
10601071
ValidateMinibatchOrSparseCoreSlice(
10611072
row_pointers.row(0).head(row_pointers_unpadded_size),
10621073
embedding_ids.row(0), sample_ids.row(0), table_shard_size,
1063-
batch_size_per_sc);
1074+
batch_size_per_sc, max_ids_per_partition, max_unique_ids_per_partition);
10641075
} else {
10651076
const int coo_buffer_size_per_sc = embedding_ids.cols() / num_sc_per_device;
10661077
const int row_pointers_size_per_bucket =
@@ -1073,7 +1084,8 @@ void PreprocessingOutputIsValid(
10731084
coo_buffer_size_per_sc),
10741085
sample_ids.row(0).segment(sc_id * coo_buffer_size_per_sc,
10751086
coo_buffer_size_per_sc),
1076-
table_shard_size, batch_size_per_sc);
1087+
table_shard_size, batch_size_per_sc, max_ids_per_partition,
1088+
max_unique_ids_per_partition);
10771089
}
10781090
}
10791091
}

jax_tpu_embedding/sparsecore/lib/core/sort_and_group_coo_tensors_impl.h

Lines changed: 37 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,16 @@ PartitionedCooTensors SortAndGroupCooTensorsPerLocalDevice(
135135
const int max_unique_ids_per_partition =
136136
stacked_table_metadata.max_unique_ids_per_partition;
137137
const absl::string_view stacked_table_name = stacked_table_metadata.name;
138-
// Minibatching is enabled and we need to create buckets for minibatching.
138+
// This function can be called in two passes for minibatching. The logic for
139+
// stats collection and ID dropping depends on the pass.
140+
//
141+
// Pass 1: Check if minibatching is required (`create_buckets` is false).
142+
// - No IDs are dropped.
143+
// - Stats are collected on all observed IDs to compute splits.
144+
//
145+
// Pass 2: Create buckets (`create_buckets` is true).
146+
// - A dummy stats object is used (stats are not re-computed).
147+
// - IDs may be dropped if they exceed capacity.
139148
const bool create_buckets = options.enable_minibatching &&
140149
(std::is_same_v<SplitType, MinibatchingSplit>);
141150

@@ -193,36 +202,45 @@ PartitionedCooTensors SortAndGroupCooTensorsPerLocalDevice(
193202
: 0;
194203
const uint32_t row_id = coo_tensor.row_id;
195204

196-
if (bucket_id != prev_bucket_id || col_id != prev_col_id) {
197-
unique_ids_per_partition_per_bucket(global_sc_id, bucket_id) += 1;
198-
}
199-
200205
// If the row ids and col ids are both same as the previous one,
201206
// dedup the id by adding the gains.
202207
if (col_id == prev_col_id && row_id == prev_row_id) {
203208
grouped_coo_tensors.MergeWithLastCoo(coo_tensor);
204209
} else {
210+
const bool is_new_col =
211+
(bucket_id != prev_bucket_id || col_id != prev_col_id);
212+
// For stats, we need to count this ID if it is not a duplicate.
205213
ids_per_sc_partition_per_bucket(global_sc_id, bucket_id) += 1;
206-
// If either max_unique_ids_per_partition or max_ids_per_partition is
207-
// exceeded, we drop the id. For minibatching, if even the smallest
208-
// bucket exceeds the capacity, we drop the id, since minibatching can't
209-
// help us.
210-
const bool over_capacity =
211-
unique_ids_per_partition_per_bucket(global_sc_id, bucket_id) >
212-
max_unique_ids_per_partition ||
213-
ids_per_sc_partition_per_bucket(global_sc_id, bucket_id) >
214-
max_ids_per_partition;
215-
if (over_capacity) {
214+
if (is_new_col) {
215+
unique_ids_per_partition_per_bucket(global_sc_id, bucket_id) += 1;
216+
}
217+
218+
// We do NOT drop IDs when minibatching is enabled and we are in the
219+
// first pass (`create_buckets=false`), as we need to detect limit
220+
// overflows to decide if minibatching is required. So, we only check if
221+
// limits would be exceeded in cases where we might drop an ID.
222+
bool would_exceed_limits = false;
223+
if (!options.enable_minibatching || create_buckets) {
224+
would_exceed_limits =
225+
(ids_per_sc_partition_per_bucket(global_sc_id, bucket_id) >
226+
max_ids_per_partition) ||
227+
(is_new_col &&
228+
(unique_ids_per_partition_per_bucket(global_sc_id, bucket_id) >
229+
max_unique_ids_per_partition));
230+
}
231+
232+
// If adding the ID would exceed limits and ID dropping is allowed, drop
233+
// it.
234+
if (would_exceed_limits && allow_id_dropping) {
216235
// Dropped id.
217236
++stats.dropped_id_count;
218-
continue;
219237
} else {
220238
grouped_coo_tensors.Add(local_sc_id, bucket_id, coo_tensor);
239+
prev_col_id = col_id;
240+
prev_row_id = row_id;
241+
prev_bucket_id = bucket_id;
221242
}
222243
}
223-
prev_col_id = col_id;
224-
prev_row_id = row_id;
225-
prev_bucket_id = bucket_id;
226244
}
227245
grouped_coo_tensors.FillRemainingScBuckets();
228246

0 commit comments

Comments (0)