feat: Add Aggregate::addRawClusteredInput and streaming_aggregation_eager_flush #12975

Closed · wants to merge 1 commit
15 changes: 15 additions & 0 deletions velox/core/QueryConfig.h
@@ -531,6 +531,17 @@ class QueryConfig {
static constexpr const char* kRequestDataSizesMaxWaitSec =
"request_data_sizes_max_wait_sec";

/// If this is false (the default), streaming aggregation waits until there
/// are enough output rows to produce a batch of the size specified by
/// Operator::outputBatchRows.
///
/// If this is true, rows are added to the output batch as soon as the
/// corresponding groups are fully aggregated. This is useful for reducing
/// memory consumption when the downstream operators are not sensitive to
/// small batch sizes.
static constexpr const char* kStreamingAggregationEagerFlush =
"streaming_aggregation_eager_flush";

bool selectiveNimbleReaderEnabled() const {
return get<bool>(kSelectiveNimbleReaderEnabled, false);
}
@@ -976,6 +987,10 @@ class QueryConfig {
return get<bool>(kThrowExceptionOnDuplicateMapKeys, false);
}

bool streamingAggregationEagerFlush() const {
return get<bool>(kStreamingAggregationEagerFlush, false);
}

template <typename T>
T get(const std::string& key, const T& defaultValue) const {
return config_->get<T>(key, defaultValue);
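For context, a minimal sketch of how the new flag could be enabled from a test, assuming the exec::test::AssertQueryBuilder utility and an already-built `plan`; the builder calls here are illustrative and not part of this change:

// Sketch only: enable eager flush for a streaming aggregation test query,
// assuming AssertQueryBuilder and a test fixture providing pool().
auto result =
    exec::test::AssertQueryBuilder(plan)
        .config(core::QueryConfig::kStreamingAggregationEagerFlush, "true")
        .copyResults(pool());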
36 changes: 28 additions & 8 deletions velox/docs/configs.rst
@@ -31,14 +31,6 @@ Generic Configuration
- integer
- 5000
- TableScan operator will exit getOutput() method after this many milliseconds even if it has no data to return yet. Zero means 'no time limit'.
* - abandon_partial_aggregation_min_rows
- integer
- 100,000
- Number of input rows to receive before starting to check whether to abandon partial aggregation.
* - abandon_partial_aggregation_min_pct
- integer
- 80
- Abandons partial aggregation if number of groups equals or exceeds this percentage of the number of input rows.
* - abandon_partial_topn_row_number_min_rows
- integer
- 100,000
@@ -399,6 +391,34 @@ Spilling
- 0
- Percentage of aggregation or join input batches that will be forced to spill for testing. 0 means no extra spilling.

Aggregation
-----------
.. list-table::
:widths: 20 10 10 70
:header-rows: 1

* - Property Name
- Type
- Default Value
- Description
* - abandon_partial_aggregation_min_rows
- integer
- 100,000
- Number of input rows to receive before starting to check whether to abandon partial aggregation.
* - abandon_partial_aggregation_min_pct
- integer
- 80
- Abandons partial aggregation if number of groups equals or exceeds this percentage of the number of input rows.
* - streaming_aggregation_eager_flush
- bool
- false
- If this is false (the default), streaming aggregation waits until there
  are enough output rows to produce a batch of the size specified by
  Operator::outputBatchRows. If this is true, rows are added to the output
  batch as soon as the corresponding groups are fully aggregated. This is
  useful for reducing memory consumption when the downstream operators are
  not sensitive to small batch sizes.

Table Scan
------------
.. list-table::
31 changes: 31 additions & 0 deletions velox/exec/Aggregate.h
@@ -171,6 +171,35 @@ class Aggregate {
const std::vector<VectorPtr>& args,
bool mayPushdown) = 0;

/// Called by the aggregation operator to set whether the input data is
/// eligible for the clustered input optimization. This is turned off, for
/// example, when the input rows from the same group are not contiguous, or
/// when the aggregate is sorted or distinct.
void setClusteredInput(bool value) {
clusteredInput_ = value;
}

/// Whether the function itself supports clustered input optimization.
///
/// When this returns true, `addRawClusteredInput` should be implemented.
virtual bool supportsAddRawClusteredInput() const {
return false;
}

/// Fast path for the function when input rows from the same group are
/// clustered together. `groups`, `rows`, and `args` are the same as in
/// `addRawInput`; `groupBoundaries` holds indices into `groups`, each
/// indicating the position one past the last row of a group.
///
/// Will only be called when `supportsAddRawClusteredInput` returns true.
virtual void addRawClusteredInput(
char** /*groups*/,
const SelectivityVector& /*rows*/,
const std::vector<VectorPtr>& /*args*/,
const folly::Range<const vector_size_t*>& /*groupBoundaries*/) {
VELOX_NYI("Unimplemented: {} {}", typeid(*this).name(), __func__);
}

// Updates final accumulators from intermediate results.
// @param groups Pointers to the start of the group rows. These are aligned
// with the 'args', e.g. data in the i-th row of the 'args' goes to the i-th
@@ -468,6 +497,8 @@ class Aggregate {
std::vector<vector_size_t> pushdownCustomIndices_;

bool validateIntermediateInputs_ = false;

bool clusteredInput_ = false;
};

using AggregateFunctionFactory = std::function<std::unique_ptr<Aggregate>(
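To illustrate the contract above, a hedged sketch of how a sum-style aggregate could override the new hook, updating each accumulator once per run of clustered rows. `decodedValue_` and the int64_t accumulator layout are illustrative and not part of this patch; `value<T>()` and `rows.isValid()` are existing Aggregate and SelectivityVector helpers:

// Illustrative only: consume clustered raw input for a sum-style aggregate.
void addRawClusteredInput(
    char** groups,
    const SelectivityVector& rows,
    const std::vector<VectorPtr>& args,
    const folly::Range<const vector_size_t*>& groupBoundaries) override {
  decodedValue_.decode(*args[0], rows);
  vector_size_t start = 0;
  for (auto end : groupBoundaries) {
    // Rows [start, end) all belong to the group anchored at groups[start],
    // so the accumulator is located once per run instead of once per row.
    auto* sum = value<int64_t>(groups[start]);
    for (auto i = start; i < end; ++i) {
      if (rows.isValid(i) && !decodedValue_.isNullAt(i)) {
        *sum += decodedValue_.valueAt<int64_t>(i);
      }
    }
    start = end;
  }
}

bool supportsAddRawClusteredInput() const override {
  return true;
}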
52 changes: 35 additions & 17 deletions velox/exec/StreamingAggregation.cpp
@@ -32,7 +32,10 @@ StreamingAggregation::StreamingAggregation(
: "Aggregation"),
outputBatchSize_{outputBatchRows()},
aggregationNode_{aggregationNode},
step_{aggregationNode->step()} {
step_{aggregationNode->step()},
eagerFlush_{operatorCtx_->driverCtx()
->queryConfig()
.streamingAggregationEagerFlush()} {
if (aggregationNode_->ignoreNullKeys()) {
VELOX_UNSUPPORTED(
"Streaming aggregation doesn't support ignoring null keys yet");
@@ -75,6 +78,15 @@ void StreamingAggregation::initialize() {
}
}

if (isRawInput(step_)) {
for (column_index_t i = 0; i < aggregates_.size(); ++i) {
if (aggregates_[i].sortingKeys.empty() && !aggregates_[i].distinct) {
// Must be set before we initialize the row container, because it could
// change the type and size of the accumulator.
aggregates_[i].function->setClusteredInput(true);
}
}
}
masks_ = std::make_unique<AggregationMasks>(extractMaskChannels(aggregates_));
rows_ = makeRowContainer(groupingKeyTypes);

@@ -175,6 +187,9 @@ RowVectorPtr StreamingAggregation::createOutput(size_t numGroups) {
}
}

std::rotate(groups_.begin(), groups_.begin() + numGroups, groups_.end());
numGroups_ -= numGroups;

return output;
}

@@ -215,6 +230,15 @@ void StreamingAggregation::assignGroups() {
}
}
}

groupBoundaries_.clear();
for (vector_size_t i = 1; i < numInput; ++i) {
if (inputGroups_[i] != inputGroups_[i - 1]) {
groupBoundaries_.push_back(i);
}
}
VELOX_CHECK_GT(numInput, 0);
groupBoundaries_.push_back(numInput);
}

const SelectivityVector& StreamingAggregation::getSelectivityVector(
@@ -256,7 +280,12 @@ void StreamingAggregation::evaluateAggregates() {
}

if (isRawInput(step_)) {
function->addRawInput(inputGroups_.data(), rows, args, false);
if (function->supportsAddRawClusteredInput()) {
function->addRawClusteredInput(
inputGroups_.data(), rows, args, groupBoundaries_);
} else {
function->addRawInput(inputGroups_.data(), rows, args, false);
}
} else {
function->addIntermediateResults(inputGroups_.data(), rows, args, false);
}
@@ -274,9 +303,7 @@ bool StreamingAggregation::isFinished() {
RowVectorPtr StreamingAggregation::getOutput() {
if (!input_) {
if (noMoreInput_ && numGroups_ > 0) {
auto output = createOutput(numGroups_);
numGroups_ = 0;
return output;
return createOutput(numGroups_);
}
return nullptr;
}
@@ -294,19 +321,10 @@ RowVectorPtr StreamingAggregation::getOutput() {
evaluateAggregates();

RowVectorPtr output;
if (numGroups_ > outputBatchSize_) {
if (eagerFlush_ && numGroups_ > 1) {
output = createOutput(numGroups_ - 1);
} else if (numGroups_ > outputBatchSize_) {
output = createOutput(outputBatchSize_);

// Rotate the entries in the groups_ vector to move the remaining groups to
// the beginning and place re-usable groups at the end.
std::vector<char*> copy(groups_.size());
std::copy(groups_.begin() + outputBatchSize_, groups_.end(), copy.begin());
std::copy(
groups_.begin(),
groups_.begin() + outputBatchSize_,
copy.begin() + groups_.size() - outputBatchSize_);
groups_ = std::move(copy);
numGroups_ -= outputBatchSize_;
}

prevInput_ = input_;
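A short worked example of the new bookkeeping, assuming a single grouping key whose sorted values in one batch are a, a, b, b, b, c:

// inputGroups_:      g0 g0 g1 g1 g1 g2   (numInput = 6)
// groupBoundaries_:  {2, 5, 6}           // one past the last row of each group

Each boundary is one past the last row of a group, and the final entry equals the batch size, which is why assignGroups() checks numInput > 0 before pushing it. With streaming_aggregation_eager_flush enabled, getOutput() emits numGroups_ - 1 groups (a and b here) and keeps only the last, still-open group c, because the next batch may continue it; createOutput() then rotates groups_ so the retained group moves to the front and the flushed entries become reusable at the end.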
6 changes: 6 additions & 0 deletions velox/exec/StreamingAggregation.h
@@ -91,6 +91,8 @@ class StreamingAggregation : public Operator {

const core::AggregationNode::Step step_;

const bool eagerFlush_;

std::vector<column_index_t> groupingKeys_;
std::vector<AggregateInfo> aggregates_;
std::unique_ptr<SortedAggregations> sortedAggregations_;
@@ -117,6 +119,10 @@ class StreamingAggregation : public Operator {
// Pointers to groups for all input rows.
std::vector<char*> inputGroups_;

// Indices into `groups`, each indicating the position one past the last row
// of a group. The last element is the total number of input rows.
std::vector<vector_size_t> groupBoundaries_;

// A subset of input rows to evaluate the aggregate function on. Rows
// where aggregation mask is false are excluded.
SelectivityVector inputRows_;