Support cast from decimal to varchar (6210)

rui-mo · rui-mo · commit 6942d8c5fe3f · 2023-08-31T14:06:38.000+08:00
diff --git a/velox/benchmarks/basic/CastBenchmark.cpp b/velox/benchmarks/basic/CastBenchmark.cpp
@@ -27,27 +27,41 @@ int main(int argc, char** argv) {
   folly::Init init(&argc, &argv);
 
   ExpressionBenchmarkBuilder benchmarkBuilder;
-
+  const vector_size_t vectorSize = 1000;
   auto vectorMaker = benchmarkBuilder.vectorMaker();
   auto invalidInput = vectorMaker.flatVector<facebook::velox::StringView>({""});
   auto validInput = vectorMaker.flatVector<facebook::velox::StringView>({""});
   auto nanInput = vectorMaker.flatVector<facebook::velox::StringView>({""});
+  auto shortDecimalInput = vectorMaker.flatVector<int64_t>(
+      vectorSize, [&](auto j) { return 12345 * j; }, nullptr, DECIMAL(18, 6));
+  auto longDecimalInput = vectorMaker.flatVector<int128_t>(
+      vectorSize,
+      [&](auto j) {
+        return facebook::velox::HugeInt::build(12345 * j, 56789 * j + 12345);
+      },
+      nullptr,
+      DECIMAL(38, 16));
 
-  invalidInput->resize(1000);
-  validInput->resize(1000);
-  nanInput->resize(1000);
+  invalidInput->resize(vectorSize);
+  validInput->resize(vectorSize);
+  nanInput->resize(vectorSize);
 
-  for (int i = 0; i < 1000; i++) {
+  for (int i = 0; i < vectorSize; i++) {
     nanInput->set(i, "$"_sv);
     invalidInput->set(i, StringView::makeInline(std::string("")));
     validInput->set(i, StringView::makeInline(std::to_string(i)));
   }
 
   benchmarkBuilder
       .addBenchmarkSet(
-          "cast_int",
+          "cast",
           vectorMaker.rowVector(
-              {"valid", "empty", "nan"}, {validInput, invalidInput, nanInput}))
+              {"valid", "empty", "nan", "short_decimal", "long_decimal"},
+              {validInput,
+               invalidInput,
+               nanInput,
+               shortDecimalInput,
+               longDecimalInput}))
       .addExpression("try_cast_invalid_empty_input", "try_cast (empty as int) ")
       .addExpression(
           "tryexpr_cast_invalid_empty_input", "try (cast (empty as int))")
@@ -56,6 +70,8 @@ int main(int argc, char** argv) {
       .addExpression("try_cast_valid", "try_cast (valid as int)")
       .addExpression("tryexpr_cast_valid", "try (cast (valid as int))")
       .addExpression("cast_valid", "cast(valid as int)")
+      .addExpression("cast_short_decimal", "cast (short_decimal as varchar)")
+      .addExpression("cast_long_decimal", "cast (long_decimal as varchar)")
       .withIterations(100)
       .disableTesting();
 
diff --git a/velox/docs/functions/presto/conversion.rst b/velox/docs/functions/presto/conversion.rst
@@ -173,7 +173,7 @@ supported conversions to/from JSON are listed in :doc:`json`.
      - Y
      - Y
      - Y
-     -
+     - Y
      -
      -
      - Y
@@ -457,6 +457,7 @@ Valid examples
   SELECT cast(infinity() as varchar); -- 'Infinity'
   SELECT cast(true as varchar); -- 'true'
   SELECT cast(timestamp '1970-01-01 00:00:00' as varchar); -- '1970-01-01T00:00:00.000'
+  SELECT cast(cast(22.51 as DECIMAL(4, 2)) as varchar); -- '22.51'
 
 Cast to Timestamp
 -----------------
diff --git a/velox/expression/CastExpr-inl.h b/velox/expression/CastExpr-inl.h
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 #pragma once
+
+#include <charconv>
+
 #include "velox/common/base/Exceptions.h"
 #include "velox/core/CoreTypeSystem.h"
 #include "velox/expression/StringWriter.h"
@@ -48,6 +51,27 @@ inline std::exception_ptr makeBadCastException(
       false));
 }
 
+/// Returns the number of decimal digits in n. Leading zeros are not counted
+/// except for n == 0 in which case count_digits returns 1.
+inline int countDigits(uint128_t n) {
+  int count = 1;
+  for (;;) {
+    // Integer division is slow so do it for a group of four digits instead
+    // of for every digit. The idea comes from the talk by Alexandrescu
+    // "Three Optimization Tips for C++". See speed-test for a comparison.
+    if (n < 10)
+      return count;
+    if (n < 100)
+      return count + 1;
+    if (n < 1000)
+      return count + 2;
+    if (n < 10000)
+      return count + 3;
+    n /= 10000U;
+    count += 4;
+  }
+}
+
 } // namespace
 
 template <bool adjustForTimeZone>
@@ -366,6 +390,86 @@ VectorPtr CastExpr::applyDecimalToBooleanCast(
   return result;
 }
 
+template <typename FromNativeType>
+VectorPtr CastExpr::applyDecimalToVarcharCast(
+    const SelectivityVector& rows,
+    const BaseVector& input,
+    exec::EvalCtx& context,
+    const TypePtr& fromType) {
+  VectorPtr result;
+  context.ensureWritable(rows, VARCHAR(), result);
+  (*result).clearNulls(rows);
+  const auto simpleInput = input.as<SimpleVector<FromNativeType>>();
+  int precision = getDecimalPrecisionScale(*fromType).first;
+  int scale = getDecimalPrecisionScale(*fromType).second;
+  // A varchar's size is estimated with unscaled value digits, dot, leading
+  // zero, and possible minus sign.
+  int varcharSize = precision + !!(scale > 0) + !!(precision == scale) + 1;
+  // Calculate the max total size of the raw string buffer.
+  size_t maxTotalResultBytes = rows.countSelected() * varcharSize;
+
+  auto flatResult = result->asFlatVector<StringView>();
+  Buffer* buffer = flatResult->getBufferWithSpace(maxTotalResultBytes);
+  size_t bufferSize = buffer->size();
+  char* rawBuffer = buffer->asMutable<char>() + bufferSize;
+  buffer->setSize(bufferSize + maxTotalResultBytes);
+  size_t offset = 0;
+  applyToSelectedNoThrowLocal(context, rows, result, [&](vector_size_t row) {
+    if (simpleInput->isNullAt(row)) {
+      result->setNull(row, true);
+    } else {
+      auto unscaledValue = simpleInput->valueAt(row);
+      const char zero = '0';
+      size_t startOffset = offset;
+      if (unscaledValue == 0) {
+        memcpy(rawBuffer + offset, &zero, sizeof(char));
+        offset += sizeof(char);
+      } else {
+        if (unscaledValue < 0) {
+          const char minusSign = '-';
+          memcpy(rawBuffer + offset, &minusSign, sizeof(char));
+          offset += sizeof(char);
+          unscaledValue = ~unscaledValue + 1;
+        }
+        auto [position, ec] = std::to_chars(
+            rawBuffer + offset,
+            rawBuffer + offset + varcharSize,
+            unscaledValue / DecimalUtil::kPowersOfTen[scale]);
+        VELOX_USER_CHECK(
+            ec == std::errc(),
+            "Cast from decimal to varchar fails because {}.",
+            std::make_error_code(ec).message());
+        offset = position - rawBuffer;
+        if (scale > 0) {
+          const char dot = '.';
+          memcpy(rawBuffer + offset, &dot, sizeof(char));
+          offset += sizeof(char);
+
+          uint128_t fraction = unscaledValue % DecimalUtil::kPowersOfTen[scale];
+          // Append leading zeros.
+          for (size_t i = 0; i < std::max(scale - countDigits(fraction), 0);
+               i++) {
+            memcpy(rawBuffer + offset, &zero, sizeof(char));
+            offset += sizeof(char);
+          }
+          auto [position, ec] = std::to_chars(
+              rawBuffer + offset, rawBuffer + offset + varcharSize, fraction);
+          VELOX_USER_CHECK(
+              ec == std::errc(),
+              "Cast from decimal to varchar fails because {}.",
+              std::make_error_code(ec).message());
+          offset = position - rawBuffer;
+        }
+      }
+      flatResult->setNoCopy(
+          row, StringView(rawBuffer + startOffset, offset - startOffset));
+    }
+  });
+  // Update the exact buffer size.
+  buffer->setSize(bufferSize + offset);
+  return result;
+}
+
 template <typename FromNativeType>
 VectorPtr CastExpr::applyDecimalToPrimitiveCast(
     const SelectivityVector& rows,
diff --git a/velox/expression/CastExpr.cpp b/velox/expression/CastExpr.cpp
@@ -526,14 +526,26 @@ void CastExpr::applyPeeled(
   } else if (toType->isLongDecimal()) {
     result = applyDecimal<int128_t>(rows, input, context, fromType, toType);
   } else if (fromType->isDecimal()) {
-    result = VELOX_DYNAMIC_DECIMAL_TYPE_DISPATCH(
-        applyDecimalToPrimitiveCast,
-        fromType,
-        rows,
-        input,
-        context,
-        fromType,
-        toType);
+    switch (toType->kind()) {
+      case TypeKind::VARCHAR:
+        result = VELOX_DYNAMIC_DECIMAL_TYPE_DISPATCH(
+            applyDecimalToVarcharCast,
+            fromType,
+            rows,
+            input,
+            context,
+            fromType);
+        break;
+      default:
+        result = VELOX_DYNAMIC_DECIMAL_TYPE_DISPATCH(
+            applyDecimalToPrimitiveCast,
+            fromType,
+            rows,
+            input,
+            context,
+            fromType,
+            toType);
+    }
   } else {
     switch (toType->kind()) {
       case TypeKind::MAP:
diff --git a/velox/expression/CastExpr.h b/velox/expression/CastExpr.h
@@ -264,6 +264,13 @@ class CastExpr : public SpecialForm {
       const BaseVector& input,
       VectorPtr& result);
 
+  template <typename FromNativeType>
+  VectorPtr applyDecimalToVarcharCast(
+      const SelectivityVector& rows,
+      const BaseVector& input,
+      exec::EvalCtx& context,
+      const TypePtr& fromType);
+
   template <TypeKind ToKind>
   void applyCastPrimitivesDispatch(
       const TypePtr& fromType,
diff --git a/velox/expression/tests/CastExprTest.cpp b/velox/expression/tests/CastExprTest.cpp
@@ -1431,6 +1431,46 @@ TEST_F(CastExprTest, decimalToBool) {
       "c0", longFlat, makeNullableFlatVector<bool>({1, 0, std::nullopt}));
 }
 
+TEST_F(CastExprTest, decimalToVarchar) {
+  auto shortFlat = makeNullableFlatVector<int64_t>(
+      {DecimalUtil::kShortDecimalMin,
+       -3,
+       0,
+       55,
+       DecimalUtil::kShortDecimalMax,
+       std::nullopt},
+      DECIMAL(18, 18));
+  testComplexCast(
+      "c0",
+      shortFlat,
+      makeNullableFlatVector<StringView>(
+          {"-0.999999999999999999",
+           "-0.000000000000000003",
+           "0",
+           "0.000000000000000055",
+           "0.999999999999999999",
+           std::nullopt}));
+
+  auto longFlat = makeNullableFlatVector<int128_t>(
+      {DecimalUtil::kLongDecimalMin,
+       0,
+       DecimalUtil::kLongDecimalMax,
+       HugeInt::build(0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFFFFFFFFFFull),
+       HugeInt::build(0xffff, 0xffffffffffffffff),
+       std::nullopt},
+      DECIMAL(38, 5));
+  testComplexCast(
+      "c0",
+      longFlat,
+      makeNullableFlatVector<StringView>(
+          {"-999999999999999999999999999999999.99999",
+           "0",
+           "999999999999999999999999999999999.99999",
+           "-0.00001",
+           "12089258196146291747.06175",
+           std::nullopt}));
+}
+
 TEST_F(CastExprTest, decimalToDecimal) {
   // short to short, scale up.
   auto shortFlat =