Add a CBORGenerator feature for lenient unicode encoding

guillaumebort · guillaumebort · commit df54c3640f2a · 2020-09-29T11:44:02.000+02:00
If enabled, the generator will output the Unicode Replacement Character
(U+FFFD) for invalid Unicode sequences (invalid surrogate chars in the
Java String) instead of failing with an IllegalArgumentException.
diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
 {
     private final static int[] NO_INTS = new int[0];
 
+    /**
+     * The replacement character to use to fix invalid unicode sequences.
+     */
+    final static int REPLACEMENT_CHAR = 0xfffd;
+
     /**
      * Let's ensure that we have big enough output buffer because of safety
      * margins we need for UTF-8 encoding.
@@ -61,7 +66,14 @@ public enum Feature implements FormatFeature {
          * Default value is <code>false</code> meaning that type tag will not be
          * written at the beginning of a new document.
          */
-        WRITE_TYPE_HEADER(false)
+        WRITE_TYPE_HEADER(false),
+
+        /**
+         * Feature that determines whether an invalid surrogate encoding found in the
+         * incoming String should fail with an exception or silently be output
+         * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD). Default value is <code>false</code>.
+         */
+        LENIENT_UTF_ENCODING(false),
 
         ;
 
@@ -138,6 +150,8 @@ public int getMask() {
 
     protected boolean _cfgMinimalInts;
 
+    protected boolean _cfgLenientUnicodeEncoding;
+
     /*
     /**********************************************************************
     /* Output state
@@ -231,6 +245,7 @@ public CBORGenerator(ObjectWriteContext writeCtxt, IOContext ctxt,
                 : null;
         _tokenWriteContext = CBORWriteContext.createRootContext(dups);
         _cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
+        _cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
         _out = out;
         _bufferRecyclable = true;
         _outputBuffer = ctxt.allocWriteEncodingBuffer(BYTE_BUFFER_FOR_OUTPUT);
@@ -357,6 +372,9 @@ public CBORGenerator enable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = true;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = true;
+        }
         return this;
     }
 
@@ -365,6 +383,9 @@ public CBORGenerator disable(Feature f) {
         if (f == Feature.WRITE_MINIMAL_INTS) {
             _cfgMinimalInts = false;
         }
+        if (f == Feature.LENIENT_UTF_ENCODING) {
+            _cfgLenientUnicodeEncoding = false;
+        }
         return this;
     }
 
@@ -1356,81 +1377,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
         do {
             int c = str[i];
             if (c > 0x7F) {
-                return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
+                return _encode2(i, outputPtr, str, end, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         } while (++i < end);
         return outputPtr - outputStart;
     }
 
-    /**
-     * Helper method called when the whole character sequence is known to fit in
-     * the output buffer, but not all characters are single-byte (ASCII)
-     * characters.
-     */
-    private final int _shortUTF8Encode2(char[] str, int i, int end,
-            int outputPtr, int outputStart) {
-        final byte[] outBuf = _outputBuffer;
-        while (i < end) {
-            int c = str[i++];
-            if (c <= 0x7F) {
-                outBuf[outputPtr++] = (byte) c;
-                continue;
-            }
-            // Nope, multi-byte:
-            if (c < 0x800) { // 2-byte
-                outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // 3 or 4 bytes (surrogate)
-            // Surrogates?
-            if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
-                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
-                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-                continue;
-            }
-            // Yup, a surrogate pair
-            if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
-            }
-            // ... meaning it must have a pair
-            if (i >= end) {
-                _throwIllegalSurrogate(c);
-            }
-            c = _convertSurrogate(c, str[i++]);
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
-            }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
-        }
-        return (outputPtr - outputStart);
-    }
-
     private final int _encode(int outputPtr, String str, int len) {
         final byte[] outBuf = _outputBuffer;
         final int outputStart = outputPtr;
 
         for (int i = 0; i < len; ++i) {
             int c = str.charAt(i);
             if (c > 0x7F) {
-                return _encode2(i, outputPtr, str, len, outputStart);
+                return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
             }
             outBuf[outputPtr++] = (byte) c;
         }
         return (outputPtr - outputStart);
     }
 
-    private final int _encode2(int i, int outputPtr, String str, int len,
+    private final int _encode2(int i, int outputPtr, char[] str, int len,
             int outputStart) {
         final byte[] outBuf = _outputBuffer;
         // no; non-ASCII stuff, slower loop
         while (i < len) {
-            int c = str.charAt(i++);
+            int c = str[i++];
             if (c <= 0x7F) {
                 outBuf[outputPtr++] = (byte) c;
                 continue;
@@ -1452,20 +1425,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
             }
             // Yup, a surrogate pair
             if (c > SURR1_LAST) { // must be from first range; second won't do
-                _throwIllegalSurrogate(c);
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
             // ... meaning it must have a pair
-            if (i >= len) {
-                _throwIllegalSurrogate(c);
+            else if (i >= len) {
+               if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogate(c);
+                }
             }
-            c = _convertSurrogate(c, str.charAt(i++));
-            if (c > 0x10FFFF) { // illegal in JSON as well as in XML
-                _throwIllegalSurrogate(c);
+            // ... verify that the next character is in range
+            else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
+                if (_cfgLenientUnicodeEncoding) {
+                    c = REPLACEMENT_CHAR;
+                } else {
+                    _throwIllegalSurrogatePair(c, str[i]);
+                }
+            }
+            // ... we have a valid surrogate pair
+            else {
+                c = _convertSurrogate(c, str[i++]);
+            }
+            // if we substituted the replacement char, we actually have a 3-byte char
+            if (c == REPLACEMENT_CHAR) {
+                outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
+            } else {
+                outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
+                outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
             }
-            outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
-            outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
         }
         return (outputPtr - outputStart);
     }
@@ -1474,16 +1470,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
      * Method called to calculate UTF codepoint, from a surrogate pair.
      */
     private int _convertSurrogate(int firstPart, int secondPart) {
-        // Ok, then, is the second part valid?
-        if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
-            throw new IllegalArgumentException(
+        int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
+                + (secondPart - SURR2_FIRST);
+        if (c > 0x10FFFF) { // illegal in JSON as well as in XML
+            if (_cfgLenientUnicodeEncoding) {
+                c = REPLACEMENT_CHAR;
+            } else {
+                _throwIllegalSurrogate(c);
+            }
+        }
+        return c;
+    }
+
+    private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
+         throw new IllegalArgumentException(
                     "Broken surrogate pair: first char 0x"
                             + Integer.toHexString(firstPart) + ", second 0x"
                             + Integer.toHexString(secondPart)
                             + "; illegal combination");
-        }
-        return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
-                + (secondPart - SURR2_FIRST);
     }
 
     private void _throwIllegalSurrogate(int code) {
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java
@@ -94,6 +94,14 @@ protected CBORGenerator cborGenerator(OutputStream result)
         return (CBORGenerator) CBORMapper.shared().createGenerator(result);
     }
 
+    protected CBORGenerator lenientUnicodeCborGenerator(OutputStream result)
+        throws IOException
+    {
+        CBORGenerator gen = cborGenerator(result);
+        gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
+        return gen;
+    }
+
     /*
     /**********************************************************
     /* Doc conversion
diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/gen/UnicodeGenerationTest.java
@@ -0,0 +1,114 @@
+
+import java.io.*;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.*;
+
+import org.junit.Assert;
+
+import com.fasterxml.jackson.core.JsonGenerationException;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
+import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
+import com.fasterxml.jackson.dataformat.cbor.CBORParser;
+import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
+
+public class UnicodeGenerationTest extends CBORTestBase
+{   
+    /**
+     * Test that, without the lenient feature, encoding a String containing invalid
+     * surrogates fails with an IllegalArgumentException and buffers no output.
+     */
+    public void testFailForInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out = new ByteArrayOutputStream();
+        CBORGenerator gen = cborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched first surrogate character
+        try {
+            gen.writeString("x\ud83d");
+            fail("Expected IllegalArgumentException for unmatched first surrogate");
+        } catch (IllegalArgumentException expected) { /* rejected, as required */ }
+        assertEquals(0, gen.getOutputBuffered()); // failed write must leave nothing buffered
+
+        // Unmatched second surrogate character
+        try {
+            gen.writeString("x\ude01");
+            fail("Expected IllegalArgumentException for unmatched second surrogate");
+        } catch (IllegalArgumentException expected) { /* rejected, as required */ }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Unmatched second surrogate character (2): followed by a regular character
+        try {
+            gen.writeString("x\ude01x");
+            fail("Expected IllegalArgumentException for unmatched second surrogate before 'x'");
+        } catch (IllegalArgumentException expected) { /* rejected, as required */ }
+        assertEquals(0, gen.getOutputBuffered());
+
+        // Broken surrogate pair: first surrogate followed by a non-surrogate character
+        try {
+            gen.writeString("x\ud83dx");
+            fail("Expected IllegalArgumentException for broken surrogate pair");
+        } catch (IllegalArgumentException expected) { /* rejected, as required */ }
+        assertEquals(0, gen.getOutputBuffered());
+    }
+
+    /**
+     * Test that when the lenient Unicode feature is enabled, each invalid surrogate is written as U+FFFD instead of failing
+     */
+    public void testRecoverInvalidSurrogate() throws Exception
+    {
+        ByteArrayOutputStream out;
+        CBORGenerator gen;
+        byte[] b;
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched first surrogate character at end of input: expect "x" + U+FFFD
+        gen.writeString("x\ud83d");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched second surrogate character at end of input: expect "x" + U+FFFD
+        gen.writeString("x\ude01");
+        gen.close();
+        b = "x\ufffd".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Unmatched second surrogate character followed by 'x': expect "x" + U+FFFD + "x"
+        gen.writeString("x\ude01x");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+        out = new ByteArrayOutputStream();
+        gen = lenientUnicodeCborGenerator(out);
+        assertEquals(0, gen.getOutputBuffered());
+    
+        // Broken surrogate pair (first surrogate, then 'x'): the 'x' is kept, expect "x" + U+FFFD + "x"
+        gen.writeString("x\ud83dx");
+        gen.close();
+        b = "x\ufffdx".getBytes("utf-8");
+        _verifyBytes(out.toByteArray(),
+                (byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
+
+    }
+
+}