Skip to content

Commit 03482f4

Browse files
committed
Add a CBORGenerator feature for lenient unicode encoding
If enabled, the generator outputs the Unicode replacement character (U+FFFD) for each invalid Unicode sequence (an invalid surrogate char in the Java String) instead of failing with an IllegalArgumentException.
1 parent f5853dc commit 03482f4

File tree

3 files changed

+195
-69
lines changed

3 files changed

+195
-69
lines changed

cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORGenerator.java

Lines changed: 73 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
2222
{
2323
private final static int[] NO_INTS = new int[0];
2424

25+
/**
26+
* The replacement character to use to fix invalid unicode sequences.
27+
*/
28+
final static int REPLACEMENT_CHAR = 0xfffd;
29+
2530
/**
2631
* Let's ensure that we have big enough output buffer because of safety
2732
* margins we need for UTF-8 encoding.
@@ -63,7 +68,14 @@ public enum Feature implements FormatFeature {
6368
*
6469
* @since 2.5
6570
*/
66-
WRITE_TYPE_HEADER(false)
71+
WRITE_TYPE_HEADER(false),
72+
73+
/**
74+
* Feature that determines if an invalid surrogate encoding found in the
75+
* incoming String should fail with an exception or silently be output
76+
* as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD)
77+
*/
78+
LENIENT_UTF_ENCODING(false),
6779

6880
;
6981

@@ -140,6 +152,8 @@ public int getMask() {
140152

141153
protected boolean _cfgMinimalInts;
142154

155+
protected boolean _cfgLenientUnicodeEncoding;
156+
143157
/*
144158
/**********************************************************
145159
/* Output state
@@ -234,6 +248,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
234248
_cborContext = CBORWriteContext.createRootContext(dups);
235249
_formatFeatures = formatFeatures;
236250
_cfgMinimalInts = Feature.WRITE_MINIMAL_INTS.enabledIn(formatFeatures);
251+
_cfgLenientUnicodeEncoding = Feature.LENIENT_UTF_ENCODING.enabledIn(formatFeatures);
237252
_ioContext = ctxt;
238253
_out = out;
239254
_bufferRecyclable = true;
@@ -406,6 +421,9 @@ public CBORGenerator enable(Feature f) {
406421
if (f == Feature.WRITE_MINIMAL_INTS) {
407422
_cfgMinimalInts = true;
408423
}
424+
if (f == Feature.LENIENT_UTF_ENCODING) {
425+
_cfgLenientUnicodeEncoding = true;
426+
}
409427
return this;
410428
}
411429

@@ -414,6 +432,9 @@ public CBORGenerator disable(Feature f) {
414432
if (f == Feature.WRITE_MINIMAL_INTS) {
415433
_cfgMinimalInts = false;
416434
}
435+
if (f == Feature.LENIENT_UTF_ENCODING) {
436+
_cfgLenientUnicodeEncoding = false;
437+
}
417438
return this;
418439
}
419440

@@ -1424,81 +1445,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
14241445
do {
14251446
int c = str[i];
14261447
if (c > 0x7F) {
1427-
return _shortUTF8Encode2(str, i, end, outputPtr, outputStart);
1448+
return _encode2(i, outputPtr, str, end, outputStart);
14281449
}
14291450
outBuf[outputPtr++] = (byte) c;
14301451
} while (++i < end);
14311452
return outputPtr - outputStart;
14321453
}
14331454

1434-
/**
1435-
* Helper method called when the whole character sequence is known to fit in
1436-
* the output buffer, but not all characters are single-byte (ASCII)
1437-
* characters.
1438-
*/
1439-
private final int _shortUTF8Encode2(char[] str, int i, int end,
1440-
int outputPtr, int outputStart) {
1441-
final byte[] outBuf = _outputBuffer;
1442-
while (i < end) {
1443-
int c = str[i++];
1444-
if (c <= 0x7F) {
1445-
outBuf[outputPtr++] = (byte) c;
1446-
continue;
1447-
}
1448-
// Nope, multi-byte:
1449-
if (c < 0x800) { // 2-byte
1450-
outBuf[outputPtr++] = (byte) (0xc0 | (c >> 6));
1451-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1452-
continue;
1453-
}
1454-
// 3 or 4 bytes (surrogate)
1455-
// Surrogates?
1456-
if (c < SURR1_FIRST || c > SURR2_LAST) { // nope, regular 3-byte character
1457-
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
1458-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1459-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1460-
continue;
1461-
}
1462-
// Yup, a surrogate pair
1463-
if (c > SURR1_LAST) { // must be from first range; second won't do
1464-
_throwIllegalSurrogate(c);
1465-
}
1466-
// ... meaning it must have a pair
1467-
if (i >= end) {
1468-
_throwIllegalSurrogate(c);
1469-
}
1470-
c = _convertSurrogate(c, str[i++]);
1471-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1472-
_throwIllegalSurrogate(c);
1473-
}
1474-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1475-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1476-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1477-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1478-
}
1479-
return (outputPtr - outputStart);
1480-
}
1481-
14821455
private final int _encode(int outputPtr, String str, int len) {
14831456
final byte[] outBuf = _outputBuffer;
14841457
final int outputStart = outputPtr;
14851458

14861459
for (int i = 0; i < len; ++i) {
14871460
int c = str.charAt(i);
14881461
if (c > 0x7F) {
1489-
return _encode2(i, outputPtr, str, len, outputStart);
1462+
return _encode2(i, outputPtr, str.toCharArray(), len, outputStart);
14901463
}
14911464
outBuf[outputPtr++] = (byte) c;
14921465
}
14931466
return (outputPtr - outputStart);
14941467
}
14951468

1496-
private final int _encode2(int i, int outputPtr, String str, int len,
1469+
private final int _encode2(int i, int outputPtr, char[] str, int len,
14971470
int outputStart) {
14981471
final byte[] outBuf = _outputBuffer;
14991472
// no; non-ASCII stuff, slower loop
15001473
while (i < len) {
1501-
int c = str.charAt(i++);
1474+
int c = str[i++];
15021475
if (c <= 0x7F) {
15031476
outBuf[outputPtr++] = (byte) c;
15041477
continue;
@@ -1520,20 +1493,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15201493
}
15211494
// Yup, a surrogate pair
15221495
if (c > SURR1_LAST) { // must be from first range; second won't do
1523-
_throwIllegalSurrogate(c);
1496+
if (_cfgLenientUnicodeEncoding) {
1497+
c = REPLACEMENT_CHAR;
1498+
} else {
1499+
_throwIllegalSurrogate(c);
1500+
}
15241501
}
15251502
// ... meaning it must have a pair
1526-
if (i >= len) {
1527-
_throwIllegalSurrogate(c);
1503+
else if (i >= len) {
1504+
if (_cfgLenientUnicodeEncoding) {
1505+
c = REPLACEMENT_CHAR;
1506+
} else {
1507+
_throwIllegalSurrogate(c);
1508+
}
15281509
}
1529-
c = _convertSurrogate(c, str.charAt(i++));
1530-
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1531-
_throwIllegalSurrogate(c);
1510+
// ... verify that the next character is in range
1511+
else if (str[i] < SURR2_FIRST || str[i] > SURR2_LAST) {
1512+
if (_cfgLenientUnicodeEncoding) {
1513+
c = REPLACEMENT_CHAR;
1514+
} else {
1515+
_throwIllegalSurrogatePair(c, str[i]);
1516+
}
1517+
}
1518+
// ... we have a valid surrogate pair
1519+
else {
1520+
c = _convertSurrogate(c, str[i++]);
1521+
}
1522+
// if we substituted the replacement char, we actually have a 3-byte char
1523+
if (c == REPLACEMENT_CHAR) {
1524+
outBuf[outputPtr++] = (byte) (0xe0 | (c >> 12));
1525+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1526+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
1527+
} else {
1528+
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1529+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1530+
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1531+
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
15321532
}
1533-
outBuf[outputPtr++] = (byte) (0xf0 | (c >> 18));
1534-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 12) & 0x3f));
1535-
outBuf[outputPtr++] = (byte) (0x80 | ((c >> 6) & 0x3f));
1536-
outBuf[outputPtr++] = (byte) (0x80 | (c & 0x3f));
15371533
}
15381534
return (outputPtr - outputStart);
15391535
}
@@ -1542,16 +1538,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15421538
* Method called to calculate UTF codepoint, from a surrogate pair.
15431539
*/
15441540
private int _convertSurrogate(int firstPart, int secondPart) {
1545-
// Ok, then, is the second part valid?
1546-
if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST) {
1547-
throw new IllegalArgumentException(
1541+
int c = 0x10000 + ((firstPart - SURR1_FIRST) << 10)
1542+
+ (secondPart - SURR2_FIRST);
1543+
if (c > 0x10FFFF) { // illegal in JSON as well as in XML
1544+
if (_cfgLenientUnicodeEncoding) {
1545+
c = REPLACEMENT_CHAR;
1546+
} else {
1547+
_throwIllegalSurrogate(c);
1548+
}
1549+
}
1550+
return c;
1551+
}
1552+
1553+
private void _throwIllegalSurrogatePair(int firstPart, int secondPart) {
1554+
throw new IllegalArgumentException(
15481555
"Broken surrogate pair: first char 0x"
15491556
+ Integer.toHexString(firstPart) + ", second 0x"
15501557
+ Integer.toHexString(secondPart)
15511558
+ "; illegal combination");
1552-
}
1553-
return 0x10000 + ((firstPart - SURR1_FIRST) << 10)
1554-
+ (secondPart - SURR2_FIRST);
15551559
}
15561560

15571561
private void _throwIllegalSurrogate(int code) {

cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/CBORTestBase.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,14 @@ protected CBORGenerator cborGenerator(CBORFactory f,
8585
return f.createGenerator(result, null);
8686
}
8787

88+
protected CBORGenerator lenientUnicodeCborGenerator(OutputStream result)
89+
throws IOException
90+
{
91+
CBORGenerator gen = cborGenerator(result);
92+
gen.enable(CBORGenerator.Feature.LENIENT_UTF_ENCODING);
93+
return gen;
94+
}
95+
8896
/*
8997
/**********************************************************
9098
/* Additional assertion methods
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
2+
import java.io.*;
3+
import java.math.BigDecimal;
4+
import java.math.BigInteger;
5+
import java.util.*;
6+
7+
import org.junit.Assert;
8+
9+
import com.fasterxml.jackson.core.JsonGenerationException;
10+
11+
import com.fasterxml.jackson.databind.ObjectMapper;
12+
13+
import com.fasterxml.jackson.dataformat.cbor.CBORConstants;
14+
import com.fasterxml.jackson.dataformat.cbor.CBORGenerator;
15+
import com.fasterxml.jackson.dataformat.cbor.CBORParser;
16+
import com.fasterxml.jackson.dataformat.cbor.CBORTestBase;
17+
18+
public class UnicodeGenerationTest extends CBORTestBase
19+
{
20+
/**
21+
* Test that encoding a String containing invalid surrogates fails with an exception
22+
*/
23+
public void testFailForInvalidSurrogate() throws Exception
24+
{
25+
ByteArrayOutputStream out = new ByteArrayOutputStream();
26+
CBORGenerator gen = cborGenerator(out);
27+
28+
assertEquals(0, gen.getOutputBuffered());
29+
30+
// Unmatched first surrogate character
31+
try {
32+
gen.writeString("x\ud83d");
33+
} catch (IllegalArgumentException e) {
34+
}
35+
assertEquals(0, gen.getOutputBuffered());
36+
37+
// Unmatched second surrogate character
38+
try {
39+
gen.writeString("x\ude01");
40+
} catch (IllegalArgumentException e) {
41+
}
42+
assertEquals(0, gen.getOutputBuffered());
43+
44+
// Unmatched second surrogate character (2)
45+
try {
46+
gen.writeString("x\ude01x");
47+
} catch (IllegalArgumentException e) {
48+
}
49+
assertEquals(0, gen.getOutputBuffered());
50+
51+
// Broken surrogate pair
52+
try {
53+
gen.writeString("x\ud83dx");
54+
} catch (IllegalArgumentException e) {
55+
}
56+
assertEquals(0, gen.getOutputBuffered());
57+
}
58+
59+
/**
60+
* Test that when the lenient unicode feature is enabled, the replacement character is used to fix invalid sequences
61+
*/
62+
public void testRecoverInvalidSurrogate() throws Exception
63+
{
64+
ByteArrayOutputStream out;
65+
CBORGenerator gen;
66+
byte[] b;
67+
68+
out = new ByteArrayOutputStream();
69+
gen = lenientUnicodeCborGenerator(out);
70+
assertEquals(0, gen.getOutputBuffered());
71+
72+
// Unmatched first surrogate character
73+
gen.writeString("x\ud83d");
74+
gen.close();
75+
b = "x\ufffd".getBytes("utf-8");
76+
_verifyBytes(out.toByteArray(),
77+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
78+
79+
out = new ByteArrayOutputStream();
80+
gen = lenientUnicodeCborGenerator(out);
81+
assertEquals(0, gen.getOutputBuffered());
82+
83+
// Unmatched second surrogate character
84+
gen.writeString("x\ude01");
85+
gen.close();
86+
b = "x\ufffd".getBytes("utf-8");
87+
_verifyBytes(out.toByteArray(),
88+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
89+
90+
out = new ByteArrayOutputStream();
91+
gen = lenientUnicodeCborGenerator(out);
92+
assertEquals(0, gen.getOutputBuffered());
93+
94+
// Unmatched second surrogate character (2)
95+
gen.writeString("x\ude01x");
96+
gen.close();
97+
b = "x\ufffdx".getBytes("utf-8");
98+
_verifyBytes(out.toByteArray(),
99+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
100+
101+
out = new ByteArrayOutputStream();
102+
gen = lenientUnicodeCborGenerator(out);
103+
assertEquals(0, gen.getOutputBuffered());
104+
105+
// Broken surrogate pair
106+
gen.writeString("x\ud83dx");
107+
gen.close();
108+
b = "x\ufffdx".getBytes("utf-8");
109+
_verifyBytes(out.toByteArray(),
110+
(byte) (CBORConstants.PREFIX_TYPE_TEXT + b.length), b);
111+
112+
}
113+
114+
}

0 commit comments

Comments
 (0)