@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
2222{
2323 private final static int [] NO_INTS = new int [0 ];
2424
25+ /**
26+ * The replacement character (U+FFFD) used in place of invalid Unicode surrogate sequences.
27+ */
28+ final static int REPLACEMENT_CHAR = 0xfffd ;
29+
2530 /**
2631 * Let's ensure that we have big enough output buffer because of safety
2732 * margins we need for UTF-8 encoding.
@@ -63,7 +68,14 @@ public enum Feature implements FormatFeature {
6368 *
6469 * @since 2.5
6570 */
66- WRITE_TYPE_HEADER (false )
71+ WRITE_TYPE_HEADER (false ),
72+
73+ /**
74+ * Feature that determines whether an invalid surrogate encoding found in the
75+ * incoming String should fail with an exception or be silently written out
76+ * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD).
77+ */
78+ LENIENT_UTF_ENCODING (false ),
6779
6880 ;
6981
@@ -140,6 +152,8 @@ public int getMask() {
140152
141153 protected boolean _cfgMinimalInts ;
142154
155+ protected boolean _cfgLenientUnicodeEncoding ;
156+
143157 /*
144158 /**********************************************************
145159 /* Output state
@@ -234,6 +248,7 @@ public CBORGenerator(IOContext ctxt, int stdFeatures, int formatFeatures,
234248 _cborContext = CBORWriteContext .createRootContext (dups );
235249 _formatFeatures = formatFeatures ;
236250 _cfgMinimalInts = Feature .WRITE_MINIMAL_INTS .enabledIn (formatFeatures );
251+ _cfgLenientUnicodeEncoding = Feature .LENIENT_UTF_ENCODING .enabledIn (formatFeatures );
237252 _ioContext = ctxt ;
238253 _out = out ;
239254 _bufferRecyclable = true ;
@@ -406,6 +421,9 @@ public CBORGenerator enable(Feature f) {
406421 if (f == Feature .WRITE_MINIMAL_INTS ) {
407422 _cfgMinimalInts = true ;
408423 }
424+ if (f == Feature .LENIENT_UTF_ENCODING ) {
425+ _cfgLenientUnicodeEncoding = true ;
426+ }
409427 return this ;
410428 }
411429
@@ -414,6 +432,9 @@ public CBORGenerator disable(Feature f) {
414432 if (f == Feature .WRITE_MINIMAL_INTS ) {
415433 _cfgMinimalInts = false ;
416434 }
435+ if (f == Feature .LENIENT_UTF_ENCODING ) {
436+ _cfgLenientUnicodeEncoding = false ;
437+ }
417438 return this ;
418439 }
419440
@@ -1424,81 +1445,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
14241445 do {
14251446 int c = str [i ];
14261447 if (c > 0x7F ) {
1427- return _shortUTF8Encode2 ( str , i , end , outputPtr , outputStart );
1448+ return _encode2 ( i , outputPtr , str , end , outputStart );
14281449 }
14291450 outBuf [outputPtr ++] = (byte ) c ;
14301451 } while (++i < end );
14311452 return outputPtr - outputStart ;
14321453 }
14331454
1434- /**
1435- * Helper method called when the whole character sequence is known to fit in
1436- * the output buffer, but not all characters are single-byte (ASCII)
1437- * characters.
1438- */
1439- private final int _shortUTF8Encode2 (char [] str , int i , int end ,
1440- int outputPtr , int outputStart ) {
1441- final byte [] outBuf = _outputBuffer ;
1442- while (i < end ) {
1443- int c = str [i ++];
1444- if (c <= 0x7F ) {
1445- outBuf [outputPtr ++] = (byte ) c ;
1446- continue ;
1447- }
1448- // Nope, multi-byte:
1449- if (c < 0x800 ) { // 2-byte
1450- outBuf [outputPtr ++] = (byte ) (0xc0 | (c >> 6 ));
1451- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1452- continue ;
1453- }
1454- // 3 or 4 bytes (surrogate)
1455- // Surrogates?
1456- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte character
1457- outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1458- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1459- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1460- continue ;
1461- }
1462- // Yup, a surrogate pair
1463- if (c > SURR1_LAST ) { // must be from first range; second won't do
1464- _throwIllegalSurrogate (c );
1465- }
1466- // ... meaning it must have a pair
1467- if (i >= end ) {
1468- _throwIllegalSurrogate (c );
1469- }
1470- c = _convertSurrogate (c , str [i ++]);
1471- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1472- _throwIllegalSurrogate (c );
1473- }
1474- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1475- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1476- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1477- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1478- }
1479- return (outputPtr - outputStart );
1480- }
1481-
14821455 private final int _encode (int outputPtr , String str , int len ) {
14831456 final byte [] outBuf = _outputBuffer ;
14841457 final int outputStart = outputPtr ;
14851458
14861459 for (int i = 0 ; i < len ; ++i ) {
14871460 int c = str .charAt (i );
14881461 if (c > 0x7F ) {
1489- return _encode2 (i , outputPtr , str , len , outputStart );
1462+ return _encode2 (i , outputPtr , str . toCharArray () , len , outputStart );
14901463 }
14911464 outBuf [outputPtr ++] = (byte ) c ;
14921465 }
14931466 return (outputPtr - outputStart );
14941467 }
14951468
1496- private final int _encode2 (int i , int outputPtr , String str , int len ,
1469+ private final int _encode2 (int i , int outputPtr , char [] str , int len ,
14971470 int outputStart ) {
14981471 final byte [] outBuf = _outputBuffer ;
14991472 // no; non-ASCII stuff, slower loop
15001473 while (i < len ) {
1501- int c = str . charAt ( i ++) ;
1474+ int c = str [ i ++] ;
15021475 if (c <= 0x7F ) {
15031476 outBuf [outputPtr ++] = (byte ) c ;
15041477 continue ;
@@ -1520,20 +1493,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15201493 }
15211494 // Yup, a surrogate pair
15221495 if (c > SURR1_LAST ) { // must be from first range; second won't do
1523- _throwIllegalSurrogate (c );
1496+ if (_cfgLenientUnicodeEncoding ) {
1497+ c = REPLACEMENT_CHAR ;
1498+ } else {
1499+ _throwIllegalSurrogate (c );
1500+ }
15241501 }
15251502 // ... meaning it must have a pair
1526- if (i >= len ) {
1527- _throwIllegalSurrogate (c );
1503+ else if (i >= len ) {
1504+ if (_cfgLenientUnicodeEncoding ) {
1505+ c = REPLACEMENT_CHAR ;
1506+ } else {
1507+ _throwIllegalSurrogate (c );
1508+ }
15281509 }
1529- c = _convertSurrogate (c , str .charAt (i ++));
1530- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1531- _throwIllegalSurrogate (c );
1510+ // ... verify that the next character is in range
1511+ else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1512+ if (_cfgLenientUnicodeEncoding ) {
1513+ c = REPLACEMENT_CHAR ;
1514+ } else {
1515+ _throwIllegalSurrogatePair (c , str [i ]);
1516+ }
1517+ }
1518+ // ... we have a valid surrogate pair
1519+ else {
1520+ c = _convertSurrogate (c , str [i ++]);
1521+ }
1522+ // if the character was substituted with the replacement char, emit it as a 3-byte sequence
1523+ if (c == REPLACEMENT_CHAR ) {
1524+ outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1525+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1526+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1527+ } else {
1528+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1529+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1530+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1531+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
15321532 }
1533- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1534- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1535- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1536- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
15371533 }
15381534 return (outputPtr - outputStart );
15391535 }
@@ -1542,16 +1538,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
15421538 * Method called to calculate the Unicode code point from a surrogate pair.
15431539 */
15441540 private int _convertSurrogate (int firstPart , int secondPart ) {
1545- // Ok, then, is the second part valid?
1546- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
1547- throw new IllegalArgumentException (
1541+ int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1542+ + (secondPart - SURR2_FIRST );
1543+ if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1544+ if (_cfgLenientUnicodeEncoding ) {
1545+ c = REPLACEMENT_CHAR ;
1546+ } else {
1547+ _throwIllegalSurrogate (c );
1548+ }
1549+ }
1550+ return c ;
1551+ }
1552+
1553+ private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1554+ throw new IllegalArgumentException (
15481555 "Broken surrogate pair: first char 0x"
15491556 + Integer .toHexString (firstPart ) + ", second 0x"
15501557 + Integer .toHexString (secondPart )
15511558 + "; illegal combination" );
1552- }
1553- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1554- + (secondPart - SURR2_FIRST );
15551559 }
15561560
15571561 private void _throwIllegalSurrogate (int code ) {
0 commit comments