@@ -22,6 +22,11 @@ public class CBORGenerator extends GeneratorBase
2222{
2323 private final static int [] NO_INTS = new int [0 ];
2424
25+ /**
26+ * The replacement character (U+FFFD) written in place of invalid Unicode sequences.
27+ */
28+ final static int REPLACEMENT_CHAR = 0xfffd ;
29+
2530 /**
2631 * Let's ensure that we have big enough output buffer because of safety
2732 * margins we need for UTF-8 encoding.
@@ -61,7 +66,14 @@ public enum Feature implements FormatFeature {
6166 * Default value is <code>false</code> meaning that type tag will not be
6267 * written at the beginning of a new document.
6368 */
64- WRITE_TYPE_HEADER (false )
69+ WRITE_TYPE_HEADER (false ),
70+
71+ /**
72+ * Feature that determines whether an invalid surrogate encoding found in the
73+ * incoming String should fail with an exception or be silently output
74+ * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD).
75+ */
76+ LENIENT_UTF_ENCODING (false ),
6577
6678 ;
6779
@@ -138,6 +150,8 @@ public int getMask() {
138150
139151 protected boolean _cfgMinimalInts ;
140152
153+ protected boolean _cfgLenientUnicodeEncoding ;
154+
141155 /*
142156 /**********************************************************************
143157 /* Output state
@@ -231,6 +245,7 @@ public CBORGenerator(ObjectWriteContext writeCtxt, IOContext ctxt,
231245 : null ;
232246 _tokenWriteContext = CBORWriteContext .createRootContext (dups );
233247 _cfgMinimalInts = Feature .WRITE_MINIMAL_INTS .enabledIn (formatFeatures );
248+ _cfgLenientUnicodeEncoding = Feature .LENIENT_UTF_ENCODING .enabledIn (formatFeatures );
234249 _out = out ;
235250 _bufferRecyclable = true ;
236251 _outputBuffer = ctxt .allocWriteEncodingBuffer (BYTE_BUFFER_FOR_OUTPUT );
@@ -357,6 +372,9 @@ public CBORGenerator enable(Feature f) {
357372 if (f == Feature .WRITE_MINIMAL_INTS ) {
358373 _cfgMinimalInts = true ;
359374 }
375+ if (f == Feature .LENIENT_UTF_ENCODING ) {
376+ _cfgLenientUnicodeEncoding = true ;
377+ }
360378 return this ;
361379 }
362380
@@ -365,6 +383,9 @@ public CBORGenerator disable(Feature f) {
365383 if (f == Feature .WRITE_MINIMAL_INTS ) {
366384 _cfgMinimalInts = false ;
367385 }
386+ if (f == Feature .LENIENT_UTF_ENCODING ) {
387+ _cfgLenientUnicodeEncoding = false ;
388+ }
368389 return this ;
369390 }
370391
@@ -1356,81 +1377,33 @@ private final int _encode(int outputPtr, char[] str, int i, int end) {
13561377 do {
13571378 int c = str [i ];
13581379 if (c > 0x7F ) {
1359- return _shortUTF8Encode2 ( str , i , end , outputPtr , outputStart );
1380+ return _encode2 ( i , outputPtr , str , end , outputStart );
13601381 }
13611382 outBuf [outputPtr ++] = (byte ) c ;
13621383 } while (++i < end );
13631384 return outputPtr - outputStart ;
13641385 }
13651386
1366- /**
1367- * Helper method called when the whole character sequence is known to fit in
1368- * the output buffer, but not all characters are single-byte (ASCII)
1369- * characters.
1370- */
1371- private final int _shortUTF8Encode2 (char [] str , int i , int end ,
1372- int outputPtr , int outputStart ) {
1373- final byte [] outBuf = _outputBuffer ;
1374- while (i < end ) {
1375- int c = str [i ++];
1376- if (c <= 0x7F ) {
1377- outBuf [outputPtr ++] = (byte ) c ;
1378- continue ;
1379- }
1380- // Nope, multi-byte:
1381- if (c < 0x800 ) { // 2-byte
1382- outBuf [outputPtr ++] = (byte ) (0xc0 | (c >> 6 ));
1383- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1384- continue ;
1385- }
1386- // 3 or 4 bytes (surrogate)
1387- // Surrogates?
1388- if (c < SURR1_FIRST || c > SURR2_LAST ) { // nope, regular 3-byte character
1389- outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1390- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1391- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1392- continue ;
1393- }
1394- // Yup, a surrogate pair
1395- if (c > SURR1_LAST ) { // must be from first range; second won't do
1396- _throwIllegalSurrogate (c );
1397- }
1398- // ... meaning it must have a pair
1399- if (i >= end ) {
1400- _throwIllegalSurrogate (c );
1401- }
1402- c = _convertSurrogate (c , str [i ++]);
1403- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1404- _throwIllegalSurrogate (c );
1405- }
1406- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1407- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1408- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1409- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1410- }
1411- return (outputPtr - outputStart );
1412- }
1413-
14141387 private final int _encode (int outputPtr , String str , int len ) {
14151388 final byte [] outBuf = _outputBuffer ;
14161389 final int outputStart = outputPtr ;
14171390
14181391 for (int i = 0 ; i < len ; ++i ) {
14191392 int c = str .charAt (i );
14201393 if (c > 0x7F ) {
1421- return _encode2 (i , outputPtr , str , len , outputStart );
1394+ return _encode2 (i , outputPtr , str . toCharArray () , len , outputStart );
14221395 }
14231396 outBuf [outputPtr ++] = (byte ) c ;
14241397 }
14251398 return (outputPtr - outputStart );
14261399 }
14271400
1428- private final int _encode2 (int i , int outputPtr , String str , int len ,
1401+ private final int _encode2 (int i , int outputPtr , char [] str , int len ,
14291402 int outputStart ) {
14301403 final byte [] outBuf = _outputBuffer ;
14311404 // no; non-ASCII stuff, slower loop
14321405 while (i < len ) {
1433- int c = str . charAt ( i ++) ;
1406+ int c = str [ i ++] ;
14341407 if (c <= 0x7F ) {
14351408 outBuf [outputPtr ++] = (byte ) c ;
14361409 continue ;
@@ -1452,20 +1425,43 @@ private final int _encode2(int i, int outputPtr, String str, int len,
14521425 }
14531426 // Yup, a surrogate pair
14541427 if (c > SURR1_LAST ) { // must be from first range; second won't do
1455- _throwIllegalSurrogate (c );
1428+ if (_cfgLenientUnicodeEncoding ) {
1429+ c = REPLACEMENT_CHAR ;
1430+ } else {
1431+ _throwIllegalSurrogate (c );
1432+ }
14561433 }
14571434 // ... meaning it must have a pair
1458- if (i >= len ) {
1459- _throwIllegalSurrogate (c );
1435+ else if (i >= len ) {
1436+ if (_cfgLenientUnicodeEncoding ) {
1437+ c = REPLACEMENT_CHAR ;
1438+ } else {
1439+ _throwIllegalSurrogate (c );
1440+ }
14601441 }
1461- c = _convertSurrogate (c , str .charAt (i ++));
1462- if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1463- _throwIllegalSurrogate (c );
1442+ // ... verify that the next character is in range
1443+ else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1444+ if (_cfgLenientUnicodeEncoding ) {
1445+ c = REPLACEMENT_CHAR ;
1446+ } else {
1447+ _throwIllegalSurrogatePair (c , str [i ]);
1448+ }
1449+ }
1450+ // ... we have a valid surrogate pair
1451+ else {
1452+ c = _convertSurrogate (c , str [i ++]);
1453+ }
1454+ // if we substituted the replacement char, we actually have a 3-byte char
1455+ if (c == REPLACEMENT_CHAR ) {
1456+ outBuf [outputPtr ++] = (byte ) (0xe0 | (c >> 12 ));
1457+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1458+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
1459+ } else {
1460+ outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1461+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1462+ outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1463+ outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
14641464 }
1465- outBuf [outputPtr ++] = (byte ) (0xf0 | (c >> 18 ));
1466- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 12 ) & 0x3f ));
1467- outBuf [outputPtr ++] = (byte ) (0x80 | ((c >> 6 ) & 0x3f ));
1468- outBuf [outputPtr ++] = (byte ) (0x80 | (c & 0x3f ));
14691465 }
14701466 return (outputPtr - outputStart );
14711467 }
@@ -1474,16 +1470,24 @@ private final int _encode2(int i, int outputPtr, String str, int len,
14741470 * Method called to calculate UTF codepoint, from a surrogate pair.
14751471 */
14761472 private int _convertSurrogate (int firstPart , int secondPart ) {
1477- // Ok, then, is the second part valid?
1478- if (secondPart < SURR2_FIRST || secondPart > SURR2_LAST ) {
1479- throw new IllegalArgumentException (
1473+ int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1474+ + (secondPart - SURR2_FIRST );
1475+ if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1476+ if (_cfgLenientUnicodeEncoding ) {
1477+ c = REPLACEMENT_CHAR ;
1478+ } else {
1479+ _throwIllegalSurrogate (c );
1480+ }
1481+ }
1482+ return c ;
1483+ }
1484+
1485+ private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1486+ throw new IllegalArgumentException (
14801487 "Broken surrogate pair: first char 0x"
14811488 + Integer .toHexString (firstPart ) + ", second 0x"
14821489 + Integer .toHexString (secondPart )
14831490 + "; illegal combination" );
1484- }
1485- return 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
1486- + (secondPart - SURR2_FIRST );
14871491 }
14881492
14891493 private void _throwIllegalSurrogate (int code ) {
0 commit comments