@@ -74,6 +74,8 @@ public enum Feature implements FormatFeature {
7474 * Feature that determines if an invalid surrogate encoding found in the
7575 * incoming String should fail with an exception or silently be output
7676 * as the Unicode 'REPLACEMENT CHARACTER' (U+FFFD).
77+ *
78+ * @since 2.12
7779 */
7880 LENIENT_UTF_ENCODING (false ),
7981
@@ -152,6 +154,11 @@ public int getMask() {
152154
153155 protected boolean _cfgMinimalInts ;
154156
157+
158+ /**
159+ * If true, we will output the REPLACEMENT_CHAR for invalid Unicode sequences.
160+ * If false, we will throw an IllegalArgumentException for invalid Unicode sequences.
161+ */
155162 protected boolean _cfgLenientUnicodeEncoding ;
156163
157164 /*
@@ -1493,27 +1500,15 @@ private final int _encode2(int i, int outputPtr, char[] str, int len,
14931500 }
14941501 // Yup, a surrogate pair
14951502 if (c > SURR1_LAST ) { // must be from first range; second won't do
1496- if (_cfgLenientUnicodeEncoding ) {
1497- c = REPLACEMENT_CHAR ;
1498- } else {
1499- _throwIllegalSurrogate (c );
1500- }
1503+ c = _illegalSurrogateFound (c );
15011504 }
15021505 // ... meaning it must have a pair
15031506 else if (i >= len ) {
1504- if (_cfgLenientUnicodeEncoding ) {
1505- c = REPLACEMENT_CHAR ;
1506- } else {
1507- _throwIllegalSurrogate (c );
1508- }
1507+ c = _illegalSurrogateFound (c );
15091508 }
15101509 // ... verify that the next character is in range
15111510 else if (str [i ] < SURR2_FIRST || str [i ] > SURR2_LAST ) {
1512- if (_cfgLenientUnicodeEncoding ) {
1513- c = REPLACEMENT_CHAR ;
1514- } else {
1515- _throwIllegalSurrogatePair (c , str [i ]);
1516- }
1511+ c = _illegalSurrogatePairFound (c , str [i ]);
15171512 }
15181513 // ... we have a valid surrogate pair
15191514 else {
@@ -1541,43 +1536,47 @@ private int _convertSurrogate(int firstPart, int secondPart) {
15411536 int c = 0x10000 + ((firstPart - SURR1_FIRST ) << 10 )
15421537 + (secondPart - SURR2_FIRST );
15431538 if (c > 0x10FFFF ) { // illegal in JSON as well as in XML
1544- if (_cfgLenientUnicodeEncoding ) {
1545- c = REPLACEMENT_CHAR ;
1546- } else {
1547- _throwIllegalSurrogate (c );
1548- }
1539+ c = _illegalSurrogatePairFound (firstPart , secondPart );
15491540 }
15501541 return c ;
15511542 }
15521543
1553- private void _throwIllegalSurrogatePair (int firstPart , int secondPart ) {
1554- throw new IllegalArgumentException (
1555- "Broken surrogate pair: first char 0x"
1556- + Integer .toHexString (firstPart ) + ", second 0x"
1557- + Integer .toHexString (secondPart )
1558- + "; illegal combination" );
1544+ private int _illegalSurrogatePairFound (int firstPart , int secondPart ) {
1545+ if (_cfgLenientUnicodeEncoding ) {
1546+ return REPLACEMENT_CHAR ;
1547+ } else {
1548+ throw new IllegalArgumentException (
1549+ "Broken surrogate pair: first char 0x"
1550+ + Integer .toHexString (firstPart ) + ", second 0x"
1551+ + Integer .toHexString (secondPart )
1552+ + "; illegal combination" );
1553+ }
15591554 }
15601555
1561- private void _throwIllegalSurrogate (int code ) {
1562- if (code > 0x10FFFF ) { // over max?
1563- throw new IllegalArgumentException ("Illegal character point (0x"
1564- + Integer .toHexString (code )
1565- + ") to output; max is 0x10FFFF as per RFC 4627" );
1566- }
1567- if (code >= SURR1_FIRST ) {
1568- if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1569- // second part?)
1556+ private int _illegalSurrogateFound (int code ) {
1557+ if (_cfgLenientUnicodeEncoding ) {
1558+ return REPLACEMENT_CHAR ;
1559+ } else {
1560+ if (code > 0x10FFFF ) { // over max?
1561+ throw new IllegalArgumentException ("Illegal character point (0x"
1562+ + Integer .toHexString (code )
1563+ + ") to output; max is 0x10FFFF as per RFC 4627" );
1564+ }
1565+ if (code >= SURR1_FIRST ) {
1566+ if (code <= SURR1_LAST ) { // Unmatched first part (closing without
1567+ // second part?)
1568+ throw new IllegalArgumentException (
1569+ "Unmatched first part of surrogate pair (0x"
1570+ + Integer .toHexString (code ) + ")" );
1571+ }
15701572 throw new IllegalArgumentException (
1571- "Unmatched first part of surrogate pair (0x"
1573+ "Unmatched second part of surrogate pair (0x"
15721574 + Integer .toHexString (code ) + ")" );
15731575 }
1574- throw new IllegalArgumentException (
1575- "Unmatched second part of surrogate pair (0x"
1576- + Integer .toHexString (code ) + ")" );
1576+ // should we ever get this?
1577+ throw new IllegalArgumentException ( "Illegal character point (0x"
1578+ + Integer .toHexString (code ) + ") to output " );
15771579 }
1578- // should we ever get this?
1579- throw new IllegalArgumentException ("Illegal character point (0x"
1580- + Integer .toHexString (code ) + ") to output" );
15811580 }
15821581
15831582 /*
0 commit comments