apache · ifly6 · Dec 13, 2020 · Dec 13, 2020 · Dec 13, 2020 · Dec 13, 2020
diff --git a/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java b/src/main/java/org/apache/commons/text/translate/NumericEntityUnescaper.java
@@ -18,8 +18,15 @@
 
 import java.io.IOException;
 import java.io.Writer;
+import java.nio.ByteBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Set;
 
 /**
  * Translate XML numeric entities of the form &amp;#[xX]?\d+;? to
@@ -31,7 +38,7 @@
  */
 public class NumericEntityUnescaper extends CharSequenceTranslator {
 
-    /** Enumerates NumericEntityUnescaper options for unescaping. */
+    /** Enumerates {@code NumericEntityUnescaper} options for unescaping. */
     public enum OPTION {
 
         /**
@@ -53,19 +60,29 @@ public enum OPTION {
     /** EnumSet of OPTIONS, given from the constructor. */
     private final EnumSet<OPTION> options;
 
+    /** Values in the 128-159 range that are treated as unassigned in Windows-1252 and so are never decoded. */
+    private static final Set<Integer> INVALID_CP1252_POINTS =
+            Collections.unmodifiableSet(new HashSet<>(Arrays.asList(129, 141, 143, 144, 157)));
+
+    /** Strict decoder for Windows-1252: malformed or unmappable input is reported as an error, not replaced. */
+    // Windows-1252 is supported. See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html.
+    private static final CharsetDecoder CP_1252_DECODER = Charset.forName("Windows-1252").newDecoder()
+            .onMalformedInput(CodingErrorAction.REPORT)
+            .onUnmappableCharacter(CodingErrorAction.REPORT);
+
     /**
-     * Create a UnicodeUnescaper.
-     *
-     * The constructor takes a list of options, only one type of which is currently
-     * available (whether to allow, error or ignore the semi-colon on the end of a
+     * Create a {@code NumericEntityUnescaper}. The constructor takes a list of options, only one type of which is
+     * currently available (whether to allow, error or ignore the semi-colon on the end of a
      * numeric entity to being missing).
      *
-     * For example, to support numeric entities without a ';':
-     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
-     * and to throw an IllegalArgumentException when they're missing:
-     *    new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
+     * <br>
+     * <p>For example, to support numeric entities without a ';':</p>
+     *     <pre>{@code new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)}</pre>
+     *
+     * <p>and to throw an IllegalArgumentException when they're missing:</p>
+     *     <pre>{@code new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)}</pre>
      *
-     * Note that the default behavior is to ignore them.
+     * <p>Note that the default behavior is to ignore them.</p>
      *
      * @param options to apply to this unescaper
      */
@@ -128,7 +145,7 @@ public int translate(final CharSequence input, final int index, final Writer out
                 }
             }
 
-            int entityValue;
+            final int entityValue;
             try {
                 if (isHex) {
                     entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
@@ -143,8 +160,32 @@ public int translate(final CharSequence input, final int index, final Writer out
                 final char[] chrs = Character.toChars(entityValue);
                 out.write(chrs[0]);
                 out.write(chrs[1]);
+
+            } else if (128 <= entityValue && entityValue <= 159  // must be within the cp-1252 extension range
+                    && !isHex  // must be a NUMERIC entity, not hex entity (see StringEscapeUtilsTest for hex)
+                    && !INVALID_CP1252_POINTS.contains(entityValue)  // must not be an invalid code point for cp-1252
+            ) {
+                final String newChar;
+                try {
+                    // CharsetDecoder instances are not thread-safe, so serialise access to the shared decoder.
+                    synchronized (CP_1252_DECODER) {
+                        newChar = CP_1252_DECODER.decode(ByteBuffer.wrap(new byte[] {(byte) entityValue})).toString();
+                    }
+                } catch (final java.nio.charset.CharacterCodingException e) {
+                    /*
+                     * decode reports failures with the checked CharacterCodingException, whose message does not name
+                     * the offending input; rethrow with the entity value and keep the original exception as the cause.
+                     *
+                     * The enclosing condition only admits mappable CP-1252 bytes, so reaching this handler means that
+                     * guard and the decoder disagree, which the caller ought to be told about.
+                     */
+                    throw new IllegalArgumentException(String.format("input %s is malformed input", entityValue), e);
+                }
+                out.write(newChar);
+
             } else {
                 out.write(entityValue);
+
             }
 
             return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);

diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -30,7 +30,6 @@
 import java.io.StringWriter;
 import java.lang.reflect.Constructor;
 import java.lang.reflect.Modifier;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
@@ -49,6 +48,7 @@ public class StringEscapeUtilsTest {
     private static final String FOO = "foo";
 
     private static final String[][] HTML_ESCAPES = {
+            // name, expected, original
             {"no escaping", "plain text", "plain text"},
             {"no escaping", "plain text", "plain text"},
             {"empty string", "", ""},
@@ -63,6 +63,23 @@ public class StringEscapeUtilsTest {
             {"8-bit ascii shouldn't number-escape", "\u0080\u009F", "\u0080\u009F"},
     };
 
+    private static final String[][] HTML_ESCAPES_ONEWAY = {
+            // name, escaped input, expected unescaped output
+            /* These are one-way tests, i.e. they are only decoded. HTML says to re-encode with Unicode code points
+             * instead of CP-1252 points, and named entities are probably preferable where possible, so the decoded
+             * text should not be re-encoded into these forms regardless. */
+            {
+                    // all defined CP-1252 characters between 128 and 159, plus chars 124-126 and chars 161-163
+                    "cp1252",
+                    "&#124;&#125;&#126;&#128;&#130;&#131;&#132;&#133;&#134;&#135;&#136;&#137;&#138;&#139;&#140;"
+                            + "&#142;&#145;&#146;&#147;&#148;&#149;&#150;&#151;&#152;&#153;&#154;&#155;&#156;&#158;"
+                            + "&#159;&#161;&#162;&#163;",
+                    "\u007C\u007D\u007E\u20AC\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u017D"
+                            + "\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u017E\u0178"
+                            + "\u00A1\u00A2\u00A3"
+            }
+    };
+
     private void assertEscapeJava(final String escaped, final String original) throws IOException {
         assertEscapeJava(escaped, original, null);
     }
@@ -226,26 +243,41 @@ public void testEscapeHtml3() {
             final String actual = original == null ? null : sw.toString();
             assertEquals(expected, actual, message);
         }
+
+        for (final String[] e : HTML_ESCAPES_ONEWAY) {
+            final String message = e[0];
+            final String input = e[1];
+            final String answer = e[2];
+            assertEquals(answer, StringEscapeUtils.unescapeHtml3(input), message);
+        }
+
     }
 
     @Test
-        public void testEscapeHtml4() {
-            for (final String[] element : HTML_ESCAPES) {
-                final String message = element[0];
-                final String expected = element[1];
-                final String original = element[2];
-                assertEquals(expected, StringEscapeUtils.escapeHtml4(original), message);
-                final StringWriter sw = new StringWriter();
-                try {
-                    StringEscapeUtils.ESCAPE_HTML4.translate(original, sw);
-                } catch (final IOException e) {
-                    // expected
-                }
-                final String actual = original == null ? null : sw.toString();
-                assertEquals(expected, actual, message);
+    public void testEscapeHtml4() {
+        for (final String[] element : HTML_ESCAPES) {
+            final String message = element[0];
+            final String expected = element[1];
+            final String original = element[2];
+            assertEquals(expected, StringEscapeUtils.escapeHtml4(original), message);
+            final StringWriter sw = new StringWriter();
+            try {
+                StringEscapeUtils.ESCAPE_HTML4.translate(original, sw);
+            } catch (final IOException e) {
+                // ignored: writes to a StringWriter are not expected to raise IOException
             }
+            final String actual = original == null ? null : sw.toString();
+            assertEquals(expected, actual, message);
         }
 
+        for (final String[] e : HTML_ESCAPES_ONEWAY) {
+            final String message = e[0];
+            final String input = e[1];
+            final String answer = e[2];
+            assertEquals(answer, StringEscapeUtils.unescapeHtml4(input), message);
+        }
+    }
+
     /**
      * Tests // https://issues.apache.org/jira/browse/LANG-480
      */
@@ -257,7 +289,7 @@ public void testEscapeHtmlHighUnicode() {
         // codepoint: U+1D362
         final byte[] data = {(byte) 0xF0, (byte) 0x9D, (byte) 0x8D, (byte) 0xA2};
 
-        final String original = new String(data, Charset.forName("UTF8"));
+        final String original = new String(data, StandardCharsets.UTF_8);
 
         final String escaped = StringEscapeUtils.escapeHtml4(original);
         assertEquals(original, escaped, "High Unicode should not have been escaped");