Skip to content

Windows-1252 encoding for HTML numeric entities #191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,15 @@

import java.io.IOException;
import java.io.Writer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;

/**
* Translate XML numeric entities of the form &#[xX]?\d+;? to
Expand All @@ -31,7 +38,7 @@
*/
public class NumericEntityUnescaper extends CharSequenceTranslator {

/** Enumerates NumericEntityUnescaper options for unescaping. */
/** Enumerates {@code NumericEntityUnescaper} options for unescaping. */
public enum OPTION {

/**
Expand All @@ -53,19 +60,29 @@ public enum OPTION {
/** EnumSet of OPTIONS, given from the constructor. */
private final EnumSet<OPTION> options;

/**
* Values inside the 128-159 range that are treated as invalid Windows-1252 code points.
* A decimal entity whose value is in this set is not passed through the Windows-1252 decoder.
*/
private static final Set<Integer> INVALID_CP1252_POINTS =
Collections.unmodifiableSet(new HashSet<>(Arrays.asList(129, 141, 143, 144, 157)));

/**
* Decoder for Windows-1252 characters, configured to report (rather than replace or ignore) both malformed
* input and unmappable characters.
*
* NOTE(review): with {@code CodingErrorAction.REPORT}, {@code decode(ByteBuffer)} signals failures with the
* checked {@code CharacterCodingException} (an {@code IOException}) - confirm callers handle that type.
* NOTE(review): {@code CharsetDecoder} instances are documented as not safe for use by multiple concurrent
* threads, and this instance is static and shared - confirm single-threaded use or create a decoder per call.
*/
// Windows-1252 is supported. See https://docs.oracle.com/javase/8/docs/technotes/guides/intl/encoding.doc.html.
private static final CharsetDecoder CP_1252_DECODER = Charset.forName("Windows-1252").newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);

/**
* Create a UnicodeUnescaper.
*
* The constructor takes a list of options, only one type of which is currently
* available (whether to allow, error or ignore the semi-colon on the end of a
* Create a {@code NumericEntityUnescaper}. The constructor takes a list of options, only one type of which is
* currently available (whether to allow, error or ignore the semi-colon on the end of a
* numeric entity being missing).
*
* For example, to support numeric entities without a ';':
* new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
* and to throw an IllegalArgumentException when they're missing:
* new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
* <br>
* <p>For example, to support numeric entities without a ';':</p>
* <pre>{@code new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)}</pre>
*
* <p>and to throw an IllegalArgumentException when they're missing:</p>
* <pre>{@code new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)}</pre>
*
* Note that the default behavior is to ignore them.
* <p>Note that the default behavior is to ignore them.</p>
*
* @param options to apply to this unescaper
*/
Expand Down Expand Up @@ -128,7 +145,7 @@ public int translate(final CharSequence input, final int index, final Writer out
}
}

int entityValue;
final int entityValue;
try {
if (isHex) {
entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
Expand All @@ -143,8 +160,32 @@ public int translate(final CharSequence input, final int index, final Writer out
final char[] chrs = Character.toChars(entityValue);
out.write(chrs[0]);
out.write(chrs[1]);

} else if (128 <= entityValue && entityValue <= 159 // must be within the cp-1252 extension range
&& !isHex // must be a NUMERIC entity, not hex entity (see StringEscapeUtilsTest for hex)
&& !INVALID_CP1252_POINTS.contains(entityValue) // must not be an invalid code point for cp-1252
) {
try {
final String newChar = CP_1252_DECODER
.decode(ByteBuffer.wrap(new byte[] {(byte) entityValue}))
.toString();
out.write(newChar);

} catch (final IllegalArgumentException e) {
/*
* Rethrow exception with causal input, as throw from Charset.decode does not include it.
*
* That said, the input should always be a valid byte due to the restrictions that are imposed by
* the if statement; all characters should be mappable as well, as entity values that are not so are
* excluded by the if statement. If something happens to violate the restrictions meant to ensure
* that translation is valid and should always work, user ought to know.
*/
throw new IllegalArgumentException(String.format("input %s is malformed input", e));
}

} else {
out.write(entityValue);

}

return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
Expand Down
64 changes: 48 additions & 16 deletions src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import java.io.StringWriter;
import java.lang.reflect.Constructor;
import java.lang.reflect.Modifier;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
Expand All @@ -49,6 +48,7 @@ public class StringEscapeUtilsTest {
private static final String FOO = "foo";

private static final String[][] HTML_ESCAPES = {
// name, expected, original
{"no escaping", "plain text", "plain text"},
{"no escaping", "plain text", "plain text"},
{"empty string", "", ""},
Expand All @@ -63,6 +63,23 @@ public class StringEscapeUtilsTest {
{"8-bit ascii shouldn't number-escape", "\u0080\u009F", "\u0080\u009F"},
};

private static final String[][] HTML_ESCAPES_ONEWAY = {
// name, escaped input, expected unescaped output
/* These are one-way tests, i.e. the input is only decoded. Because HTML says to re-encode with Unicode
* code points instead of CP-1252 points, and we probably ought to prefer named entities if possible,
* they should not be re-encoded regardless. */
{
// Covers every decimal entity value in 128-159 except 129, 141, 143, 144 and 157,
// together with the neighbouring values 124-126 and 161-163.
"cp1252",
"&#124;&#125;&#126;&#128;&#130;&#131;&#132;&#133;&#134;&#135;&#136;&#137;&#138;&#139;&#140;"
+ "&#142;&#145;&#146;&#147;&#148;&#149;&#150;&#151;&#152;&#153;&#154;&#155;&#156;&#158;"
+ "&#159;&#161;&#162;&#163;",
"\u007C\u007D\u007E\u20AC\u201A\u0192\u201E\u2026\u2020\u2021\u02C6\u2030\u0160\u2039\u0152\u017D"
+ "\u2018\u2019\u201C\u201D\u2022\u2013\u2014\u02DC\u2122\u0161\u203A\u0153\u017E\u0178"
+ "\u00A1\u00A2\u00A3"
}
};

private void assertEscapeJava(final String escaped, final String original) throws IOException {
assertEscapeJava(escaped, original, null);
}
Expand Down Expand Up @@ -226,26 +243,41 @@ public void testEscapeHtml3() {
final String actual = original == null ? null : sw.toString();
assertEquals(expected, actual, message);
}

for (final String[] e : HTML_ESCAPES_ONEWAY) {
final String message = e[0];
final String input = e[1];
final String answer = e[2];
assertEquals(answer, StringEscapeUtils.unescapeHtml3(input), message);
}

}

@Test
public void testEscapeHtml4() {
for (final String[] element : HTML_ESCAPES) {
final String message = element[0];
final String expected = element[1];
final String original = element[2];
assertEquals(expected, StringEscapeUtils.escapeHtml4(original), message);
final StringWriter sw = new StringWriter();
try {
StringEscapeUtils.ESCAPE_HTML4.translate(original, sw);
} catch (final IOException e) {
// expected
}
final String actual = original == null ? null : sw.toString();
assertEquals(expected, actual, message);
public void testEscapeHtml4() {
for (final String[] element : HTML_ESCAPES) {
final String message = element[0];
final String expected = element[1];
final String original = element[2];
assertEquals(expected, StringEscapeUtils.escapeHtml4(original), message);
final StringWriter sw = new StringWriter();
try {
StringEscapeUtils.ESCAPE_HTML4.translate(original, sw);
} catch (final IOException e) {
// expected
}
final String actual = original == null ? null : sw.toString();
assertEquals(expected, actual, message);
}

for (final String[] e : HTML_ESCAPES_ONEWAY) {
final String message = e[0];
final String input = e[1];
final String answer = e[2];
assertEquals(answer, StringEscapeUtils.unescapeHtml4(input), message);
}
}

/**
* Tests // https://issues.apache.org/jira/browse/LANG-480
*/
Expand All @@ -257,7 +289,7 @@ public void testEscapeHtmlHighUnicode() {
// codepoint: U+1D362
final byte[] data = {(byte) 0xF0, (byte) 0x9D, (byte) 0x8D, (byte) 0xA2};

final String original = new String(data, Charset.forName("UTF8"));
final String original = new String(data, StandardCharsets.UTF_8);

final String escaped = StringEscapeUtils.escapeHtml4(original);
assertEquals(original, escaped, "High Unicode should not have been escaped");
Expand Down