Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
(If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) You can confirm that the re2j engine has been enabled correctly by calling `Regex.usingRe2j()`. [#2407](https://github.com/jhy/jsoup/pull/2407)

* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser's configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
* Added a configurable maximum parser depth (number of open elements on the stack) to both HTML and XML parsers. The HTML parser now defaults to a depth of 512, to match browser behavior and to protect against unbounded stack growth, while the XML parser keeps an unlimited depth by default but can opt into a limit via `Parser#setMaxDepth`. [#2421](https://github.com/jhy/jsoup/issues/2421)
* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)

Expand Down
29 changes: 21 additions & 8 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ public class HtmlTreeBuilder extends TreeBuilder {
"button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
};

public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
/** @deprecated This is not used anymore. Will be removed in a future release. */
@Deprecated
public static final int MaxScopeSearchDepth = 100;

private HtmlTreeBuilderState state; // the current state
private HtmlTreeBuilderState originalState; // original / marked state
Expand Down Expand Up @@ -392,6 +394,8 @@ FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean
* @param el the Element to insert and make the current element
*/
private void doInsertElement(Element el) {
enforceStackDepthLimit();

if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed))
formElement.addElement(el); // connect form controls to their form element

Expand Down Expand Up @@ -498,6 +502,20 @@ boolean removeFromStack(Element el) {
return false;
}

/**
 Keeps the HTML tree builder's bookkeeping consistent after an element has been popped off the stack to satisfy the
 depth limit.
 @param element the element that was removed from the stack
 */
@Override
void onStackPrunedForDepth(Element element) {
    // drop the references the builder may still hold to the pruned element
    if (headElement == element)
        headElement = null;
    if (formElement == element)
        setFormElement(null);
    removeFromActiveFormattingElements(element);

    if (!element.nameIs("template"))
        return;

    // a pruned template additionally needs its formatting marker and template mode unwound
    clearFormattingElementsToLastMarker();
    final boolean hasTemplateMode = templateModeSize() > 0;
    if (hasTemplateMode)
        popTemplateMode();
    resetInsertionMode();
}

/** Pops the stack until the given HTML element is removed. */
@Nullable
Element popStackToClose(String elName) {
Expand Down Expand Up @@ -699,9 +717,8 @@ private boolean inSpecificScope(String targetName, String[] baseTypes, String[]
private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) {
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
final int bottom = stack.size() -1;
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
// walk the whole stack, from the current (deepest) element back to the first
for (int pos = bottom; pos >= top; pos--) {
for (int pos = bottom; pos >= 0; pos--) {
Element el = stack.get(pos);
String elName = el.normalName();
// namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg:
Expand Down Expand Up @@ -762,11 +779,7 @@ boolean inSelectScope(String targetName) {

/** Tests if there is some element on the stack that is not in the provided set. */
boolean onStackNot(String[] allowedTags) {
final int bottom = stack.size() -1;
final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
// don't walk too far up the tree

for (int pos = bottom; pos >= top; pos--) {
for (int pos = stack.size() - 1; pos >= 0; pos--) {
final String elName = stack.get(pos).normalName();
if (!inSorted(elName, allowedTags))
return true;
Expand Down
27 changes: 26 additions & 1 deletion src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public class Parser implements Cloneable {
private boolean trackPosition = false;
private @Nullable TagSet tagSet;
private final ReentrantLock lock = new ReentrantLock();
private int maxDepth;

/**
* Create a new Parser, using the specified TreeBuilder
Expand All @@ -39,6 +40,7 @@ public Parser(TreeBuilder treeBuilder) {
this.treeBuilder = treeBuilder;
settings = treeBuilder.defaultSettings();
errors = ParseErrorList.noTracking();
maxDepth = treeBuilder.defaultMaxDepth();
}

/**
Expand All @@ -60,6 +62,7 @@ private Parser(Parser copy) {
errors = new ParseErrorList(copy.errors); // only copies size, not contents
settings = new ParseSettings(copy.settings);
trackPosition = copy.trackPosition;
maxDepth = copy.maxDepth;
tagSet = new TagSet(copy.tagSet());
}

Expand Down Expand Up @@ -195,6 +198,28 @@ public ParseSettings settings() {
return settings;
}

/**
 Set the parser's maximum stack depth (the maximum number of open elements). When the limit is reached, the deepest
 open elements are closed to make room before a new element is inserted, which prevents excessive nesting. Defaults
 to 512 for the HTML parser, and to unlimited ({@code Integer.MAX_VALUE}) for the XML parser.

 @param maxDepth maximum parser depth; must be >= 1 (smaller values are rejected by validation)
 @return this Parser, for chaining
 */
public Parser setMaxDepth(int maxDepth) {
    final boolean withinRange = maxDepth >= 1;
    Validate.isTrue(withinRange, "maxDepth must be >= 1");
    this.maxDepth = maxDepth;
    return this;
}

/**
* Get the maximum parser depth (maximum number of open elements). The value is initially taken from the tree
* builder's default, and can be changed with {@link #setMaxDepth(int)}.
* @return the current max parser depth; {@code Integer.MAX_VALUE} is the value used for an unlimited depth
*/
public int getMaxDepth() {
return maxDepth;
}

/**
Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
Expand Down Expand Up @@ -351,6 +376,6 @@ public static Parser htmlParser() {
* @return a new simple XML parser.
*/
public static Parser xmlParser() {
return new Parser(new XmlTreeBuilder());
return new Parser(new XmlTreeBuilder()).setMaxDepth(Integer.MAX_VALUE);
}
}
30 changes: 27 additions & 3 deletions src/main/java/org/jsoup/parser/TreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@
import org.jspecify.annotations.Nullable;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.jsoup.parser.Parser.NamespaceHtml;

Expand Down Expand Up @@ -174,6 +171,33 @@ final void push(Element element) {
onNodeInserted(element);
}

/**
 Makes room on the stack for one more element, honoring {@link Parser#getMaxDepth()}. While the stack is at or over
 the limit, the deepest open element is popped and reported to {@link #onStackPrunedForDepth(Element)}. A limit of
 {@code Integer.MAX_VALUE} means unlimited, and nothing is pruned.
 */
final void enforceStackDepthLimit() {
    final int limit = parser.getMaxDepth();
    if (limit != Integer.MAX_VALUE) {
        // close the deepest open elements until one more can be pushed
        while (stack.size() >= limit)
            onStackPrunedForDepth(pop());
    }
}

/**
Hook invoked by {@link #enforceStackDepthLimit()} for each element it pops off the stack because of the depth limit.
The HTML tree builder overrides this to clean up state that still refers to the element; this default does nothing.
@param element the element that was just removed from the stack
*/
void onStackPrunedForDepth(Element element) {
// default no-op
}

/**
Default maximum depth (number of open elements on the stack) for parsers using this tree builder. A new
{@link Parser} reads this value in its constructor; tree builders may override it to supply a different default.
@return the default maximum depth, which is 512 unless overridden
*/
int defaultMaxDepth() {
return 512;
}

/**
Get the current element (last on the stack). If all items have been removed, returns the document instead
(which might not actually be on the stack; use stack.size() == 0 to test if required).
Expand Down
7 changes: 7 additions & 0 deletions src/main/java/org/jsoup/parser/XmlTreeBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ TagSet defaultTagSet() {
return new TagSet(); // an empty tagset
}

/**
The XML tree builder does not limit depth by default: {@code Integer.MAX_VALUE} makes the stack depth enforcement
return without pruning anything.
*/
@Override
int defaultMaxDepth() {
return Integer.MAX_VALUE;
}

@Override
protected boolean process(Token token) {
currentToken = token;
Expand Down Expand Up @@ -151,6 +156,8 @@ void insertElementFor(Token.StartTag startTag) {
applyNamespacesToAttributes(attributes, namespaces);
}

enforceStackDepthLimit();

String tagName = startTag.tagName.value();
String ns = resolveNamespace(tagName, namespaces);
Tag tag = tagFor(tagName, startTag.normalName, ns, settings);
Expand Down
4 changes: 3 additions & 1 deletion src/test/java/org/jsoup/nodes/ElementIT.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.nodes;

import org.jsoup.Jsoup;
import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;

Expand Down Expand Up @@ -125,6 +126,7 @@ public void testFastReparentExistingContent() {
@Test void wrapNoOverflow() {
// deepChild was recursive, so could overflow if presented with a fairly insane wrap
Document doc = new Document("https://example.com/");
doc.parser().setMaxDepth(Integer.MAX_VALUE); // don't limit to 512
Element el = doc.body().appendElement("p");
int num = 50000;
StringBuilder sb = new StringBuilder();
Expand All @@ -134,7 +136,7 @@ public void testFastReparentExistingContent() {
el.wrap(sb.toString());
String html = doc.body().html();
assertTrue(html.startsWith("<div>"));
assertEquals(num + 3, el.parents().size());
assertEquals(num + 3, el.parents().size()); // + 3 is for body, html, document
}

@Test
Expand Down
110 changes: 110 additions & 0 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.jsoup.nodes.*;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
Expand Down Expand Up @@ -2173,7 +2174,7 @@
@Test void dropsNullsFromBody() {
// https://github.com/jhy/jsoup/issues/2395
String html = "<p>\u0000</p><p>\u0000\u0000</p><p>Hi\u0000</p>";

Parser parser = Parser.htmlParser();
parser.setTrackErrors(10);

Expand Down Expand Up @@ -2205,6 +2206,115 @@
assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
}

@Nested class DeepHtmlTrees {
private int depth(Element el) {
int d = 0;
while ((el = el.parent()) != null) {
d++;
} while (el != null);

Check warning

Code scanning / CodeQL

Constant loop condition Warning test

Loop
might not terminate, as this loop condition is constant within the loop.
return d;
}

/**
 * Parses {@code contents} with the default HTML parser, after wrapping it in enough {@code <div>}s that the root
 * elements of {@code contents} sit at depth {@code startingDepth}.
 *
 * @param startingDepth the depth (as measured by {@code depth}) at which the root elements of contents should land
 * @param contents the HTML to place inside the innermost wrapper
 * @return the innermost wrapper, the div with id {@code container}
 */
private Element parseDeepHtml(int startingDepth, String contents) {
    // html, body and the container itself are added on top of these plain wrapper divs
    final int wrapperCount = startingDepth - 4;
    StringBuilder html = new StringBuilder();
    html.append("<html><body>");
    for (int i = 0; i < wrapperCount; i++)
        html.append("<div>");
    html.append("<div id='container'>").append(contents);

    Document doc = Jsoup.parse(html.toString(), Parser.htmlParser());
    Element container = doc.getElementById("container");
    assertNotNull(container);
    // sanity check: the container must sit one level above the requested depth
    assertEquals(startingDepth - 1, depth(container));
    return container;
}

// Three unclosed divs just below the default depth limit: the expected output shows the first two nesting, and the
// third landing next to the second as a sibling instead of nesting further.
@Test void nestedDivs() {
Element container = parseDeepHtml(511, "<div><div><div>");

assertEquals("<div>\n <div></div>\n <div></div>\n</div>", container.html());
}

// Checks how end tags are matched once the depth limit has already closed an element early.
@Test void closingTagOfTagClosedByDepthLimit() {
// The <a></a> tag would be nested too deep, so it first closes the innermost <span>.
// This means that the first </span> will close the outer <span>, as it's the only
// one that is currently open. The last </span> is then just ignored, as there is no
// open <span> left to close.
Element container = parseDeepHtml(511, "<span><span><a></a></span><b></b></span>");

assertEquals("<span><span></span><a></a></span><b></b>", container.html());
}

// The table itself sits at the depth limit: the expected output shows the tbody, tr and td ending up as siblings
// of the table inside the container, rather than as its descendants.
@Test void tableAtDepthLimitWithDirectTd() {
Element container = parseDeepHtml(512, "<table><td>");

assertEquals("<table></table>\n<tbody></tbody>\n<tr></tr>\n<td></td>", container.html());
}

// The table sits one level above the depth limit: the expected output shows the tbody, tr and td all becoming
// direct children of the table, as siblings of each other, instead of nesting inside one another.
@Test void tableRightBeforeDepthLimitWithDirectTd() {
Element container = parseDeepHtml(511, "<table><td>");

assertEquals("<table>\n <tbody></tbody>\n <tr></tr>\n <td></td>\n</table>", container.html());
}

// Uses a custom limit of 5 open elements: of the six unclosed divs, the expected output shows two nesting under
// body and the remaining four becoming empty siblings inside the second one.
@Test void customDepthLimit() {
Parser parser = Parser.htmlParser().setMaxDepth(5);
String input = "<html><body><div><div><div><div><div><div>";

Document doc = Jsoup.parse(input, parser);
// NOTE(review): every expected line below carries a single leading space regardless of nesting level; the
// indentation may have been collapsed at some point - confirm against the actual pretty-printed output.
String expected = new StringBuilder()
.append("<html>\n")
.append(" <head></head>\n")
.append(" <body>\n")
.append(" <div>\n")
.append(" <div>\n")
.append(" <div></div>\n")
.append(" <div></div>\n")
.append(" <div></div>\n")
.append(" <div></div>\n")
.append(" </div>\n")
.append(" </div>\n")
.append(" </body>\n")
.append("</html>")
.toString();

assertEquals(expected, doc.html());
}

// With a limit of 3 open elements, the form is expected to be pruned from the stack before its contents are
// inserted: it must end up empty, and the input must not be registered as one of its controls.
@Test void formControlsDetachWhenFormTrimmed() {
Parser parser = Parser.htmlParser().setMaxDepth(3);
String input = "<form id='f'><div><input name='foo'></div></form>";

Document doc = Jsoup.parse(input, "", parser);
Element formEl = doc.getElementById("f");
assertNotNull(formEl);
assertTrue(formEl instanceof FormElement);
FormElement form = (FormElement) formEl;
assertEquals("", form.html());
assertEquals(0, form.elements().size());
}

// With a limit of 3 open elements, the template is expected to be pruned from the stack before its contents are
// inserted: it must end up empty, and the paragraph that follows it must still be parsed with its text intact.
@Test void templateModesClearedWhenTrimmed() {
Parser parser = Parser.htmlParser().setMaxDepth(3);
String input = "<template id='tmpl'><div><span>One</span></div></template><p>Two</p>";

Document doc = Jsoup.parse(input, "", parser);
Element template = doc.getElementById("tmpl");
assertNotNull(template);
assertEquals("", template.html());
Element paragraph = doc.selectFirst("p");
assertNotNull(paragraph);
assertEquals("Two", paragraph.text());
}
}
}
10 changes: 9 additions & 1 deletion src/test/java/org/jsoup/parser/ParserIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,16 @@ public void handlesDeepStack() {
long start = System.currentTimeMillis();
Document doc = Parser.parseBodyFragment(longBody.toString(), "");

int depth = 1;
Element el = doc.body();
while (el.childrenSize() > 0) {
el = el.child(0);
depth++;
}

// Assert
assertEquals(2, doc.body().childNodeSize());
assertEquals(1, doc.body().childrenSize());
assertEquals(512, depth);
assertEquals(25000, doc.select("dd").size());
assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower
// was originally much longer, or stack overflow.
Expand Down
Loading