Skip to content

Commit dd2d6bf

Browse files
committed
encodings: decode utf-8 with errors='replace' when confident
"Confident" means "metadata of the document explicitly indicates that the encoding is UTF-8". This prevents feedparser from falling back to other encodings when there are only tiny errors.
1 parent 0af72dc commit dd2d6bf

7 files changed

+76
-1
lines changed

CONTRIBUTORS.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,4 @@ bug report!
2929
* `Aaron Swartz <http://www.aaronsw.com/>`_
3030
* `Jakub Wilk <http://jwilk.net/>`_
3131
* `Nestor Rodriguez <https://github.com/n3s7or>`_
32+
* `Rong Zhang <https://github.com/Rongronggg9>`_
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Fixed
2+
-----
3+
4+
* If the metadata of a feed explicitly indicates that the encoding is UTF-8,
5+
try decoding it with ``errors="replace"`` when decoding fails. This prevents
6+
feeds from being decoded with wrong encodings when they are mostly UTF-8 but
7+
contain a few invalid bytes.

feedparser/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from .api import parse
2929
from .datetimes import registerDateHandler
3030
from .exceptions import (
31+
CharacterEncodingErrorsReplace,
3132
CharacterEncodingOverride,
3233
CharacterEncodingUnknown,
3334
FeedparserError,
@@ -64,6 +65,7 @@
6465
"registerDateHandler",
6566
"FeedParserDict",
6667
"FeedparserError",
68+
"CharacterEncodingErrorsReplace",
6769
"CharacterEncodingOverride",
6870
"CharacterEncodingUnknown",
6971
"NonXMLContentType",

feedparser/encodings.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def lazy_chardet_encoding(data):
4747

4848

4949
from .exceptions import (
50+
CharacterEncodingErrorsReplace,
5051
CharacterEncodingOverride,
5152
CharacterEncodingUnknown,
5253
FeedparserError,
@@ -218,6 +219,21 @@ def convert_to_utf8(
218219
http_content_type = http_headers.get("content-type") or ""
219220
http_content_type, http_encoding = parse_content_type(http_content_type)
220221

222+
# Some UTF-8 documents may contain invalid characters, resulting in
223+
# falling back to lazy_chardet_encoding or iso-8859-2.
224+
# In such a case, lazy_chardet_encoding may not be able to detect the
225+
# encoding correctly, and iso-8859-2 is apparently a wrong guess.
226+
227+
# Therefore, we use the flag to allow decoding UTF-8 documents with
228+
# errors='replace'.
229+
230+
# Considering the fact that UTF-8 is the most popular encoding,
231+
# the flag can be safely set if any metadata of the document explicitly
232+
# indicates that the encoding is UTF-8.
233+
234+
# 1st pass: adhere to HTTP encoding (Content-Type)
235+
utf_8_confident = http_encoding == "utf-8"
236+
221237
acceptable_content_type = 0
222238
application_content_types = (
223239
"application/xml",
@@ -232,6 +248,11 @@ def convert_to_utf8(
232248
and http_content_type.endswith("+xml")
233249
):
234250
acceptable_content_type = 1
251+
# 2nd pass: adhere to the declared XML encoding
252+
# (but not in the inconsistent case)
253+
utf_8_confident = utf_8_confident or (
254+
xml_encoding == "utf-8" and not http_encoding
255+
)
235256
rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
236257
elif http_content_type in text_content_types or (
237258
http_content_type.startswith("text/") and http_content_type.endswith("+xml")
@@ -298,7 +319,18 @@ def convert_to_utf8(
298319
try:
299320
text = data.decode(proposed_encoding)
300321
except (UnicodeDecodeError, LookupError):
301-
continue
322+
if proposed_encoding != "utf-8" or not utf_8_confident:
323+
continue
324+
# try utf-8 with errors='replace' if we are confident
325+
try:
326+
text = data.decode("utf-8", errors="replace")
327+
error = CharacterEncodingErrorsReplace(
328+
"document explicitly declared its encoding as utf-8, "
329+
"but has encoding errors, "
330+
"which has been replaced with � (U+FFFD)"
331+
)
332+
except (UnicodeDecodeError, LookupError):
333+
continue
302334

303335
known_encoding = True
304336
if not json:

feedparser/exceptions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828

2929
__all__ = [
3030
"FeedparserError",
31+
"CharacterEncodingErrorsReplace",
3132
"CharacterEncodingOverride",
3233
"CharacterEncodingUnknown",
3334
"NonXMLContentType",
@@ -39,6 +40,10 @@ class FeedparserError(Exception):
3940
pass
4041

4142

43+
class CharacterEncodingErrorsReplace(FeedparserError):
44+
pass
45+
46+
4247
class CharacterEncodingOverride(FeedparserError):
4348
pass
4449

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<!--
3+
Header: Content-type: application/rss+xml
4+
Description: Replace errors instead of falling back to other encodings when application/*xml w/ encoding="utf-8" header
5+
Expect: bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
6+
-->
7+
8+
<rss version="2.0">
9+
<channel>
10+
<item>
11+
<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆©©ð’ð’“𝒔 ]]></description>
12+
</item>
13+
</channel>
14+
</rss>
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
<?xml version="1.0"?>
2+
<!--
3+
Header: Content-type: text/rss+xml; charset="utf-8"
4+
Description: Replace errors instead of falling back to other encodings when charset="utf-8"
5+
Expect: bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
6+
-->
7+
8+
<rss version="2.0">
9+
<channel>
10+
<item>
11+
<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆©©ð’ð’“𝒔 ]]></description>
12+
</item>
13+
</channel>
14+
</rss>

0 commit comments

Comments
 (0)