encodings: decode utf-8 with errors='replace' when confident

Rongronggg9 · Rongronggg9 · commit dd2d6bfe87cd · 2023-12-27T02:25:44.000+08:00
"Confident" means "metadata of the document explicitly indicates that
the encoding is UTF-8". This prevents feedparser from falling back to
other encodings when the document contains only a few invalid bytes.
diff --git a/CONTRIBUTORS.rst b/CONTRIBUTORS.rst
@@ -29,3 +29,4 @@ bug report!
 * `Aaron Swartz <http://www.aaronsw.com/>`_
 * `Jakub Wilk <http://jwilk.net/>`_
 * `Nestor Rodriguez <https://github.com/n3s7or>`_
+* `Rong Zhang <https://github.com/Rongronggg9>`_
diff --git a/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst b/changelog.d/20231227_020825_rongronggg9_utf-8_errors_replace.rst
@@ -0,0 +1,7 @@
+Fixed
+-----
+
+*   If the metadata of a feed explicitly indicates that the encoding is UTF-8,
+    try to decode it with ``errors="replace"`` when strict decoding fails. This
+    prevents feeds from being decoded with the wrong encoding when they are
+    mostly UTF-8 but contain a few invalid bytes.
diff --git a/feedparser/__init__.py b/feedparser/__init__.py
@@ -28,6 +28,7 @@
 from .api import parse
 from .datetimes import registerDateHandler
 from .exceptions import (
+    CharacterEncodingErrorsReplace,
     CharacterEncodingOverride,
     CharacterEncodingUnknown,
     FeedparserError,
@@ -64,6 +65,7 @@
     "registerDateHandler",
     "FeedParserDict",
     "FeedparserError",
+    "CharacterEncodingErrorsReplace",
     "CharacterEncodingOverride",
     "CharacterEncodingUnknown",
     "NonXMLContentType",
diff --git a/feedparser/encodings.py b/feedparser/encodings.py
@@ -47,6 +47,7 @@ def lazy_chardet_encoding(data):
 
 
 from .exceptions import (
+    CharacterEncodingErrorsReplace,
     CharacterEncodingOverride,
     CharacterEncodingUnknown,
     FeedparserError,
@@ -218,6 +219,21 @@ def convert_to_utf8(
     http_content_type = http_headers.get("content-type") or ""
     http_content_type, http_encoding = parse_content_type(http_content_type)
 
+    # Some UTF-8 documents contain a few invalid byte sequences, which makes
+    # strict decoding fail and fall through to lazy_chardet_encoding or
+    # iso-8859-2. In that case lazy_chardet_encoding may not detect the
+    # encoding correctly, and iso-8859-2 is clearly a wrong guess.
+
+    # Therefore, we use the flag to allow decoding UTF-8 documents with
+    # errors='replace'.
+
+    # Considering the fact that UTF-8 is the most popular encoding,
+    # the flag can be safely set if any metadata of the document explicitly
+    # indicates that the encoding is UTF-8.
+
+    # 1st pass: adhere to HTTP encoding (Content-Type)
+    utf_8_confident = http_encoding == "utf-8"
+
     acceptable_content_type = 0
     application_content_types = (
         "application/xml",
@@ -232,6 +248,11 @@ def convert_to_utf8(
         and http_content_type.endswith("+xml")
     ):
         acceptable_content_type = 1
+        # 2nd pass: adhere to the declared XML encoding
+        #           (but not in the inconsistent case)
+        utf_8_confident = utf_8_confident or (
+            xml_encoding == "utf-8" and not http_encoding
+        )
         rfc3023_encoding = http_encoding or xml_encoding or "utf-8"
     elif http_content_type in text_content_types or (
         http_content_type.startswith("text/") and http_content_type.endswith("+xml")
@@ -298,7 +319,18 @@ def convert_to_utf8(
         try:
             text = data.decode(proposed_encoding)
         except (UnicodeDecodeError, LookupError):
-            continue
+            if proposed_encoding != "utf-8" or not utf_8_confident:
+                continue
+            # try utf-8 with errors='replace' if we are confident
+            try:
+                text = data.decode("utf-8", errors="replace")
+                error = CharacterEncodingErrorsReplace(
+                    "document explicitly declared its encoding as utf-8, "
+                    "but has encoding errors, "
+                    "which has been replaced with � (U+FFFD)"
+                )
+            except (UnicodeDecodeError, LookupError):
+                continue
 
         known_encoding = True
         if not json:
diff --git a/feedparser/exceptions.py b/feedparser/exceptions.py
@@ -28,6 +28,7 @@
 
 __all__ = [
     "FeedparserError",
+    "CharacterEncodingErrorsReplace",
     "CharacterEncodingOverride",
     "CharacterEncodingUnknown",
     "NonXMLContentType",
@@ -39,6 +40,10 @@ class FeedparserError(Exception):
     pass
 
 
+class CharacterEncodingErrorsReplace(FeedparserError):
+    pass  # set when confident-UTF-8 data is decoded with errors="replace"
+
+
 class CharacterEncodingOverride(FeedparserError):
     pass
 
diff --git a/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml b/tests/encoding/bozo_http_application_xml_encoding_utf-8_errors_replace.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+Header:      Content-type: application/rss+xml
+Description: Replace errors instead of falling back to other encodings when application/*xml w/ encoding="utf-8" header
+Expect:      bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
+-->
+
+<rss version="2.0">
+<channel>
+<item>
+<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔 ]]></description>
+</item>
+</channel>
+</rss>
diff --git a/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml b/tests/encoding/bozo_http_charset_utf-8_errors_replace.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0"?>
+<!--
+Header:      Content-type: text/rss+xml; charset="utf-8"
+Description: Replace errors instead of falling back to other encodings when charset="utf-8"
+Expect:      bozo and encoding == 'utf-8' and entries[0].summary == '𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔'
+-->
+
+<rss version="2.0">
+<channel>
+<item>
+<description><![CDATA[ 𝐔𝐓𝐅-𝟖, 𝑏𝑢𝑡 𝒆��𝒐𝒓𝒔 ]]></description>
+</item>
+</channel>
+</rss>