Fix infinite loop in sphinx.writers.text.TextWrapper (#13762)

picnixz · AA-Turner · web-flow · commit cad3c43f4be2 · 2025-11-29T02:34:37.000Z
Co-authored-by: Adam Turner &lt;9087854+aa-turner@users.noreply.github.com&gt;
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -185,6 +185,8 @@ Bugs fixed
 * #14067: EPUB: unify path separators in manifest items to forward slashes;
   resolve duplicates in the manifest on Windows.
   Patch by Akihiro Takizawa.
+* #13741: text builder: fix an infinite loop when processing CSV tables.
+  Patch by Bénédikt Tran.
 
 
 Testing
diff --git a/sphinx/writers/text.py b/sphinx/writers/text.py
@@ -292,18 +292,18 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
             else:
                 indent = self.initial_indent
 
+            # Note that column_width(x) > len(x) is possible,
+            # but _handle_long_word() handles negative widths.
             width = self.width - column_width(indent)
 
             if self.drop_whitespace and not chunks[-1].strip() and lines:
                 del chunks[-1]
 
             while chunks:
-                l = column_width(chunks[-1])
-
-                if cur_len + l <= width:
+                chunk_width = column_width(chunks[-1])
+                if cur_len + chunk_width <= width:
                     cur_line.append(chunks.pop())
-                    cur_len += l
-
+                    cur_len += chunk_width
                 else:
                     break
 
@@ -318,47 +318,73 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
 
         return lines
 
-    def _break_word(self, word: str, space_left: int) -> tuple[str, str]:
-        """Break line by unicode width instead of len(word)."""
+    @staticmethod
+    def _find_break_end(word: str, space_left: int) -> int:
+        """Break word by Unicode width instead of len(word).
+
+        The returned position 'end' satisfies::
+
+            assert column_width(word[:end]) <= space_left
+            assert end == len(word) or column_width(word[:end+1]) > space_left
+        """
         total = 0
-        for i, c in enumerate(word):
+        for end, c in enumerate(word, start=1):
             total += column_width(c)
             if total > space_left:
-                return word[: i - 1], word[i - 1 :]
-        return word, ''
+                return end - 1
+        return len(word)
 
     def _split(self, text: str) -> list[str]:
         """Override original method that only split by 'wordsep_re'.
 
         This '_split' splits wide-characters into chunks by one character.
         """
-
-        def split(t: str) -> list[str]:
-            return super(TextWrapper, self)._split(t)
-
         chunks: list[str] = []
-        for chunk in split(text):
-            for w, g in groupby(chunk, column_width):
-                if w == 1:
-                    chunks.extend(split(''.join(g)))
+        for chunk in super()._split(text):
+            for w, g in groupby(chunk, _column_width_safe):
+                if w <= 1:
+                    chunks += super()._split(''.join(g))
                 else:
-                    chunks.extend(list(g))
+                    chunks += g
         return chunks
 
     def _handle_long_word(
         self, reversed_chunks: list[str], cur_line: list[str], cur_len: int, width: int
     ) -> None:
-        """Override original method for using self._break_word() instead of slice."""
-        space_left = max(width - cur_len, 1)
-        if self.break_long_words:
-            l, r = self._break_word(reversed_chunks[-1], space_left)
-            cur_line.append(l)
-            reversed_chunks[-1] = r
+        """Override using self._find_break() instead of str.find()."""
+        # Make sure at least one character is stripped off on every pass.
+        #
+        # Do NOT use space_left = max(width - cur_len, 1) as corner cases
+        # with "self.drop_whitespace == False" and "self.width == 1" fail.
+        space_left = 1 if width < 1 else (width - cur_len)
 
+        if self.break_long_words:
+            # Some characters may have len(X) < space_left < column_width(X)
+            # so we should only wrap chunks for which len(X) > space_left.
+            end = space_left
+            chunk = reversed_chunks[-1]
+            if space_left > 0:
+                end = self._find_break_end(chunk, space_left)
+            if end == 0 and space_left:
+                # force processing at least one character
+                end = 1
+            cur_line.append(chunk[:end])
+            reversed_chunks[-1] = chunk[end:]
         elif not cur_line:
             cur_line.append(reversed_chunks.pop())
 
 
+def _column_width_safe(x: str) -> int:
+    # Handle characters that are 0-width. We should refine
+    # the grouping to prevent splitting a word at combining
+    # characters or in a group of combining characters with
+    # at most one non-combining character as the combining
+    # characters may act on the right or left character.
+    #
+    # See https://github.com/sphinx-doc/sphinx/issues/13741.
+    return max(1, column_width(x))
+
+
 MAXWIDTH = 70
 STDINDENT = 3
 
diff --git a/tests/test_writers/test_writer_text.py b/tests/test_writers/test_writer_text.py
@@ -0,0 +1,183 @@
+"""Test the LaTeX writer"""
+
+from __future__ import annotations
+
+import pytest
+from docutils.utils import column_width
+
+from sphinx.writers.text import TextWrapper
+
+find_break_end = TextWrapper._find_break_end
+
+
+@pytest.mark.parametrize(
+    # glyph of column width 0
+    'glyph',
+    ['', '\N{COMBINING TILDE}'],
+)
+def test_text_wrapper_break_phantom_symbol(glyph: str) -> None:
+    assert column_width(glyph) == 0
+    glyph_length = len(glyph)
+
+    for n in range(1, 5):
+        # Since the glyph has length 0 and column width 0,
+        # we can always take the entire glpyh.
+        assert find_break_end(glyph, n) == glyph_length
+        for m in range(1, 5):
+            # The multiplied glyph may have non-zero length
+            # but its column width will always be 0, so we
+            # take the entire glyph again.
+            assert find_break_end(m * glyph, n) == m * glyph_length
+
+
+@pytest.mark.parametrize(
+    ('text', 'colwidth'),
+    [
+        # Glyph of length 1 and column width 1
+        ('X', 1),
+        # Glyph of length 1 and column width 2
+        ('\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
+        # Glyph of length 2 and column width 1
+        ('\N{COMBINING TILDE}X', 1),
+        # Glyph of length 2 and column width 2
+        ('\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
+        # Glyph of length 3 and column width 1
+        ('\N{COMBINING TILDE}\N{COMBINING BREVE}X', 1),
+    ],
+)
+def test_text_wrapper_break_visible_symbol(text: str, colwidth: int) -> None:
+    assert column_width(text) == colwidth
+    for n in range(1, 5):
+        end = find_break_end(text, n)
+        assert column_width(text[:end]) <= n
+        for m in range(2, 5):
+            m_text = m * text
+            end = find_break_end(m_text, n)
+            assert column_width(m_text[:end]) <= n
+            assert end == m * len(text) or column_width(m_text[: end + 1]) > n
+
+
+def test_text_wrapper_break_stop_after_combining_symbols() -> None:
+    tilde = '\N{COMBINING TILDE}'
+    multi = '\N{CJK UNIFIED IDEOGRAPH-65E5}'
+
+    head = tilde + tilde + '....'
+    tail = multi + tilde + tilde
+    text = head + tail
+    assert find_break_end(head + tail, column_width(head)) == len(head)
+
+
+@pytest.mark.parametrize(
+    ('text', 'results'),
+    [
+        ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
+        (
+            'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
+            {
+                1: list('Helloa\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
+                2: ['He', 'll', 'o', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
+                3: ['Hel', 'lo', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
+            },
+        ),
+        (
+            'ab c\N{COMBINING TILDE}def',
+            {
+                1: ['a', 'b', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
+                2: ['ab', 'c\N{COMBINING TILDE}d', 'ef'],
+                3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
+            },
+        ),
+        (
+            'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
+            {
+                1: [
+                    'a',
+                    'b',
+                    'c\N{COMBINING TILDE}',
+                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
+                    'd',
+                    'e',
+                    'f',
+                ],
+                2: [
+                    'ab',
+                    'c\N{COMBINING TILDE}',
+                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
+                    'de',
+                    'f',
+                ],
+                3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
+            },
+        ),
+        (
+            'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
+            {
+                1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
+                2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
+                3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
+            },
+        ),
+    ],
+)
+def test_text_wrapper(text: str, results: dict[int, list[str]]) -> None:
+    for width, expected in results.items():
+        w = TextWrapper(width=width, drop_whitespace=True)
+        assert w.wrap(text) == expected
+
+
+@pytest.mark.parametrize(
+    ('text', 'results'),
+    [
+        ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
+        (
+            'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
+            {
+                1: list('Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
+                2: ['He', 'll', 'o ', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
+                3: ['Hel', 'lo ', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
+            },
+        ),
+        (
+            'ab c\N{COMBINING TILDE}def',
+            {
+                1: ['a', 'b', ' ', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
+                2: ['ab', ' c\N{COMBINING TILDE}', 'de', 'f'],
+                3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
+            },
+        ),
+        (
+            'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
+            {
+                1: [
+                    'a',
+                    'b',
+                    'c\N{COMBINING TILDE}',
+                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
+                    'd',
+                    'e',
+                    'f',
+                ],
+                2: [
+                    'ab',
+                    'c\N{COMBINING TILDE}',
+                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
+                    'de',
+                    'f',
+                ],
+                3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
+            },
+        ),
+        (
+            'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
+            {
+                1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
+                2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
+                3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
+            },
+        ),
+    ],
+)
+def test_text_wrapper_drop_ws(text: str, results: dict[int, list[str]]) -> None:
+    for width, expected in results.items():
+        w = TextWrapper(width=width, drop_whitespace=False)
+        assert w.wrap(text) == expected