Skip to content

Commit cad3c43

Browse files
picnixz and AA-Turner authored
Fix infinite loop in sphinx.writers.text.TextWrapper (#13762)
Co-authored-by: Adam Turner <[email protected]>
1 parent e591423 commit cad3c43

File tree

3 files changed

+236
-25
lines changed

3 files changed

+236
-25
lines changed

CHANGES.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,8 @@ Bugs fixed
185185
* #14067: EPUB: unify path separators in manifest items to forward slashes;
186186
resolve duplicates in the manifest on Windows.
187187
Patch by Akihiro Takizawa.
188+
* #13741: text builder: fix an infinite loop when processing CSV tables.
189+
Patch by Bénédikt Tran.
188190

189191

190192
Testing

sphinx/writers/text.py

Lines changed: 51 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -292,18 +292,18 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
292292
else:
293293
indent = self.initial_indent
294294

295+
# Note that column_width(x) > len(x) is possible,
296+
# but _handle_long_word() handles negative widths.
295297
width = self.width - column_width(indent)
296298

297299
if self.drop_whitespace and not chunks[-1].strip() and lines:
298300
del chunks[-1]
299301

300302
while chunks:
301-
l = column_width(chunks[-1])
302-
303-
if cur_len + l <= width:
303+
chunk_width = column_width(chunks[-1])
304+
if cur_len + chunk_width <= width:
304305
cur_line.append(chunks.pop())
305-
cur_len += l
306-
306+
cur_len += chunk_width
307307
else:
308308
break
309309

@@ -318,47 +318,73 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
318318

319319
return lines
320320

321-
def _break_word(self, word: str, space_left: int) -> tuple[str, str]:
322-
"""Break line by unicode width instead of len(word)."""
321+
@staticmethod
def _find_break_end(word: str, space_left: int) -> int:
    """Find where to cut *word* so that the head fits in *space_left* columns.

    Widths are measured with ``column_width()`` (Unicode display width)
    rather than with ``len(word)``.

    The returned position 'end' satisfies::

        assert column_width(word[:end]) <= space_left
        assert end == len(word) or column_width(word[:end+1]) > space_left
    """
    used = 0
    for index, char in enumerate(word):
        used += column_width(char)
        if used > space_left:
            # 'char' is the first character that overflows: cut before it.
            return index
    # Every character fits.
    return len(word)
329336

330337
def _split(self, text: str) -> list[str]:
    """Split *text* into chunks, isolating wide characters.

    The base-class ``_split`` is applied first.  Each resulting chunk is
    then grouped by ``_column_width_safe``: runs of characters wider than
    one column are emitted one character per chunk, while the other runs
    are joined and passed through the base-class ``_split`` again.
    """
    base_split = super()._split
    pieces: list[str] = []
    for segment in base_split(text):
        for width, run in groupby(segment, key=_column_width_safe):
            if width > 1:
                # Wide characters: one chunk per character.
                pieces.extend(run)
            else:
                pieces.extend(base_split(''.join(run)))
    return pieces
347350

348351
def _handle_long_word(
    self, reversed_chunks: list[str], cur_line: list[str], cur_len: int, width: int
) -> None:
    """Move (part of) the chunk ``reversed_chunks[-1]`` onto *cur_line*.

    Override that locates the cut with ``self._find_break_end()``, i.e. by
    column width, instead of by character count.

    :param reversed_chunks: pending chunks; the last item is the one handled
        here and is replaced by whatever is left of it after the cut.
    :param cur_line: pieces of the line being built; extended in place.
    :param cur_len: width already used on the current line.
    :param width: width available for the current line.
    """
    if not self.break_long_words:
        # The chunk may not be broken: place it whole, but only on an
        # otherwise empty line.
        if not cur_line:
            cur_line.append(reversed_chunks.pop())
        return

    # Make sure at least one character is stripped off on every pass.
    #
    # Do NOT use space_left = max(width - cur_len, 1) as corner cases
    # with "self.drop_whitespace == False" and "self.width == 1" fail.
    if width < 1:
        space_left = 1
    else:
        space_left = width - cur_len

    chunk = reversed_chunks[-1]
    end = space_left
    if space_left > 0:
        # A chunk can be wider in columns than it is long in characters,
        # so the cut is searched by column width.  When not even the first
        # character fits, one character is still consumed.
        end = max(self._find_break_end(chunk, space_left), 1)
    cur_line.append(chunk[:end])
    reversed_chunks[-1] = chunk[end:]
360375

361376

377+
def _column_width_safe(x: str) -> int:
    """Return ``column_width(x)``, but never less than 1."""
    # Handle characters that are 0-width. We should refine
    # the grouping to prevent splitting a word at combining
    # characters or in a group of combining characters with
    # at most one non-combining character as the combining
    # characters may act on the right or left character.
    #
    # See https://github.com/sphinx-doc/sphinx/issues/13741.
    width = column_width(x)
    return width if width > 1 else 1
386+
387+
362388
MAXWIDTH = 70
363389
STDINDENT = 3
364390

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
"""Test the text writer"""
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
from docutils.utils import column_width
7+
8+
from sphinx.writers.text import TextWrapper
9+
10+
# Module-level shorthand for the TextWrapper helper exercised by the tests below.
find_break_end = TextWrapper._find_break_end
11+
12+
13+
@pytest.mark.parametrize(
    # glyphs of column width 0
    'glyph',
    ['', '\N{COMBINING TILDE}'],
)
def test_text_wrapper_break_phantom_symbol(glyph: str) -> None:
    """A zero-width glyph, repeated or not, is always taken in full."""
    assert column_width(glyph) == 0

    for space_left in range(1, 5):
        # The glyph occupies no column (whatever its length),
        # so we can always take the entire glyph.
        assert find_break_end(glyph, space_left) == len(glyph)
        for repeat in range(1, 5):
            # The repeated glyph may have non-zero length but its
            # column width is still 0, so it is taken entirely too.
            repeated = glyph * repeat
            assert find_break_end(repeated, space_left) == len(repeated)
31+
32+
33+
@pytest.mark.parametrize(
    ('text', 'colwidth'),
    [
        # Glyph of length 1 and column width 1
        ('X', 1),
        # Glyph of length 1 and column width 2
        ('\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
        # Glyph of length 2 and column width 1
        ('\N{COMBINING TILDE}X', 1),
        # Glyph of length 2 and column width 2
        ('\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
        # Glyph of length 3 and column width 1
        ('\N{COMBINING TILDE}\N{COMBINING BREVE}X', 1),
    ],
)
def test_text_wrapper_break_visible_symbol(text: str, colwidth: int) -> None:
    """The cut never exceeds the available width and, for repeats, is maximal."""
    assert column_width(text) == colwidth

    for space_left in range(1, 5):
        cut = find_break_end(text, space_left)
        assert column_width(text[:cut]) <= space_left

        for repeat in range(2, 5):
            repeated = text * repeat
            cut = find_break_end(repeated, space_left)
            # The head fits in the available columns...
            assert column_width(repeated[:cut]) <= space_left
            # ...and taking one more character would overflow (or none is left).
            assert cut == len(repeated) or column_width(repeated[: cut + 1]) > space_left
58+
59+
60+
def test_text_wrapper_break_stop_after_combining_symbols() -> None:
    """Check the cut position in text mixing zero-width and wide glyphs."""
    tilde = '\N{COMBINING TILDE}'
    multi = '\N{CJK UNIFIED IDEOGRAPH-65E5}'

    # 'head' begins with two combining marks; 'tail' begins with a glyph
    # of column width 2 followed by two combining marks.
    head = tilde + tilde + '....'
    tail = multi + tilde + tilde
    text = head + tail
    # With exactly column_width(head) columns available, the cut must fall
    # right at the end of 'head'.
    assert find_break_end(text, column_width(head)) == len(head)
68+
69+
70+
@pytest.mark.parametrize(
    ('text', 'results'),
    [
        # 'results' maps a wrapping width to the expected list of lines.
        ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
        (
            'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
            {
                1: list('Helloa\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
                2: ['He', 'll', 'o', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
                3: ['Hel', 'lo', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
            },
        ),
        (
            'ab c\N{COMBINING TILDE}def',
            {
                1: ['a', 'b', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
                2: ['ab', 'c\N{COMBINING TILDE}d', 'ef'],
                3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
            },
        ),
        (
            'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
            {
                1: [
                    'a',
                    'b',
                    'c\N{COMBINING TILDE}',
                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
                    'd',
                    'e',
                    'f',
                ],
                2: [
                    'ab',
                    'c\N{COMBINING TILDE}',
                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
                    'de',
                    'f',
                ],
                3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
            },
        ),
        (
            'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
            {
                1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
                2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
                3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
            },
        ),
    ],
)
def test_text_wrapper(text: str, results: dict[int, list[str]]) -> None:
    """Wrap *text* at each width with ``drop_whitespace=True`` and compare."""
    for width in results:
        wrapper = TextWrapper(width=width, drop_whitespace=True)
        assert wrapper.wrap(text) == results[width]
126+
127+
128+
@pytest.mark.parametrize(
    ('text', 'results'),
    [
        # 'results' maps a wrapping width to the expected list of lines.
        ('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
        (
            'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
            {
                1: list('Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
                2: ['He', 'll', 'o ', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
                3: ['Hel', 'lo ', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
            },
        ),
        (
            'ab c\N{COMBINING TILDE}def',
            {
                1: ['a', 'b', ' ', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
                2: ['ab', ' c\N{COMBINING TILDE}', 'de', 'f'],
                3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
            },
        ),
        (
            'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
            {
                1: [
                    'a',
                    'b',
                    'c\N{COMBINING TILDE}',
                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
                    'd',
                    'e',
                    'f',
                ],
                2: [
                    'ab',
                    'c\N{COMBINING TILDE}',
                    '\N{CJK UNIFIED IDEOGRAPH-65E5}',
                    'de',
                    'f',
                ],
                3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
            },
        ),
        (
            'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
            {
                1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
                2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
                3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
            },
        ),
    ],
)
def test_text_wrapper_drop_ws(text: str, results: dict[int, list[str]]) -> None:
    """Wrap *text* at each width with ``drop_whitespace=False`` and compare.

    Note that, despite the test name, whitespace is *kept* here: the
    wrapper is created with ``drop_whitespace=False``.
    """
    for width in results:
        wrapper = TextWrapper(width=width, drop_whitespace=False)
        assert wrapper.wrap(text) == results[width]

0 commit comments

Comments
 (0)