Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ Bugs fixed
Patch by Bénédikt Tran.
* #13712: intersphinx: Don't add "v" prefix to non-numeric versions.
Patch by Szymon Karpinski.
* #13741: text: fix an infinite loop when processing CSV tables
Patch by Bénédikt Tran.


Testing
-------
70 changes: 48 additions & 22 deletions sphinx/writers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,18 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
else:
indent = self.initial_indent

# Note that column_width(x) > len(x) is possible,
# but _handle_long_word() handles negative widths.
width = self.width - column_width(indent)

if self.drop_whitespace and not chunks[-1].strip() and lines:
del chunks[-1]

while chunks:
l = column_width(chunks[-1])

if cur_len + l <= width:
chunk_width = column_width(chunks[-1])
if cur_len + chunk_width <= width:
cur_line.append(chunks.pop())
cur_len += l

cur_len += chunk_width
else:
break

Expand All @@ -318,43 +318,69 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:

return lines

def _break_word(self, word: str, space_left: int) -> tuple[str, str]:
"""Break line by unicode width instead of len(word)."""
@staticmethod
def _find_break_end(word: str, space_left: int) -> int:
"""Break word by unicode width instead of len(word).

The returned position 'end' satisfies::

assert column_width(word[:end]) <= space_left
assert end == len(word) or column_width(word[:end+1]) > space_left
"""
total = 0
for i, c in enumerate(word):
for end, c in enumerate(word, 1):
total += column_width(c)
if total > space_left:
return word[: i - 1], word[i - 1 :]
return word, ''
return end - 1
return len(word)

def _split(self, text: str) -> list[str]:
"""Override original method that only split by 'wordsep_re'.

This '_split' splits wide-characters into chunks by one character.
"""

def split(t: str) -> list[str]:
return super(TextWrapper, self)._split(t)
def column_width_safe(x: str) -> int:
# Handle characters that are 0-width. We should refine
# the grouping to prevent splitting a word at combining
# characters or in a group of combining characters with
# at most one non-combining character as the combining
# characters may act on the right or left character.
#
# See https://github.com/sphinx-doc/sphinx/issues/13741.
return max(1, column_width(x))

chunks: list[str] = []
for chunk in split(text):
for w, g in groupby(chunk, column_width):
if w == 1:
chunks.extend(split(''.join(g)))
for chunk in super()._split(text):
for w, g in groupby(chunk, column_width_safe):
if w <= 1:
chunks.extend(super()._split(''.join(g)))
else:
chunks.extend(list(g))
return chunks

def _handle_long_word(
self, reversed_chunks: list[str], cur_line: list[str], cur_len: int, width: int
) -> None:
"""Override original method for using self._break_word() instead of slice."""
space_left = max(width - cur_len, 1)
if self.break_long_words:
l, r = self._break_word(reversed_chunks[-1], space_left)
cur_line.append(l)
reversed_chunks[-1] = r
"""Override using self._find_break() instead of str.find()."""
# Make sure at least one character is stripped off on every pass.
#
# Do NOT use space_left = max(width - cur_len, 1) as corner cases
# with "self.drop_whitespace == False" and "self.width == 1" fail.
space_left = 1 if width < 1 else (width - cur_len)

if self.break_long_words:
# Some characters may have len(X) < space_left < column_width(X)
# so we should only wrap chunks for which len(X) > space_left.
end = space_left
chunk = reversed_chunks[-1]
if space_left > 0:
end = self._find_break_end(chunk, space_left)
if end == 0 and space_left:
# force processing at least one character
end = 1
cur_line.append(chunk[:end])
reversed_chunks[-1] = chunk[end:]
elif not cur_line:
cur_line.append(reversed_chunks.pop())

Expand Down
183 changes: 183 additions & 0 deletions tests/test_writers/test_writer_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Test the LaTeX writer"""

from __future__ import annotations

import pytest
from docutils.utils import column_width

from sphinx.writers.text import TextWrapper

find_break_end = TextWrapper._find_break_end


@pytest.mark.parametrize(
# glyph of column width 0
'glyph',
['', '\N{COMBINING TILDE}'],
)
def test_text_wrapper_break_phantom_symbol(glyph: str) -> None:
assert column_width(glyph) == 0
glyph_length = len(glyph)

for n in range(1, 5):
# Since the glyph has length 0 and column width 0,
# we can always take the entire glpyh.
assert find_break_end(glyph, n) == glyph_length
for m in range(1, 5):
# The multiplied glyph may have non-zero length
# but its column width will always be 0, so we
# take the entire glyph again.
assert find_break_end(m * glyph, n) == m * glyph_length


@pytest.mark.parametrize(
('text', 'colwidth'),
[
# Glyph of length 1 and column width 1
('X', 1),
# Glyph of length 1 and column width 2
('\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
# Glyph of length 2 and column width 1
('\N{COMBINING TILDE}X', 1),
# Glyph of length 2 and column width 2
('\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
# Glyph of length 3 and column width 1
('\N{COMBINING TILDE}\N{COMBINING BREVE}X', 1),
],
)
def test_text_wrapper_break_visible_symbol(text: str, colwidth: int) -> None:
assert column_width(text) == colwidth
for n in range(1, 5):
end = find_break_end(text, n)
assert column_width(text[:end]) <= n
for m in range(2, 5):
m_text = m * text
end = find_break_end(m_text, n)
assert column_width(m_text[:end]) <= n
assert end == m * len(text) or column_width(m_text[: end + 1]) > n


def test_text_wrapper_break_stop_after_combining_symbols() -> None:
tilde = '\N{COMBINING TILDE}'
multi = '\N{CJK UNIFIED IDEOGRAPH-65E5}'

head = tilde + tilde + '....'
tail = multi + tilde + tilde
text = head + tail
assert find_break_end(head + tail, column_width(head)) == len(head)


@pytest.mark.parametrize(
('text', 'results'),
[
('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
(
'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
{
1: list('Helloa\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
2: ['He', 'll', 'o', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
3: ['Hel', 'lo', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
},
),
(
'ab c\N{COMBINING TILDE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}d', 'ef'],
3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
},
),
(
'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
{
1: [
'a',
'b',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'd',
'e',
'f',
],
2: [
'ab',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'de',
'f',
],
3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
},
),
(
'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
},
),
],
)
def test_text_wrapper(text: str, results: dict[int, list[str]]) -> None:
for width, expected in results.items():
w = TextWrapper(width=width, drop_whitespace=True)
assert w.wrap(text) == expected


@pytest.mark.parametrize(
('text', 'results'),
[
('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
(
'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
{
1: list('Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
2: ['He', 'll', 'o ', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
3: ['Hel', 'lo ', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
},
),
(
'ab c\N{COMBINING TILDE}def',
{
1: ['a', 'b', ' ', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
2: ['ab', ' c\N{COMBINING TILDE}', 'de', 'f'],
3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
},
),
(
'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
{
1: [
'a',
'b',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'd',
'e',
'f',
],
2: [
'ab',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'de',
'f',
],
3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
},
),
(
'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
},
),
],
)
def test_text_wrapper_drop_ws(text: str, results: dict[int, list[str]]) -> None:
for width, expected in results.items():
w = TextWrapper(width=width, drop_whitespace=False)
assert w.wrap(text) == expected
Loading