Skip to content

Commit a61fa95

Browse files
committed
Fix handling of whitespace/non-breaking spaces
This is based on a currently unmerged PR to the main repo: hgmnz#69. It fixes an issue where truncate_html would remove non-breaking spaces, resulting in words being joined together incorrectly. Now they are treated like any other whitespace.
1 parent a09ddcd commit a61fa95

File tree

3 files changed

+5
-7
lines changed

3 files changed

+5
-7
lines changed

lib/truncate_html.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
TruncateHtml.configure do |config|
88
config.length = 100
99
config.omission = '...'
10-
config.word_boundary = /\S/
10+
config.word_boundary = /(?![[:space:]])./
1111
end
1212

1313

lib/truncate_html/html_string.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module TruncateHtml
33
class HtmlString < String
44

55
UNPAIRED_TAGS = %w(br hr img).freeze
6-
REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
6+
REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze
77

88
def initialize(original_html)
99
super(original_html)
@@ -13,9 +13,7 @@ def html_tokens
1313
scan(REGEX).map do |token|
1414
HtmlString.new(
1515
token.gsub(
16-
/\n/,' ' #replace newline characters with a whitespace
17-
).gsub(
18-
/\s+/, ' ' #clean out extra consecutive whitespace
16+
/[[:space:]]+/, ' ' #clean out extra consecutive whitespace
1917
)
2018
)
2119
end

spec/truncate_html/html_string_spec.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ def html_string(original_string)
99

1010
describe '#html_tokens' do
1111
it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
12-
html = '<h1>Hi there</h1> <p>This is sweet!</p> <p> squaremeter m² </p>'
12+
html = "<h1>Hi there</h1> <p>This is sweet!</p> \r\n<p> squaremeter m² </p><div>Non-breaking\nspace here: </div>"
1313
html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
14-
' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>']
14+
' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>', '<div>', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '</div>']
1515
end
1616
end
1717

0 commit comments

Comments
 (0)