From 458ab1074a47154f6614b12b051584681f8667f9 Mon Sep 17 00:00:00 2001 From: David Butler Date: Wed, 25 May 2016 15:31:30 -0700 Subject: [PATCH 1/2] Handle whitespace more generically This fixes the issue of non-breaking spaces being discarded, resulting in words being joined together. In Ruby, `\s` matches only ASCII whitespace (`[ \t\r\n\f\v]`) and `\S` is its complement, so neither treats a non-breaking space as whitespace. HTML often uses non-breaking spaces (character 160). A more general approach is to use `[[:space:]]` instead. According to the `Regexp` documentation, `[[:space:]]` matches "Whitespace character ([:blank:], newline, carriage return, etc.)" Fixes https://github.com/hgmnz/truncate_html/issues/68 --- lib/truncate_html.rb | 2 +- lib/truncate_html/html_string.rb | 6 ++---- spec/truncate_html/html_string_spec.rb | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/truncate_html.rb b/lib/truncate_html.rb index 95b7923..3987501 100644 --- a/lib/truncate_html.rb +++ b/lib/truncate_html.rb @@ -7,7 +7,7 @@ TruncateHtml.configure do |config| config.length = 100 config.omission = '...' 
- config.word_boundary = /\S/ + config.word_boundary = /(?![[:space:]])./ end diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 76d82e9..62d8675 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -3,7 +3,7 @@ module TruncateHtml class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze - REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze + REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze def initialize(original_html) super(original_html) @@ -13,9 +13,7 @@ def html_tokens scan(REGEX).map do |token| HtmlString.new( token.gsub( - /\n/,' ' #replace newline characters with a whitespace - ).gsub( - /\s+/, ' ' #clean out extra consecutive whitespace + /[[:space:]]+/, ' ' #clean out extra consecutive whitespace ) ) end diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb index 064f87b..ebc45cd 100644 --- a/spec/truncate_html/html_string_spec.rb +++ b/spec/truncate_html/html_string_spec.rb @@ -9,9 +9,9 @@ def html_string(original_string) describe '#html_tokens' do it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do - html = '

Hi there

This is sweet!

squaremeter m²

' + html = '

Hi there

This is sweet!

squaremeter m²

Non-breaking space here: 
' html_string(html).html_tokens.should == ['

', 'Hi', ' ', 'there', '

', ' ', '

', 'This', ' ', 'is', ' ', 'sweet!', '

', - ' ', '

', ' ', 'squaremeter', ' ', 'm²', ' ', '

'] + ' ', '

', ' ', 'squaremeter', ' ', 'm²', ' ', '

', '
', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '
'] end end From 86e291e133ffcb120e4ff29c216b490fa4d94af8 Mon Sep 17 00:00:00 2001 From: David Butler Date: Wed, 25 May 2016 15:34:21 -0700 Subject: [PATCH 2/2] Test newlines and carriage returns --- spec/truncate_html/html_string_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb index ebc45cd..b60e165 100644 --- a/spec/truncate_html/html_string_spec.rb +++ b/spec/truncate_html/html_string_spec.rb @@ -9,7 +9,7 @@ def html_string(original_string) describe '#html_tokens' do it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do - html = '

Hi there

This is sweet!

squaremeter m²

Non-breaking space here: 
' + html = "

Hi there

This is sweet!

\r\n

squaremeter m²

Non-breaking\nspace here: 
" html_string(html).html_tokens.should == ['

', 'Hi', ' ', 'there', '

', ' ', '

', 'This', ' ', 'is', ' ', 'sweet!', '

', ' ', '

', ' ', 'squaremeter', ' ', 'm²', ' ', '

', '
', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '
'] end