From e4719390cbdeb4c89f1e1bc8dbd531c5bbf9b2c9 Mon Sep 17 00:00:00 2001 From: abonec Date: Wed, 19 Feb 2014 18:06:25 +0400 Subject: [PATCH] treat words divided by no-break space as a single token --- lib/truncate_html/html_string.rb | 2 +- spec/truncate_html/html_string_spec.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 76d82e9..9cd410e 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -3,7 +3,7 @@ module TruncateHtml class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze - REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze + REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?\u00a0]+|\s+|[[:punct:]]/.freeze def initialize(original_html) super(original_html) diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb index 064f87b..4e5396c 100644 --- a/spec/truncate_html/html_string_spec.rb +++ b/spec/truncate_html/html_string_spec.rb @@ -9,9 +9,9 @@ def html_string(original_string) describe '#html_tokens' do it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do - html = '

Hi there

This is sweet!

squaremeter m²

' + html = "

Hi there

This is sweet!

squaremeter m² and no\u00a0break space

" html_string(html).html_tokens.should == ['

', 'Hi', ' ', 'there', '

', ' ', '

', 'This', ' ', 'is', ' ', 'sweet!', '

', - ' ', '

', ' ', 'squaremeter', ' ', 'm²', ' ', '

'] + ' ', '

', ' ', 'squaremeter', ' ', 'm²', ' ', 'and', ' ', "no\u00a0break", ' ', 'space', ' ', '

'] end end