.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze
def initialize(original_html)
super(original_html)
@@ -13,9 +13,7 @@ def html_tokens
scan(REGEX).map do |token|
HtmlString.new(
token.gsub(
- /\n/,' ' #replace newline characters with a whitespace
- ).gsub(
- /\s+/, ' ' #clean out extra consecutive whitespace
+ /[[:space:]]+/, ' ' #clean out extra consecutive whitespace
)
)
end
diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
index 064f87b..b60e165 100644
--- a/spec/truncate_html/html_string_spec.rb
+++ b/spec/truncate_html/html_string_spec.rb
@@ -9,9 +9,9 @@ def html_string(original_string)
describe '#html_tokens' do
it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
- html = 'Hi there
This is sweet!
squaremeter m²
'
+ html = "Hi there
This is sweet!
\r\n squaremeter m²
Non-breaking\nspace here:
"
html_string(html).html_tokens.should == ['', 'Hi', ' ', 'there', '
', ' ', '', 'This', ' ', 'is', ' ', 'sweet!', '
',
- ' ', '', ' ', 'squaremeter', ' ', 'm²', ' ', '
']
+ ' ', '', ' ', 'squaremeter', ' ', 'm²', ' ', '
', '', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '
']
end
end