From 458ab1074a47154f6614b12b051584681f8667f9 Mon Sep 17 00:00:00 2001
From: David Butler <dwbutler@ucla.edu>
Date: Wed, 25 May 2016 15:31:30 -0700
Subject: [PATCH 1/2] Handle whitespace more generically

This fixes the issue of non-breaking spaces being discarded, resulting in words being joined together. In Ruby, `\s` matches only ASCII whitespace (`[ \t\r\n\f\v]`) and `\S` is its negation, so neither treats the non-breaking space (U+00A0, character 160) that HTML often uses as whitespace. A more general approach is to use the Unicode-aware POSIX bracket expression `[[:space:]]` instead. According to the `Regexp` documentation, `[[:space:]]` matches "Whitespace character ([:blank:], newline, carriage return, etc.)"

Fixes https://github.com/hgmnz/truncate_html/issues/68
---
 lib/truncate_html.rb                   | 2 +-
 lib/truncate_html/html_string.rb       | 6 ++----
 spec/truncate_html/html_string_spec.rb | 4 ++--
 3 files changed, 5 insertions(+), 7 deletions(-)
diff --git a/lib/truncate_html.rb b/lib/truncate_html.rb
index 95b7923..3987501 100644
--- a/lib/truncate_html.rb
+++ b/lib/truncate_html.rb
@@ -7,7 +7,7 @@
 TruncateHtml.configure do |config|
   config.length        = 100
   config.omission      = '...'
-  config.word_boundary = /\S/
+  config.word_boundary = /(?![[:space:]])./
 end
 
 
diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
index 76d82e9..62d8675 100644
--- a/lib/truncate_html/html_string.rb
+++ b/lib/truncate_html/html_string.rb
@@ -3,7 +3,7 @@ module TruncateHtml
   class HtmlString < String
 
     UNPAIRED_TAGS = %w(br hr img).freeze
-    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze
 
     def initialize(original_html)
       super(original_html)
@@ -13,9 +13,7 @@ def html_tokens
       scan(REGEX).map do |token|
         HtmlString.new(
           token.gsub(
-            /\n/,' ' #replace newline characters with a whitespace
-          ).gsub(
-            /\s+/, ' ' #clean out extra consecutive whitespace
+            /[[:space:]]+/, ' ' #clean out extra consecutive whitespace
           )
         )
       end
diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
index 064f87b..ebc45cd 100644
--- a/spec/truncate_html/html_string_spec.rb
+++ b/spec/truncate_html/html_string_spec.rb
@@ -9,9 +9,9 @@ def html_string(original_string)
 
   describe '#html_tokens' do
     it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
-      html = '<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² </p>'
+      html = '<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² </p><div>Non-breaking space here: </div>'
       html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
-        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>']
+        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>', '<div>', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '</div>']
     end
   end
 

From 86e291e133ffcb120e4ff29c216b490fa4d94af8 Mon Sep 17 00:00:00 2001
From: David Butler <dwbutler@ucla.edu>
Date: Wed, 25 May 2016 15:34:21 -0700
Subject: [PATCH 2/2] Test newlines and carriage returns

---
 spec/truncate_html/html_string_spec.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
index ebc45cd..b60e165 100644
--- a/spec/truncate_html/html_string_spec.rb
+++ b/spec/truncate_html/html_string_spec.rb
@@ -9,7 +9,7 @@ def html_string(original_string)
 
   describe '#html_tokens' do
     it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
-      html = '<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² </p><div>Non-breaking space here: </div>'
+      html = "<h1>Hi there</h1> <p>This          is sweet!</p> \r\n<p> squaremeter m² </p><div>Non-breaking\nspace here: </div>"
       html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
         ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>', '<div>', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '</div>']
     end