Fix handling of whitespace/non-breaking spaces

sonjapeterson · sonjapeterson · commit a61fa95f175c · 2016-11-30T10:05:44.000-05:00
This is based on a currently unmerged PR to the main repo: hgmnz#69. It fixes an issue where truncate_html would remove non-breaking spaces, resulting in words being joined together incorrectly. Now they are treated like any other whitespace.
diff --git a/lib/truncate_html.rb b/lib/truncate_html.rb
@@ -7,7 +7,7 @@
 TruncateHtml.configure do |config|
   config.length        = 100
   config.omission      = '...'
-  config.word_boundary = /\S/
+  config.word_boundary = /(?![[:space:]])./
 end
 
 
diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
@@ -3,7 +3,7 @@ module TruncateHtml
   class HtmlString < String
 
     UNPAIRED_TAGS = %w(br hr img).freeze
-    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|[[:space:]]+|[[:punct:]]/.freeze
 
     def initialize(original_html)
       super(original_html)
@@ -13,9 +13,7 @@ def html_tokens
       scan(REGEX).map do |token|
         HtmlString.new(
           token.gsub(
-            /\n/,' ' #replace newline characters with a whitespace
-          ).gsub(
-            /\s+/, ' ' #clean out extra consecutive whitespace
+            /[[:space:]]+/, ' ' #clean out extra consecutive whitespace
           )
         )
       end
diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
@@ -9,9 +9,9 @@ def html_string(original_string)
 
   describe '#html_tokens' do
     it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
-      html = '<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² </p>'
+      html = "<h1>Hi there</h1> <p>This          is sweet!</p> \r\n<p> squaremeter m² </p><div>Non-breaking\nspace here: </div>"
       html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
-        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>']
+        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>', '<div>', 'Non-breaking', ' ', 'space', ' ', 'here:', ' ', '</div>']
     end
   end