From e4719390cbdeb4c89f1e1bc8dbd531c5bbf9b2c9 Mon Sep 17 00:00:00 2001
From: abonec <abonec@gmail.com>
Date: Wed, 19 Feb 2014 18:06:25 +0400
Subject: [PATCH] treat words divided by no-break space as a single token

---
 lib/truncate_html/html_string.rb       | 2 +-
 spec/truncate_html/html_string_spec.rb | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
index 76d82e9..9cd410e 100644
--- a/lib/truncate_html/html_string.rb
+++ b/lib/truncate_html/html_string.rb
@@ -3,7 +3,7 @@ module TruncateHtml
   class HtmlString < String
 
     UNPAIRED_TAGS = %w(br hr img).freeze
-    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?\u00a0]+|\s+|[[:punct:]]/.freeze
 
     def initialize(original_html)
       super(original_html)
diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
index 064f87b..4e5396c 100644
--- a/spec/truncate_html/html_string_spec.rb
+++ b/spec/truncate_html/html_string_spec.rb
@@ -9,9 +9,9 @@ def html_string(original_string)
 
   describe '#html_tokens' do
     it 'returns each token in the string as an array element removing any consecutive whitespace from the string' do
-      html = '<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² </p>'
+      html = "<h1>Hi there</h1> <p>This          is sweet!</p> <p> squaremeter m² and no\u00a0break space </p>"
       html_string(html).html_tokens.should == ['<h1>', 'Hi', ' ', 'there', '</h1>', ' ', '<p>', 'This', ' ', 'is', ' ', 'sweet!', '</p>',
-        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', '</p>']
+        ' ', '<p>', ' ', 'squaremeter', ' ', 'm²', ' ', 'and', ' ', "no\u00a0break", ' ', 'space', ' ', '</p>']
     end
   end