hgmnz · AlexGunslinger · Oct 31, 2014 · Oct 31, 2014 · Nov 3, 2014
diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
@@ -3,21 +3,16 @@ module TruncateHtml
   class HtmlString < String
 
     UNPAIRED_TAGS = %w(br hr img).freeze
-    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    REGEX    = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    HTMLTAGS = /<script\b[^>]*>([\s\S]*?)<\/script>|<("[^"]*"|'[^']*'|[^'">])*>/.freeze
 
     def initialize(original_html)
       super(original_html)
     end
 
     def html_tokens
       scan(REGEX).map do |token|
-        HtmlString.new(
-          token.gsub(
-            /\n/,' ' #replace newline characters with a whitespace
-          ).gsub(
-            /\s+/, ' ' #clean out extra consecutive whitespace
-          )
-        )
+        HtmlString.new(token).replace_newline.clean_whitespaces
       end
     end
 
@@ -37,5 +32,17 @@ def matching_close_tag
       gsub(/<(\w+)\s?.*>/, '</\1>').strip
     end
 
+    def clean_html # strips every HTMLTAGS match, then normalizes newlines and whitespace
+      self.class.new(gsub(HTMLTAGS, '')).replace_newline.clean_whitespaces
+    end
+
+    def replace_newline # each "\n" becomes a single space
+      self.class.new(gsub(/\n/, ' ')) # re-wrap: on Ruby 3.0+ gsub returns a plain String
+    end
+
+    def clean_whitespaces # each run of whitespace collapses to a single space
+      self.class.new(gsub(/\s+/, ' ')) # re-wrap so the result keeps this class's methods
+    end
+
   end
 end
diff --git a/lib/truncate_html/html_truncator.rb b/lib/truncate_html/html_truncator.rb
@@ -3,16 +3,17 @@ class HtmlTruncator
 
     def initialize(original_html, options = {})
       @original_html   = original_html
-      length           = options[:length]       || TruncateHtml.configuration.length
+      @length          = options[:length]       || TruncateHtml.configuration.length
       @omission        = options[:omission]     || TruncateHtml.configuration.omission
       @word_boundary   = (options.has_key?(:word_boundary) ? options[:word_boundary] : TruncateHtml.configuration.word_boundary)
       @break_token     = options[:break_token] || TruncateHtml.configuration.break_token || nil
-      @chars_remaining = length - @omission.length
+      @chars_remaining = @length - @omission.length
       @open_tags, @closing_tags, @truncated_html = [], [], ['']
     end
 
     def truncate
       return @omission if @chars_remaining < 0
+      return @original_html if return_html?
       @original_html.html_tokens.each do |token|
         if @chars_remaining <= 0 || truncate_token?(token)
           close_open_tags
@@ -93,5 +94,9 @@ def remove_latest_open_tag(close_tag)
     def truncate_token?(token)
       @break_token and token == @break_token
     end
+
+    def return_html? # true when the tag-free text fits in @length and no token equals @break_token
+      @original_html.clean_html.length <= @length && @original_html.html_tokens.none? { |token| token == @break_token }
+    end
   end
 end
diff --git a/spec/truncate_html/html_string_spec.rb b/spec/truncate_html/html_string_spec.rb
@@ -79,4 +79,37 @@ def html_string(original_string)
       html_string('foo').should_not be_html_comment
     end
   end
+
+  describe '#replace_newline' do
+    it 'returns the string with whitespaces instead of newlines' do
+      lines = ['This is a string.', 'With newlines.', 'Dont want them.']
+
+      html     = lines.join("\n") # input: one sentence per line
+      expected = lines.join(' ')  # same sentences separated by single spaces
+
+      html_string(html).replace_newline.should == expected
+    end
+  end
+
+  describe '#clean_whitespaces' do
+    it 'returns the string with only single whitespaces' do
+      html     = 'This is a string.  With double  white  spaces.' # three runs of two spaces
+      expected = 'This is a string. With double white spaces.'
+      html_string(html).clean_whitespaces.should == expected
+    end
+  end
+
+  describe '#clean_html' do
+    it 'returns the html string without any html tags' do
+      markup = '<b>This is bold</b>. <script> alert("Dont show!") </script> <div class="this-class">This will show</div>'
+      text   = 'This is bold. This will show' # the script element goes away together with its body
+      html_string(markup).clean_html.should == text
+    end
+
+    it 'returns the html string without any comments' do
+      markup = '<b>This is bold</b>. <!-- this is a comment --> And this will show'
+      text   = 'This is bold. And this will show'
+      html_string(markup).clean_html.should == text
+    end
+  end
 end
diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb
@@ -15,12 +15,12 @@ def truncate(html, opts = {})
 
     it 'retains the tags within the text' do
       html = 'some text <span class="caps">CAPS</span> some text'
-      truncate(html, :length => 25, :word_boundary => false).should == 'some text <span class="caps">CAPS</span> some te...'
+      truncate(html, :length => 19, :word_boundary => false).should == 'some text <span class="caps">CAPS</span> s...'
     end
 
     context 'and a custom omission value is passed' do
       it 'retains the omission text' do
-        truncate("testtest", :length => 10, :omission => '..', :word_boundary => false).should == 'testtest..'
+        truncate("testtest", :length => 7, :omission => '..', :word_boundary => false).should == 'testt..'
       end
 
       it 'handles multibyte characters' do
@@ -204,4 +204,34 @@ def truncate(html, opts = {})
         '<h1>hello <!-- stuff --> and <!-- la -->...</h1>'
     end
   end
+
+  context 'when the clean string length is the same as the length param' do
+    it 'does not truncate the string' do
+      html = 'exact string length' # 19 characters, exactly the requested length
+      truncate(html, length: 19).should == html
+    end
+
+    it 'does not truncate the string even if it contains html tags' do
+      html = '<b>exact</b> <span class="this-class">string</span> length'
+      truncate(html, length: 19).should == html
+    end
+
+    it 'does not truncate the string even if it contains html comments' do
+      html = 'exact <!-- stuff --> string length'
+      truncate(html, length: 19).should == html
+    end
+
+    context 'when the break_token is set' do
+      it 'truncates before the break_token if included in the string' do
+        html = 'exact <!-- stuff --> string length'
+        expected = 'exact <!-- stuff --> string'
+        truncate(html, length: 19, break_token: 'length').should == expected
+      end
+
+      it 'does not truncate before if break token is not in the string' do
+        html = 'exact <!-- stuff --> string length'
+        truncate(html, length: 19, break_token: 'nothere').should == html
+      end
+    end
+  end
 end