diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
index 76d82e9..5be96dc 100644
--- a/lib/truncate_html/html_string.rb
+++ b/lib/truncate_html/html_string.rb
@@ -3,7 +3,23 @@ module TruncateHtml
class HtmlString < String
UNPAIRED_TAGS = %w(br hr img).freeze
- REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+ TAG_BODY_CHARACTERS = # Character-class fragments, concatenated and interpolated inside [...] in REGEX
+ '[[:alnum:]]' + # Match Unicode alphanumeric characters
+ '\p{Sc}' + # Match Unicode currency symbols (category Sc)
+ '\p{So}' + # Match Unicode "other symbol" characters (category So)
+ '[\p{Sm}&&[^<]]' + # Match Unicode math symbols (Sm) except ASCII <, because < opens HTML tags
+ '[\p{Zs}&&[^\s]]' + # Match Unicode space separators (Zs) that \s does not match; \s+ runs are matched separately
+ %q(\|^` ̄`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match an explicit list of special characters
+ '[[:punct:]]' # Match punctuation so that e.g. Chinese punctuation is not gobbled up
+ REGEX = %r{
+ (?:.*<\/script>)+ # Match script tags. They aren't counted in length.
+ |
+ <\/?[^>]+> # Match html tags
+ |
+ \s+ # Match consecutive spaces. They are later truncated to a single space.
+ |
+ [#{TAG_BODY_CHARACTERS}]+ # Match tag body
+ }x.freeze # The x flag lets the pattern be laid out with whitespace and # comments; alternatives are tried top to bottom
def initialize(original_html)
super(original_html)
diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb
index 0da14d6..f326078 100644
--- a/spec/truncate_html/html_truncator_spec.rb
+++ b/spec/truncate_html/html_truncator_spec.rb
@@ -204,4 +204,15 @@ def truncate(html, opts = {})
'hello and ...
'
end
end
+
+ it "doesn't gobble up non alphabetical unicode characters" do # Expects truncate (length: 100) to return this symbol-heavy string unchanged
+ truncate('+
ー
〜
=
─
a (double-byte space)
¥
&
%
#
$
!
?
><
・
/
「」
@
、。', length: 100).should ==
+ '+
ー
〜
=
─
a (double-byte space)
¥
&
%
#
$
!
?
><
・
/
「」
@
、。'
+ end
+
+ it "doesn't gobble up halfwidth and fullwidth forms of unicode charecters" do # NOTE(review): the input literal below shows an ASCII ' and \ where fullwidth forms are presumably intended - confirm the file encoding survived
+ input = '!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~⦅⦆。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙ᄀᄁᆪᄂᆬᆭᄃᄄᄅᆰᆱᆲᆳᆴᆵᄚᄆᄇᄈᄡᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ¢£¬ ̄¦¥₩│←↑→↓■○0123456789'
+ output = truncate(input, length: 300) # length: 300 is passed; the spec expects no truncation to occur
+ output.should == input # The result must equal the input exactly
+ end
end