diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb
index 76d82e9..5be96dc 100644
--- a/lib/truncate_html/html_string.rb
+++ b/lib/truncate_html/html_string.rb
@@ -3,7 +3,23 @@ module TruncateHtml
   class HtmlString < String
 
     UNPAIRED_TAGS = %w(br hr img).freeze
-    REGEX = /(?:<script.*>.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze
+    TAG_BODY_CHARACTERS = # Members of the character class that REGEX uses to match text between tags
+      '[[:alnum:]]' + # Match Unicode alphanumeric characters
+      '\p{Sc}' + # Match Unicode currency symbols
+      '\p{So}' + # Match Unicode "other symbol" characters
+      '[\p{Sm}&&[^<]]' + # Match Unicode math symbols except ASCII "<", because "<" opens HTML tags
+      '[\p{Zs}&&[^\s]]' + # Match Unicode space separators that \s does not already cover
+      %q(\|＾｀￣`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match some special characters
+      '[[:punct:]]' # Match punctuation so Chinese punctuation characters are not gobbled up
+    REGEX = %r{
+      (?:<script.*>.*<\/script>)+ # Match script tags. They aren't counted in length.
+      |
+      <\/?[^>]+> # Match html tags
+      |
+      \s+ # Match consecutive spaces. They are later truncated to a single space.
+      |
+      [#{TAG_BODY_CHARACTERS}]+ # Match tag body
+    }x.freeze # x (extended) flag: the layout whitespace and "#" comments in the pattern above are not part of the match
 
     def initialize(original_html)
       super(original_html)
diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb
index 0da14d6..f326078 100644
--- a/spec/truncate_html/html_truncator_spec.rb
+++ b/spec/truncate_html/html_truncator_spec.rb
@@ -204,4 +204,15 @@ def truncate(html, opts = {})
         '<h1>hello <!-- stuff --> and <!-- la -->...</h1>'
     end
   end
+
+  it "doesn't gobble up non alphabetical unicode characters" do # Round-trip check: the expected string below is the same as the input
+    truncate('＋<br />ー<br />〜<br />＝<br />─<br />a　(double-byte space)<br />￥<br />＆<br />％<br />＃<br />＄<br />！<br />？<br />＞＜<br />・<br />／<br />「」<br />＠<br />、。', length: 100).should ==
+      '＋<br />ー<br />〜<br />＝<br />─<br />a　(double-byte space)<br />￥<br />＆<br />％<br />＃<br />＄<br />！<br />？<br />＞＜<br />・<br />／<br />「」<br />＠<br />、。'
+  end
+
+  it "doesn't gobble up halfwidth and fullwidth forms of unicode charecters" do # Round-trip check over halfwidth/fullwidth characters followed by ASCII digits
+    input = '！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～｟｠｡｢｣､･ｦｧｨｩｪｫｬｭｮｯｰｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜﾝﾞﾡﾢﾣﾤﾥﾦﾧﾨﾩﾪﾫﾬﾭﾮﾯﾰﾱﾲﾳﾴﾵﾶﾷﾸﾹﾺﾻﾼﾽﾾￂￃￄￅￆￇￊￋￌￍￎￏￒￓￔￕￖￗￚￛￜ￠￡￢￣￤￥￦￨￩￪￫￬￭￮0123456789'
+    output = truncate(input, length: 300)
+    output.should == input # The output is expected to equal the input exactly
+  end
 end