diff --git a/lib/truncate_html/html_string.rb b/lib/truncate_html/html_string.rb index 76d82e9..5be96dc 100644 --- a/lib/truncate_html/html_string.rb +++ b/lib/truncate_html/html_string.rb @@ -3,7 +3,23 @@ module TruncateHtml class HtmlString < String UNPAIRED_TAGS = %w(br hr img).freeze - REGEX = /(?:.*<\/script>)+|<\/?[^>]+>|[[[:alpha:]][0-9]\|`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?]+|\s+|[[:punct:]]/.freeze + TAG_BODY_CHARACTERS = + '[[:alnum:]]' + # Match Unicode alphanumeric characters + '\p{Sc}' + # Match Unicode currency characters + '\p{So}' + # Match Unicode other symbols + '[\p{Sm}&&[^<]]' + # Match Unicode math symbols except ASCII <. < opens HTML tags. + '[\p{Zs}&&[^\s]]' + # Match Unicode space characters except \s+ + %q(\|^` ̄`~!@#\$%^&*\(\)\-_\+=\[\]{}:;'²³§",\.\/?) + # Match some special characters + '[[:punct:]]' # Don't gobble up Chinese punctuation characters + REGEX = %r{ + (?:.*<\/script>)+ # Match script tags. They aren't counted in length. + | + <\/?[^>]+> # Match html tags + | + \s+ # Match consecutive spaces. They are later truncated to a single space. + | + [#{TAG_BODY_CHARACTERS}]+ # Match tag body + }x.freeze def initialize(original_html) super(original_html) diff --git a/spec/truncate_html/html_truncator_spec.rb b/spec/truncate_html/html_truncator_spec.rb index 0da14d6..f326078 100644 --- a/spec/truncate_html/html_truncator_spec.rb +++ b/spec/truncate_html/html_truncator_spec.rb @@ -204,4 +204,15 @@ def truncate(html, opts = {}) '

hello and ...

' end end + + it "doesn't gobble up non alphabetical unicode characters" do + truncate('+




a (double-byte space)







><


「」

、。', length: 100).should == + '+




a (double-byte space)







><


「」

、。' + end + + it "doesn't gobble up halfwidth and fullwidth forms of unicode charecters" do + input = '!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~⦅⦆。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙ᄀᄁᆪᄂᆬᆭᄃᄄᄅᆰᆱᆲᆳᆴᆵᄚᄆᄇᄈᄡᄉᄊᄋᄌᄍᄎᄏᄐᄑ하ᅢᅣᅤᅥᅦᅧᅨᅩᅪᅫᅬᅭᅮᅯᅰᅱᅲᅳᅴᅵ¢£¬ ̄¦¥₩│←↑→↓■○0123456789' + output = truncate(input, length: 300) + output.should == input + end end