From 0076a733372b63041c0efdeaff246bd279535b61 Mon Sep 17 00:00:00 2001 From: Claudia Date: Thu, 7 Sep 2017 00:57:56 +0200 Subject: [PATCH 1/2] Add failing test for combining characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In Unicode, a combining character is a character which can be stacked on top of the character preceding it. For example: - The character LATIN SMALL LETTER U (`u`) has the codepoint U+0075 assigned. - The character COMBINING DIAERESIS, which looks similar to the symbol `¨` but is actually a combining character, has the codepoint U+0308 assigned. - Writing both characters in sequence yields the letter `ü`, which looks just like the single precomposed character `ü` but is actually *two* characters. On a Mac, such combinations are very common, especially in filenames due to an oddity in the HFS+ filesystem. In the `applescript-json` library, however, the `encodeString` function always fails when the input contains a combining character. In detail, the code assumes that inside the `repeat with ch` loop, `ch` will always be a single character, and its `id` property will always return an integer. However, in reality `ch` will contain more than one character if a combining character is involved. Because of that, the `id` property will return a list instead of an integer. The code is not prepared to handle the list, which triggers the error. This commit adds a simple test case for the “u followed by ̈” scenario described above. It also includes the expected JSON output, which would be `u\u0308`.
--- tests.applescript | 1 + 1 file changed, 1 insertion(+) diff --git a/tests.applescript b/tests.applescript index 6a23d4f..7fbdf7e 100644 --- a/tests.applescript +++ b/tests.applescript @@ -32,6 +32,7 @@ assert_eq(json's encode("foo"), "\"foo\"") assert_eq(json's encode(""), "\"\"") assert_eq(json's encode("\n"), "\"\\u000a\"") assert_eq(json's encode("ș"), "\"\\u0219\"") +assert_eq(json's encode("u" & "̈"), "\"u\\u0308\"") assert_eq(json's encode("\"bar\""), "\"\\\"bar\\\"\"") assert_eq(json's encode("\\"), "\"\\\\\"") From 5684b4ec3c4ba4ad1febc5a3478e6c416e5f1184 Mon Sep 17 00:00:00 2001 From: Claudia Date: Thu, 7 Sep 2017 00:59:07 +0200 Subject: [PATCH 2/2] Add support for combining characters This commit fixes the bug described in the previous commit. The trick is to fetch the `id` property on the entire input string _before_ we iterate over it. That way, `id` returns a simple list of integer codepoints for the entire string, which can be iterated safely. --- json.applescript | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/json.applescript b/json.applescript index 97fa183..972e677 100644 --- a/json.applescript +++ b/json.applescript @@ -25,15 +25,22 @@ end on encodeString(value) set rv to "" - repeat with ch in value - if id of ch = 34 + set codepoints to id of value + + if (class of codepoints) is not list + set codepoints to {codepoints} + end + + repeat with codepoint in codepoints + set codepoint to codepoint as integer + if codepoint = 34 set quoted_ch to "\\\"" - else if id of ch = 92 then + else if codepoint = 92 then set quoted_ch to "\\\\" - else if id of ch >= 32 and id of ch < 127 - set quoted_ch to ch + else if codepoint >= 32 and codepoint < 127 + set quoted_ch to character id codepoint else - set quoted_ch to "\\u" & hex4(id of ch) + set quoted_ch to "\\u" & hex4(codepoint) end set rv to rv & quoted_ch end