Skip to content

Update to UnicodeData 17.0.0 #292

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Merged 8 commits on Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ endif()
if(UTF8PROC_ENABLE_TESTING)
enable_testing()
file(MAKE_DIRECTORY data)
set(UNICODE_VERSION 16.0.0)
set(UNICODE_VERSION 17.0.0)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/NormalizationTest.txt ${CMAKE_BINARY_DIR}/data/NormalizationTest.txt SHOW_PROGRESS)
file(DOWNLOAD https://www.unicode.org/Public/${UNICODE_VERSION}/ucd/auxiliary/GraphemeBreakTest.txt ${CMAKE_BINARY_DIR}/data/GraphemeBreakTest.txt SHOW_PROGRESS)
add_executable(case test/tests.h test/tests.c utf8proc.h test/case.c)
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ The C library is found in this directory after successful compilation
and is named `libutf8proc.a` (for the static library) and
`libutf8proc.so` (for the dynamic library).

The Unicode version supported is 16.0.0.
The Unicode version supported is 17.0.0.

For Unicode normalizations, the following options are used:

Expand Down
2 changes: 1 addition & 1 deletion data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ utf8proc_data.c.new: data_generator.jl $(RAWDATA)
$(JULIA) --project=. data_generator.jl > $@

# Unicode data version (must also update utf8proc_unicode_version function)
UNICODE_VERSION=16.0.0
UNICODE_VERSION=17.0.0

UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
Expand Down
5 changes: 3 additions & 2 deletions test/graphemetest.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ void checkline(const char *_buf, bool verbose) {
bi += 1;
}
else { /* hex-encoded codepoint */
size_t len = encode((unsigned char*) (src + si), buf + bi) - 1;
while (src[si]) ++si; /* advance to NUL termination */
size_t dest_len;
size_t len = encode((unsigned char*) (src + si), &dest_len, buf + bi) - 1;
si += dest_len; /* advance to NUL termination */
bi += len;
}
}
Expand Down
5 changes: 3 additions & 2 deletions test/iscase.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@ int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
size_t len = simple_getline(buf, f);
size_t pos = skipspaces(buf, 0);
unsigned char s[16];
size_t s_len;
if (pos == len || buf[pos] == '#') return 0;
pos += encode(s, buf + pos) - 1;
pos += encode(s, &s_len, buf + pos) - 1;
check(s[0], "invalid line %s in data", buf);
utf8proc_iterate((utf8proc_uint8_t*) s, -1, start);
if (buf[pos] == '.' && buf[pos+1] == '.') {
encode(s, buf + pos + 2);
encode(s, &s_len, buf + pos + 2);
check(s[0], "invalid line %s in data", buf);
utf8proc_iterate((utf8proc_uint8_t*) s, -1, end);
}
Expand Down
11 changes: 6 additions & 5 deletions test/normtest.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ int main(int argc, char **argv)

if (buf[0] == '#') continue;

offset = encode(source, buf);
offset += encode(NFC, buf + offset);
offset += encode(NFD, buf + offset);
offset += encode(NFKC, buf + offset);
offset += encode(NFKD, buf + offset);
size_t len;
offset = encode(source, &len, buf);
offset += encode(NFC, &len, buf + offset);
offset += encode(NFD, &len, buf + offset);
offset += encode(NFKC, &len, buf + offset);
offset += encode(NFKD, &len, buf + offset);

CHECK_NORM(NFC, NFC, source);
CHECK_NORM(NFC, NFC, NFC);
Expand Down
3 changes: 2 additions & 1 deletion test/tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ size_t skipspaces(const unsigned char *buf, size_t i)
separated by whitespace, and terminated by any character not in
[0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
in dest, returning the number of bytes read from buf */
size_t encode(unsigned char *dest, const unsigned char *buf)
size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf)
{
size_t i = 0, j;
utf8proc_ssize_t d = 0;
Expand All @@ -38,6 +38,7 @@ size_t encode(unsigned char *dest, const unsigned char *buf)
; /* find end of hex input */
if (j == i) { /* no codepoint found */
dest[d] = 0; /* NUL-terminate destination string */
*dest_len = (size_t)d;
return i + 1;
}
check(sscanf((char *) (buf + i), "%x", (unsigned int *)&c) == 1, "invalid hex input %s", buf+i);
Expand Down
2 changes: 1 addition & 1 deletion test/tests.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ extern size_t lineno;

void check(int cond, const char *format, ...);
size_t skipspaces(const unsigned char *buf, size_t i);
size_t encode(unsigned char *dest, const unsigned char *buf);
size_t encode(unsigned char *dest, size_t *dest_len, const unsigned char *buf);
size_t simple_getline(unsigned char buf[8192], FILE *f);
2 changes: 1 addition & 1 deletion utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
}

UTF8PROC_DLLEXPORT const char *utf8proc_unicode_version(void) {
return "16.0.0";
return "17.0.0";
}

UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
Expand Down
Loading
Loading