diff options
author | Igor Sysoev <igor@sysoev.ru> | 2017-01-17 20:00:00 +0300 |
---|---|---|
committer | Igor Sysoev <igor@sysoev.ru> | 2017-01-17 20:00:00 +0300 |
commit | 16cbf3c076a0aca6d47adaf3f719493674cf2363 (patch) | |
tree | e6530480020f62a2bdbf249988ec3e2a751d3927 /src/nxt_utf8.c | |
download | unit-16cbf3c076a0aca6d47adaf3f719493674cf2363.tar.gz unit-16cbf3c076a0aca6d47adaf3f719493674cf2363.tar.bz2 |
Initial version.
Diffstat (limited to 'src/nxt_utf8.c')
-rw-r--r-- | src/nxt_utf8.c | 273 |
1 files changed, 273 insertions, 0 deletions
diff --git a/src/nxt_utf8.c b/src/nxt_utf8.c new file mode 100644 index 00000000..56cd3dcd --- /dev/null +++ b/src/nxt_utf8.c @@ -0,0 +1,273 @@ + +/* + * Copyright (C) Igor Sysoev + * Copyright (C) NGINX, Inc. + */ + +#include <nxt_main.h> + +/* + * The nxt_unicode_lowcase.h file is the auto-generated file from + * the CaseFolding-6.3.0.txt file provided by Unicode, Inc.: + * + * ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt + * + * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h + * file and utf8_file_name_test should be built with this file. + * Then a correct system specific file should be generated: + * + * ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl + * + * Only common and simple case foldings are supported. Full case foldings + * is not supported. Combined characters are also not supported. + */ + +#if (NXT_MACOSX) +#include <nxt_unicode_macosx_lowcase.h> + +#else +#include <nxt_unicode_lowcase.h> +#endif + + +u_char * +nxt_utf8_encode(u_char *p, uint32_t u) +{ + if (u < 0x80) { + *p++ = (u_char) (u & 0xff); + return p; + } + + if (u < 0x0800) { + *p++ = (u_char) (( u >> 6) | 0xc0); + *p++ = (u_char) (( u & 0x3f) | 0x80); + return p; + } + + if (u < 0x10000) { + *p++ = (u_char) ( (u >> 12) | 0xe0); + *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); + *p++ = (u_char) (( u & 0x3f) | 0x80); + return p; + } + + if (u < 0x110000) { + *p++ = (u_char) ( (u >> 18) | 0xf0); + *p++ = (u_char) (((u >> 12) & 0x3f) | 0x80); + *p++ = (u_char) (((u >> 6) & 0x3f) | 0x80); + *p++ = (u_char) (( u & 0x3f) | 0x80); + return p; + } + + return NULL; +} + + +/* + * nxt_utf8_decode() decodes UTF-8 sequences and returns a valid + * character 0x00 - 0x10ffff, or 0xffffffff for invalid or overlong + * UTF-8 sequence. + */ + +uint32_t +nxt_utf8_decode(const u_char **start, const u_char *end) +{ + uint32_t u; + + u = (uint32_t) **start; + + if (u < 0x80) { + (*start)++; + return u; + } + + return nxt_utf8_decode2(start, end); +} + + +/* + * nxt_utf8_decode2() decodes two and more bytes UTF-8 sequences only + * and returns a valid character 0x80 - 0x10ffff, or 0xffffffff for + * invalid or overlong UTF-8 sequence. + */ + +uint32_t +nxt_utf8_decode2(const u_char **start, const u_char *end) +{ + u_char c; + size_t n; + uint32_t u, overlong; + const u_char *p; + + p = *start; + u = (uint32_t) *p; + + if (u >= 0xe0) { + + if (u >= 0xf0) { + + if (nxt_slow_path(u > 0xf4)) { + /* + * The maximum valid Unicode character is 0x10ffff + * which is encoded as 0xf4 0x8f 0xbf 0xbf. + */ + return 0xffffffff; + } + + u &= 0x07; + overlong = 0x00ffff; + n = 3; + + } else { + u &= 0x0f; + overlong = 0x07ff; + n = 2; + } + + } else if (u >= 0xc2) { + + /* 0x80 is encoded as 0xc2 0x80. */ + + u &= 0x1f; + overlong = 0x007f; + n = 1; + + } else { + /* u <= 0xc2 */ + return 0xffffffff; + } + + p++; + + if (nxt_fast_path(p + n <= end)) { + + do { + c = *p++; + /* + * The byte must in the 0x80 - 0xbf range. + * Values below 0x80 become >= 0x80. + */ + c = c - 0x80; + + if (nxt_slow_path(c > 0x3f)) { + return 0xffffffff; + } + + u = (u << 6) | c; + n--; + + } while (n != 0); + + if (overlong < u && u < 0x110000) { + *start = p; + return u; + } + } + + return 0xffffffff; +} + + +/* + * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but + * requires lengths of both strings because otherwise nxt_utf8_decode2() + * may fail due to incomplete sequence. + */ + +nxt_int_t +nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1, + size_t len2) +{ + int32_t n; + uint32_t u1, u2; + const u_char *end1, *end2; + + end1 = start1 + len1; + end2 = start2 + len2; + + while (start1 < end1 && start2 < end2) { + + u1 = nxt_utf8_lowcase(&start1, end1); + + u2 = nxt_utf8_lowcase(&start2, end2); + + if (nxt_slow_path((u1 | u2) == 0xffffffff)) { + return NXT_UTF8_SORT_INVALID; + } + + n = u1 - u2; + + if (n != 0) { + return (nxt_int_t) n; + } + } + + return 0; +} + + +uint32_t +nxt_utf8_lowcase(const u_char **start, const u_char *end) +{ + uint32_t u; + const uint32_t *block; + + u = (uint32_t) **start; + + if (nxt_fast_path(u < 0x80)) { + (*start)++; + + return nxt_unicode_block_000[u]; + } + + u = nxt_utf8_decode2(start, end); + + if (u <= NXT_UNICODE_MAX_LOWCASE) { + block = nxt_unicode_blocks[u / NXT_UNICODE_BLOCK_SIZE]; + + if (block != NULL) { + return block[u % NXT_UNICODE_BLOCK_SIZE]; + } + } + + return u; +} + + +ssize_t +nxt_utf8_length(const u_char *p, size_t len) +{ + ssize_t length; + const u_char *end; + + length = 0; + + end = p + len; + + while (p < end) { + if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { + return -1; + } + + length++; + } + + return length; +} + + +nxt_bool_t +nxt_utf8_is_valid(const u_char *p, size_t len) +{ + const u_char *end; + + end = p + len; + + while (p < end) { + if (nxt_slow_path(nxt_utf8_decode(&p, end) == 0xffffffff)) { + return 0; + } + } + + return 1; +} |