src/nxt_utf8.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273


/*
 * Copyright (C) Igor Sysoev
 * Copyright (C) NGINX, Inc.
 */

#include <nxt_main.h>

/*
 * The nxt_unicode_lowcase.h file is auto-generated from the
 * CaseFolding-6.3.0.txt file provided by Unicode, Inc.:
 *
 *   ./lib/src/nxt_unicode_lowcase.pl CaseFolding-6.3.0.txt
 *
 * This file should be copied to system specific nxt_unicode_SYSTEM_lowcase.h
 * file and utf8_file_name_test should be built with this file.
 * Then a correct system specific file should be generated:
 *
 *   ./build/utf8_file_name_test | ./lib/src/nxt_unicode_lowcase.pl
 *
 * Only common and simple case foldings are supported.  Full case folding
 * is not supported.  Combined characters are also not supported.
 */

#if (NXT_MACOSX)
#include <nxt_unicode_macosx_lowcase.h>

#else
#include <nxt_unicode_lowcase.h>
#endif


/*
 * nxt_utf8_encode() stores the UTF-8 encoding of the code point "u"
 * at "p" and returns the pointer just past the last byte written.
 * One to four bytes are written depending on the value of "u", so the
 * destination must have room for up to four bytes.
 *
 * NULL is returned, and nothing is written, if "u" is above 0x10FFFF.
 * Code points in the surrogate range 0xD800 - 0xDFFF are not rejected
 * and are encoded as ordinary three-byte sequences.
 */

u_char *
nxt_utf8_encode(u_char *p, uint32_t u)
{
    if (u <= 0x7F) {
        /* 0xxxxxxx */
        p[0] = (u_char) u;

        return p + 1;
    }

    if (u <= 0x07FF) {
        /* 110xxxxx 10xxxxxx */
        p[0] = (u_char) (0xC0 | (u >> 6));
        p[1] = (u_char) (0x80 | (u & 0x3F));

        return p + 2;
    }

    if (u <= 0xFFFF) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        p[0] = (u_char) (0xE0 | (u >> 12));
        p[1] = (u_char) (0x80 | ((u >> 6) & 0x3F));
        p[2] = (u_char) (0x80 | (u & 0x3F));

        return p + 3;
    }

    if (u > 0x10FFFF) {
        /* Outside of the Unicode code space. */
        return NULL;
    }

    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    p[0] = (u_char) (0xF0 | (u >> 18));
    p[1] = (u_char) (0x80 | ((u >> 12) & 0x3F));
    p[2] = (u_char) (0x80 | ((u >> 6) & 0x3F));
    p[3] = (u_char) (0x80 | (u & 0x3F));

    return p + 4;
}


/*
 * nxt_utf8_decode() decodes one UTF-8 sequence starting at *start and
 * returns a valid character 0x00 - 0x10FFFF, or 0xFFFFFFFF for an
 * invalid or overlong UTF-8 sequence.
 *
 * A single-byte (ASCII) character is handled here and advances *start
 * by one byte; everything else is delegated to nxt_utf8_decode2().
 * The first byte is read without being compared with "end", so the
 * caller must guarantee that at least one byte is available.
 */

uint32_t
nxt_utf8_decode(const u_char **start, const u_char *end)
{
    const u_char  *p;

    p = *start;

    if (*p >= 0x80) {
        /* A multi-byte sequence or an invalid lead byte. */
        return nxt_utf8_decode2(start, end);
    }

    *start = p + 1;

    return (uint32_t) *p;
}


/*
 * nxt_utf8_decode2() decodes only UTF-8 sequences of two or more bytes
 * and returns a valid character 0x80 - 0x10FFFF, or 0xFFFFFFFF for an
 * invalid, overlong, or truncated UTF-8 sequence.
 *
 * The lead byte at *start is read without being compared with "end",
 * so the caller must guarantee that at least one byte is available.
 * On success *start is advanced past the decoded sequence; on failure
 * it is left unchanged.
 *
 * Code points in the surrogate range 0xD800 - 0xDFFF are not rejected.
 */

uint32_t
nxt_utf8_decode2(const u_char **start, const u_char *end)
{
    u_char        c;
    size_t        n;
    uint32_t      u, overlong;
    const u_char  *p;

    p = *start;
    u = (uint32_t) *p;

    /*
     * Classify the lead byte: "n" is the number of continuation bytes
     * which must follow and "overlong" is the largest code point which
     * could have been encoded in a shorter sequence.
     */

    if (u >= 0xE0) {

        if (u >= 0xF0) {

            if (nxt_slow_path(u > 0xF4)) {
                /*
                 * The maximum valid Unicode character is 0x10FFFF
                 * which is encoded as 0xF4 0x8F 0xBF 0xBF.
                 */
                return 0xFFFFFFFF;
            }

            u &= 0x07;
            overlong = 0x00FFFF;
            n = 3;

        } else {
            u &= 0x0F;
            overlong = 0x07FF;
            n = 2;
        }

    } else if (u >= 0xC2) {

        /* 0x80 is encoded as 0xC2 0x80. */

        u &= 0x1F;
        overlong = 0x007F;
        n = 1;

    } else {
        /*
         * u < 0xC2: a stray continuation byte (0x80 - 0xBF) or
         * the 0xC0/0xC1 lead bytes which can start only overlong
         * sequences.
         */
        return 0xFFFFFFFF;
    }

    p++;

    /*
     * Compare the number of remaining bytes with "n" instead of
     * computing "p + n": forming a pointer more than one element past
     * the end of the buffer is undefined behavior.
     */
    if (nxt_slow_path(p >= end || (size_t) (end - p) < n)) {
        return 0xFFFFFFFF;
    }

    do {
        c = *p++;
        /*
         * The byte must be in the 0x80 - 0xBF range.
         * Values below 0x80 become >= 0x80 after the unsigned
         * subtraction, so a single comparison rejects both sides.
         */
        c = c - 0x80;

        if (nxt_slow_path(c > 0x3F)) {
            return 0xFFFFFFFF;
        }

        u = (u << 6) | c;
        n--;

    } while (n != 0);

    if (overlong < u && u < 0x110000) {
        *start = p;
        return u;
    }

    return 0xFFFFFFFF;
}


/*
 * nxt_utf8_casecmp() tests only up to the minimum of given lengths, but
 * requires lengths of both strings because otherwise nxt_utf8_decode2()
 * may fail due to incomplete sequence.
 */

nxt_int_t
nxt_utf8_casecmp(const u_char *start1, const u_char *start2, size_t len1,
    size_t len2)
{
    int32_t       n;
    uint32_t      u1, u2;
    const u_char  *end1, *end2;

    end1 = start1 + len1;
    end2 = start2 + len2;

    while (start1 < end1 && start2 < end2) {

        u1 = nxt_utf8_lowcase(&start1, end1);

        u2 = nxt_utf8_lowcase(&start2, end2);

        if (nxt_slow_path((u1 | u2) == 0xFFFFFFFF)) {
            return NXT_UTF8_SORT_INVALID;
        }

        n = u1 - u2;

        if (n != 0) {
            return (nxt_int_t) n;
        }
    }

    return 0;
}


/*
 * nxt_utf8_lowcase() decodes one UTF-8 character at *start and returns
 * its mapping from the lowcase tables.  ASCII characters are looked up
 * directly in nxt_unicode_block_000[].  Other characters are decoded
 * with nxt_utf8_decode2() and looked up in nxt_unicode_blocks[];
 * a character above NXT_UNICODE_MAX_LOWCASE or in a block without
 * a table is returned as is.  This includes the 0xFFFFFFFF value
 * returned by nxt_utf8_decode2() for an invalid sequence.
 */

uint32_t
nxt_utf8_lowcase(const u_char **start, const u_char *end)
{
    uint32_t        cp;
    const uint32_t  *table;

    cp = (uint32_t) **start;

    if (nxt_slow_path(cp >= 0x80)) {
        cp = nxt_utf8_decode2(start, end);

        if (cp > NXT_UNICODE_MAX_LOWCASE) {
            return cp;
        }

        table = nxt_unicode_blocks[cp / NXT_UNICODE_BLOCK_SIZE];

        if (table == NULL) {
            /* There are no mappings in this block. */
            return cp;
        }

        return table[cp % NXT_UNICODE_BLOCK_SIZE];
    }

    (*start)++;

    return nxt_unicode_block_000[cp];
}


/*
 * nxt_utf8_length() returns the number of UTF-8 characters in the
 * "len" bytes starting at "p", or -1 if nxt_utf8_decode() reports
 * an invalid sequence.  An empty string has the length 0.
 */

ssize_t
nxt_utf8_length(const u_char *p, size_t len)
{
    ssize_t       count;
    const u_char  *last;

    last = p + len;

    /* nxt_utf8_decode() advances "p" on each successfully decoded char. */
    for (count = 0; p < last; count++) {

        if (nxt_slow_path(nxt_utf8_decode(&p, last) == 0xFFFFFFFF)) {
            return -1;
        }
    }

    return count;
}


/*
 * nxt_utf8_is_valid() returns 1 if the "len" bytes starting at "p" can
 * be decoded completely by nxt_utf8_decode(), and 0 once an invalid
 * sequence is found.  An empty string is valid.
 */

nxt_bool_t
nxt_utf8_is_valid(const u_char *p, size_t len)
{
    uint32_t      u;
    const u_char  *last;

    for (last = p + len; p < last; /* void */) {
        u = nxt_utf8_decode(&p, last);

        if (nxt_slow_path(u == 0xFFFFFFFF)) {
            return 0;
        }
    }

    return 1;
}