Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/normalize.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
0
/* Individual code points that the grapheme cluster break logic in this file
 * tests for by value, listed in ascending code point order. */
#define UNI_CP_ZERO_WIDTH_NON_JOINER 0x200C /* U+200C ZERO WIDTH NON-JOINER */
#define UNI_CP_ZERO_WIDTH_JOINER     0x200D /* U+200D ZERO WIDTH JOINER */
#define UNI_CP_FEMALE_SIGN           0x2640 /* U+2640 FEMALE SIGN */
#define UNI_CP_MALE_SIGN             0x2642 /* U+2642 MALE SIGN */
6
7
/* Maps outside-world normalization form codes to our internal set, validating
8
 * that we got something valid. */
9
0
MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in) {
10
0
    switch (form_in) {
11
0
    case 1: return MVM_NORMALIZE_NFC;
12
0
    case 2: return MVM_NORMALIZE_NFD;
13
0
    case 3: return MVM_NORMALIZE_NFKC;
14
0
    case 4: return MVM_NORMALIZE_NFKD;
15
0
    default: MVM_exception_throw_adhoc(tc, "Invalid normalization form %d", (int)form_in);
16
0
    }
17
0
}
18
19
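/* MVM_unicode_normalize_codepoints (defined after the two helpers below)
20
 * takes two objects, which must be of VMArray representation and holding
 * 32-bit integers. It normalizes the first to the specified form and stores
 * the result in the second. */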
21
0
/* Verifies that arr is a concrete object with the VMArray representation
 * whose slot type is a signed or unsigned 32-bit integer. Returns normally
 * when it is; otherwise throws an ad-hoc exception whose message is the
 * supplied error text. */
static void assert_codepoint_array(MVMThreadContext *tc, const MVMObject *arr, char *error) {
    MVMint32 usable = 0;
    if (IS_CONCRETE(arr) && REPR(arr)->ID == MVM_REPR_ID_VMArray) {
        MVMArrayREPRData *repr_data = (MVMArrayREPRData *)STABLE(arr)->REPR_data;
        MVMuint8 kind = repr_data->slot_type;
        usable = kind == MVM_ARRAY_I32 || kind == MVM_ARRAY_U32;
    }
    if (!usable)
        MVM_exception_throw_adhoc(tc, "%s", error);
}
29
0
/* Makes sure the buffer *result has room for more than needed elements.
 * When the current capacity in *result_alloc is too small, it is raised in
 * steps of 32 until it exceeds needed and the buffer is reallocated to that
 * capacity; both out-parameters are updated. Otherwise nothing changes. */
MVM_STATIC_INLINE void maybe_grow_result(MVMCodepoint **result, MVMint64 *result_alloc, MVMint64 needed) {
    MVMint64 capacity = *result_alloc;
    if (needed < capacity)
        return;
    do {
        capacity += 32;
    } while (needed >= capacity);
    *result_alloc = capacity;
    *result       = MVM_realloc(*result, capacity * sizeof(MVMCodepoint));
}
36
0
/* Normalizes the codepoints held in the array in to the requested form and
 * stores the result in the array out. Both objects are validated with
 * assert_codepoint_array, which throws if either is not a native array of
 * 32-bit integers. An empty input returns early and leaves out untouched. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMArray      *source, *target;
    MVMCodepoint  *input, *result;
    MVMint64       input_codes, result_alloc, idx;
    MVMint64       result_pos = 0;
    MVMint32       ready;

    /* Both arrays have to be usable before we touch their bodies. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
    source = (MVMArray *)in;
    target = (MVMArray *)out;

    /* Locate the live part of the input; nothing to do if it is empty. */
    input_codes = source->body.elems;
    input       = (MVMCodepoint *)source->body.slots.u32 + source->body.start;
    if (input_codes == 0)
        return;

    /* Start with an output buffer as large as the input. */
    result_alloc = input_codes;
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));

    /* Feed every input codepoint through the normalizer, collecting whatever
     * it reports as ready after each one. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    for (idx = 0; idx < input_codes; idx++) {
        MVMCodepoint first;
        ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, input[idx], &first);
        if (!ready)
            continue;
        maybe_grow_result(&result, &result_alloc, result_pos + ready);
        result[result_pos++] = first;
        for (ready--; ready > 0; ready--)
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Signal end of input and drain what is still buffered. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    for (; ready != 0; ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Hand the buffer over to the output array. */
    target->body.slots.u32 = (MVMuint32 *) result;
    target->body.start     = 0;
    target->body.elems     = result_pos;
}
84
0
/* Builds an NFG string from a C array of codepoints.
 *
 * cp_v points at cp_count codepoints. A count of zero yields the VM's shared
 * empty string constant. Otherwise the codepoints are run through an NFG
 * normalizer and the resulting graphemes are stored, as 32-bit grapheme
 * storage, in a freshly allocated MVMString which is returned. */
MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count) {
    MVMNormalizer  norm;
    MVMint64       input_pos, result_pos, result_alloc;
    MVMGrapheme32 *result;
    MVMint32       ready;
    MVMString     *str;

    if (cp_count == 0)
        return tc->instance->str_consts.empty;

    /* Guess output size based on cp_v size. The buffer holds graphemes, so
     * size it from the buffer's own element type rather than MVMCodepoint. */
    result_alloc = cp_count;
    result       = MVM_malloc(result_alloc * sizeof(*result));

    /* Perform normalization at grapheme level. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    input_pos  = 0;
    result_pos = 0;
    while (input_pos < cp_count) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp_v[input_pos], &g);
        if (ready) {
            /* g is the first ready grapheme; the rest are fetched one by one. */
            maybe_grow_result(&result, &result_alloc, result_pos + ready);
            result[result_pos++] = g;
            while (--ready > 0)
                result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        input_pos++;
    }

    /* Signal end of input and drain whatever is still buffered. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&result, &result_alloc, result_pos + ready);
    while (ready--)
        result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Produce an MVMString of the result; it takes over the buffer. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = result;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = result_pos;
    return str;
}
127
128
/* Takes an object, which must be of VMArray representation and holding
129
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
130
 * Grapheme level, and returns the resulting NFG string. */
131
0
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, const MVMObject *codes) {
132
0
    MVMCodepoint  *input;
133
0
    MVMint64       input_codes;
134
0
135
0
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
136
0
137
0
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
138
0
    input_codes = ((MVMArray *)codes)->body.elems;
139
0
    return MVM_unicode_codepoints_c_array_to_nfg_string(tc, input, input_codes);
140
0
}
141
142
/* Takes an NFG string and populates the array out, which must be a 32-bit
143
 * integer array, with codepoints normalized according to the specified
144
 * normalization form. */
145
0
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
146
0
    MVMCodepoint     *result;
147
0
    MVMint64          result_pos, result_alloc;
148
0
    MVMCodepointIter  ci;
149
0
150
0
    /* Validate output array and set up result storage. */
151
0
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
152
0
    result_alloc = s->body.num_graphs;
153
0
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));
154
0
    result_pos   = 0;
155
0
156
0
    /* Create codepoint iterator. */
157
0
    MVM_string_ci_init(tc, &ci, s, 0);
158
0
159
0
    /* If we want NFC, just iterate, since NFG is constructed out of NFC. */
160
0
    if (form == MVM_NORMALIZE_NFC) {
161
0
        while (MVM_string_ci_has_more(tc, &ci)) {
162
0
            maybe_grow_result(&result, &result_alloc, result_pos + 1);
163
0
            result[result_pos++] = MVM_string_ci_get_codepoint(tc, &ci);
164
0
        }
165
0
    }
166
0
167
0
    /* Otherwise, need to feed it through a normalizer. */
168
0
    else {
169
0
        MVMNormalizer norm;
170
0
        MVMint32      ready;
171
0
        MVM_unicode_normalizer_init(tc, &norm, form);
172
0
        while (MVM_string_ci_has_more(tc, &ci)) {
173
0
            MVMCodepoint cp;
174
0
            ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &cp);
175
0
            if (ready) {
176
0
                maybe_grow_result(&result, &result_alloc, result_pos + ready);
177
0
                result[result_pos++] = cp;
178
0
                while (--ready > 0)
179
0
                    result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
180
0
            }
181
0
        }
182
0
        MVM_unicode_normalizer_eof(tc, &norm);
183
0
        ready = MVM_unicode_normalizer_available(tc, &norm);
184
0
        maybe_grow_result(&result, &result_alloc, result_pos + ready);
185
0
        while (ready--)
186
0
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
187
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
188
0
    }
189
0
190
0
    /* Put result into array body. */
191
0
    ((MVMArray *)out)->body.slots.u32 = (MVMuint32 *)result;
192
0
    ((MVMArray *)out)->body.start     = 0;
193
0
    ((MVMArray *)out)->body.elems     = result_pos;
194
0
}
195
196
/* Initialize the MVMNormalizer pointed to to perform the specified kind of
197
 * normalization. */
198
42.0k
void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization form) {
199
42.0k
    n->form               = form;
200
42.0k
    n->buffer_size        = 32;
201
42.0k
    n->buffer             = MVM_malloc(n->buffer_size * sizeof(MVMCodepoint));
202
42.0k
    n->buffer_start       = 0;
203
42.0k
    n->buffer_end         = 0;
204
42.0k
    n->buffer_norm_end    = 0;
205
42.0k
    n->translate_newlines = 0;
206
42.0k
    switch (n->form) {
207
0
        case MVM_NORMALIZE_NFD:
208
0
            n->first_significant    = MVM_NORMALIZE_FIRST_SIG_NFD;
209
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFD_QC;
210
0
            break;
211
0
        case MVM_NORMALIZE_NFKD:
212
0
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKD;
213
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFKD_QC;
214
0
            break;
215
0
        case MVM_NORMALIZE_NFC:
216
0
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;
217
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFC_QC;
218
0
            break;
219
0
        case MVM_NORMALIZE_NFKC:
220
0
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKC;
221
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFKC_QC;
222
0
            break;
223
42.0k
        case MVM_NORMALIZE_NFG:
224
42.0k
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFG_QC;
225
42.0k
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;
226
42.0k
            break;
227
0
        default:
228
0
            abort();
229
42.0k
    }
230
42.0k
}
231
232
/* Enable translation of newlines from \r\n to \n. */
233
187
void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n) {
234
187
    n->translate_newlines = 1;
235
187
}
236
237
/* Cleanup an MVMNormalization once we're done normalizing. */
238
42.0k
void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n) {
239
42.0k
    free(n->buffer);
240
42.0k
}
241
242
/* Adds a codepoint into the buffer, making sure there's space. */
243
78.0k
static void add_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
244
78.0k
    if (n->buffer_end == n->buffer_size) {
245
1.01k
        if (n->buffer_start != 0) {
246
1.01k
            MVMint32 shuffle = n->buffer_start;
247
1.01k
            MVMint32 to_move = n->buffer_end - n->buffer_start;
248
1.01k
            memmove(n->buffer, n->buffer + n->buffer_start, to_move * sizeof(MVMCodepoint));
249
1.01k
            n->buffer_start = 0;
250
1.01k
            n->buffer_end -= shuffle;
251
1.01k
            n->buffer_norm_end -= shuffle;
252
1.01k
        }
253
0
        else {
254
0
            n->buffer_size *= 2;
255
0
            n->buffer = MVM_realloc(n->buffer, n->buffer_size * sizeof(MVMCodepoint));
256
0
        }
257
1.01k
    }
258
78.0k
    n->buffer[n->buffer_end++] = cp;
259
78.0k
}
260
261
/* Constants for the algorithmic treatment of Hangul syllables, taken from
 * section 3.12 of the Unicode spec and named as they are there. */
static const int SBase  = 0xAC00;
static const int LBase  = 0x1100;
static const int VBase  = 0x1161;
static const int TBase  = 0x11A7;
static const int LCount = 19;
static const int VCount = 21;
static const int TCount = 28;
static const int NCount = 588;   /* VCount * TCount */
static const int SCount = 11172; /* LCount * NCount */
269
270
/* Decomposes a Hangul codepoint and add it into the buffer. */
271
0
static void decomp_hangul_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint s) {
272
0
    /* Algorithm from Unicode spec 3.12, following naming convention from spec. */
273
0
    int SIndex = s - SBase;
274
0
    if (SIndex < 0 || SIndex >= SCount) {
275
0
        add_codepoint_to_buffer(tc, n, s);
276
0
    }
277
0
    else {
278
0
        int L = LBase + SIndex / NCount;
279
0
        int V = VBase + (SIndex % NCount) / TCount;
280
0
        int T = TBase + SIndex % TCount;
281
0
        add_codepoint_to_buffer(tc, n, (MVMCodepoint)L);
282
0
        add_codepoint_to_buffer(tc, n, (MVMCodepoint)V);
283
0
        if (T != TBase)
284
0
            add_codepoint_to_buffer(tc, n, (MVMCodepoint)T);
285
0
    }
286
0
}
287
288
/* Decompose the codepoint and add it into the buffer. */
289
1.54k
static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
290
1.54k
    /* See if we actually need to decompose (can skip if the decomposition
291
1.54k
     * type is None, or we're only doing Canonical decomposition and it is
292
1.54k
     * anything except Canonical). */
293
1.54k
    MVMint16 cp_DT = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE);
294
1.54k
    MVMint64 decompose = 1;
295
1.54k
    if (cp_DT == MVM_UNICODE_PVALUE_DT_NONE)
296
1.52k
        decompose = 0;
297
22
    else if (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && cp_DT != MVM_UNICODE_PVALUE_DT_CANONICAL )
298
11
        decompose = 0;
299
1.54k
    if (decompose) {
300
11
        /* We need to decompose. Get the decomp spec and go over the things in
301
11
         * it; things without a decomp spec are presumably Hangul and need the
302
11
         * algorithmic treatment. */
303
11
        char *spec = (char *)MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMP_SPEC);
304
11
        if (spec && spec[0]) {
305
11
            char *end = spec + strlen(spec);
306
22
            while (spec < end) {
307
11
                /* Parse hex character code, and then recurse to do any further
308
11
                * decomposition on it; this recursion terminates when we find a
309
11
                * non-decomposable thing and add it to the buffer. */
310
11
                MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &spec, 16);
311
11
                decomp_codepoint_to_buffer(tc, n, decomp_char);
312
11
            }
313
11
        }
314
0
        else {
315
0
            decomp_hangul_to_buffer(tc, n, cp);
316
0
        }
317
11
    }
318
1.53k
    else {
319
1.53k
        /* Don't need to decompose; add it right into the buffer. */
320
1.53k
        add_codepoint_to_buffer(tc, n, cp);
321
1.53k
    }
322
1.54k
}
323
324
/* Checks if the specified character answers "yes" on the appropriate quick check. */
325
152k
static MVMint64 passes_quickcheck(MVMThreadContext *tc, const MVMNormalizer *n, MVMCodepoint cp) {
326
152k
    const char *pval = MVM_unicode_codepoint_get_property_cstr(tc, cp, n->quick_check_property);
327
152k
    return pval && pval[0] == 'Y';
328
152k
}
329
330
/* Gets the canonical combining class for a codepoint. */
331
214k
static MVMint64 ccc(MVMThreadContext *tc, MVMCodepoint cp) {
332
214k
    if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC) {
333
113k
        return 0;
334
113k
    }
335
101k
    else {
336
101k
        const char *ccc_str = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);
337
101k
        return !ccc_str || strlen(ccc_str) > 3 ? 0 : fast_atoi(ccc_str);
338
101k
    }
339
214k
}
340
341
/* Checks if the thing we have is a control character (for the definition in
342
 * the Unicode Standard Annex #29). Assumes it doesn't have to care about any
343
 * of the controls in the Latin-1 range, because those were already covered in
344
 * a fast path. */
345
53.6k
static MVMint32 is_control_beyond_latin1(MVMThreadContext *tc, MVMCodepoint in) {
346
53.6k
    /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are excluded. */
347
53.6k
    if (in != UNI_CP_ZERO_WIDTH_NON_JOINER && in != UNI_CP_ZERO_WIDTH_JOINER) {
348
53.6k
        /* Consider general property. */
349
53.6k
        const char *genprop = MVM_unicode_codepoint_get_property_cstr(tc, in,
350
53.6k
            MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
351
53.6k
        switch (genprop[0]) {
352
2.80k
            case 'Z':
353
2.80k
                /* Line_Separator and Paragraph_Separator are controls. */
354
2.66k
                return genprop[1] == 'l' || genprop[1] == 'p';
355
281
            case 'C':
356
281
                /* Control, Surrogate are controls. */
357
281
                if (genprop[1] == 'c' || genprop[1] == 's') {
358
0
                    return 1;
359
0
                }
360
281
                if (genprop[1] == 'f' ) {
361
277
                    /* Format can have special properties (not control) */
362
277
                    return 0;
363
277
                }
364
281
                /* Unassigned is, but only for Default_Ignorable_Code_Point. */
365
4
                if (genprop[1] == 'n') {
366
0
                    return MVM_unicode_codepoint_get_property_int(tc, in,
367
0
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;
368
0
                }
369
53.6k
        }
370
53.6k
    }
371
50.5k
    return 0;
372
53.6k
}
373
374
/* Implements the Unicode Canonical Ordering algorithm (3.11, D109). */
375
62.6k
static void canonical_sort(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
376
62.6k
    /* Yes, this is the simplest possible thing. Key thing if you decide to
377
62.6k
     * replace it with something more optimal: it must not re-order code
378
62.6k
     * points with equal CCC. */
379
62.6k
    MVMint32 reordered = 1;
380
125k
    while (reordered) {
381
62.6k
        MVMint32 i = from;
382
62.6k
        reordered = 0;
383
78.2k
        while (i < to - 1) {
384
15.5k
            MVMint64 cccA = ccc(tc, n->buffer[i]);
385
15.5k
            MVMint64 cccB = ccc(tc, n->buffer[i + 1]);
386
15.5k
            if (cccA > cccB && cccB > 0) {
387
0
                MVMCodepoint tmp = n->buffer[i];
388
0
                n->buffer[i] = n->buffer[i + 1];
389
0
                n->buffer[i + 1] = tmp;
390
0
                reordered = 1;
391
0
            }
392
15.5k
            i++;
393
15.5k
        }
394
62.6k
    }
395
62.6k
}
396
397
/* Implements the Unicode Canonical Composition algorithm (3.11, D117). */
398
62.6k
static void canonical_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
399
62.6k
    MVMint32 c_idx = from + 1;
400
78.2k
    while (c_idx < to) {
401
15.5k
        /* Search for the last non-blocked starter. */
402
15.5k
        MVMint32 ss_idx = c_idx - 1;
403
15.5k
        MVMint32 c_ccc  = ccc(tc, n->buffer[c_idx]);
404
15.5k
        while (ss_idx >= from) {
405
15.5k
            /* Make sure we don't go past a code point that blocks a starter
406
15.5k
             * from the current character we're considering. */
407
15.5k
            MVMint32 ss_ccc = ccc(tc, n->buffer[ss_idx]);
408
15.5k
            if (ss_ccc >= c_ccc && ss_ccc != 0)
409
0
                break;
410
15.5k
411
15.5k
            /* Have we found a starter? */
412
15.5k
            if (ss_ccc == 0) {
413
15.5k
                /* See if there's a primary composite for the starter and the
414
15.5k
                 * current code point under consideration. */
415
15.5k
                MVMCodepoint pc = MVM_unicode_find_primary_composite(tc, n->buffer[ss_idx], n->buffer[c_idx]);
416
15.5k
                if (pc > 0) {
417
2
                    /* Replace the starter with the primary composite. */
418
2
                    n->buffer[ss_idx] = pc;
419
2
420
2
                    /* Move the rest of the buffer back one position. */
421
2
                    memmove(n->buffer + c_idx, n->buffer + c_idx + 1,
422
2
                        (n->buffer_end - (c_idx + 1)) * sizeof(MVMCodepoint));
423
2
                    n->buffer_end--;
424
2
425
2
                    /* Sync cc_idx and to with the change. */
426
2
                    c_idx--;
427
2
                    to--;
428
2
                }
429
15.5k
430
15.5k
                /* Don't look back beyond this starter; covers the ccc(B) = 0
431
15.5k
                 * case of D105. */
432
15.5k
                break;
433
15.5k
            }
434
0
            ss_idx--;
435
0
        }
436
15.5k
437
15.5k
        /* Move on to the next character. */
438
15.5k
        c_idx++;
439
15.5k
    }
440
62.6k
441
62.6k
    /* Make another pass for the Hangul special case. (A future optimization
442
62.6k
     * may be to incorporate this into the above loop.) */
443
62.6k
    c_idx = from;
444
78.2k
    while (c_idx < to - 1) {
445
15.5k
        /* Do we have a potential LPart? */
446
15.5k
        MVMCodepoint LPart = n->buffer[c_idx];
447
15.5k
        if (LPart >= LBase && LPart <= (LBase + LCount)) {
448
0
            /* Yes, now see if it's followed by a VPart (always safe to look
449
0
             * due to "to - 1" in loop condition above). */
450
0
            MVMCodepoint LIndex = LPart - LBase;
451
0
            MVMCodepoint VPart  = n->buffer[c_idx + 1];
452
0
            if (VPart >= VBase && VPart <= (VBase + VCount)) {
453
0
                /* Certainly something to compose; compute that. */
454
0
                MVMCodepoint VIndex = VPart - VBase;
455
0
                MVMCodepoint LVIndex = LIndex * NCount + VIndex * TCount;
456
0
                MVMCodepoint s = SBase + LVIndex;
457
0
                MVMint32 composed = 1;
458
0
459
0
                /* Is there a TPart too? */
460
0
                if (c_idx < to - 2) {
461
0
                    MVMCodepoint TPart  = n->buffer[c_idx + 2];
462
0
                    if (TPart >= TBase && TPart <= (TBase + TCount)) {
463
0
                        /* We need to compose 3 things. */
464
0
                        MVMCodepoint TIndex = TPart - TBase;
465
0
                        s += TIndex;
466
0
                        composed = 2;
467
0
                    }
468
0
                }
469
0
470
0
                /* Put composed codepoint into the buffer. */
471
0
                n->buffer[c_idx] = s;
472
0
473
0
                /* Shuffle codepoints after this in the buffer back. */
474
0
                memmove(n->buffer + c_idx + 1, n->buffer + c_idx + 1 + composed,
475
0
                        (n->buffer_end - (c_idx + 1 + composed)) * sizeof(MVMCodepoint));
476
0
                n->buffer_end -= composed;
477
0
478
0
                /* Sync to with updated buffer size. */
479
0
                to -= composed;
480
0
            }
481
0
        }
482
15.5k
        c_idx++;
483
15.5k
    }
484
62.6k
}
485
486
/* The helpers below support grapheme_composition, defined after them, which
487
 * performs grapheme composition (to get Normal Form Grapheme) on the range
488
 * of codepoints provided. This follows the algorithm in the Unicode Standard
489
 * Annex #29 on grapheme cluster boundaries. Note that we have already done
490
 * the handling of breaking around controls much earlier, so don't have to
 * consider that case. */
491
15.1k
/* Cheap range pre-filter used before Hangul property lookups: returns 1 when
 * cp lies in 0x1100..0x11FF or in 0xA960..0xD7FB, and 0 otherwise. */
static MVMint32 maybe_hangul(MVMCodepoint cp) {
    if (cp >= 0x1100 && cp < 0x1200)
        return 1;
    if (cp >= 0xA960 && cp < 0xD7FC)
        return 1;
    return 0;
}
494
0
/* Returns the integer value of the Grapheme_Extend property of cp. */
static MVMint32 is_grapheme_extend(MVMThreadContext *tc, MVMCodepoint cp) {
    MVMint32 extend = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_GRAPHEME_EXTEND);
    return extend;
}
498
104k
/* Returns the integer value of the Prepended_Concatenation_Mark property of
 * cp. */
static MVMint32 is_grapheme_prepend(MVMThreadContext *tc, MVMCodepoint cp) {
    MVMint32 prepend = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_PREPENDED_CONCATENATION_MARK);
    return prepend;
}
502
15.5k
/* Decides whether a grapheme cluster boundary lies between the adjacent
 * codepoints a and b. Returns 1 to break and 0 to keep them together.
 *
 * Checks run in this order: the CR/LF rules, the Hangul syllable rules (only
 * when both codepoints pass the cheap maybe_hangul range test), rules keyed
 * on the Grapheme_Cluster_Break value of a, then rules keyed on that of b.
 * When nothing applies the answer is to break. */
static MVMint32 should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b) {
    int GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
    int GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);

    /* Don't break between \r and \n, but otherwise break around \r. */
    if (a == 0x0D && b == 0x0A)
        return 0;
    if (a == 0x0D || b == 0x0D)
        return 1;

    /* Hangul. Avoid property lookup with a couple of quick range checks. */
    if (maybe_hangul(a) && maybe_hangul(b)) {
        const char *hst_a = MVM_unicode_codepoint_get_property_cstr(tc, a,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);
        const char *hst_b = MVM_unicode_codepoint_get_property_cstr(tc, b,
            MVM_UNICODE_PROPERTY_HANGUL_SYLLABLE_TYPE);

        /* Other users of this lookup in the file allow for a NULL result;
         * strcmp on NULL is undefined, so skip the Hangul rules in that case
         * and fall back to the generic ones below. */
        if (hst_a && hst_b) {
            if (strcmp(hst_a, "L") == 0)
                return !(strcmp(hst_b, "L") == 0 || strcmp(hst_b, "V") == 0 ||
                         strcmp(hst_b, "LV") == 0 || strcmp(hst_b, "LVT") == 0);
            else if (strcmp(hst_a, "LV") == 0 || strcmp(hst_a, "V") == 0)
                return !(strcmp(hst_b, "V") == 0 || strcmp(hst_b, "T") == 0);
            else if (strcmp(hst_a, "LVT") == 0 || strcmp(hst_a, "T") == 0)
                return !(strcmp(hst_b, "T") == 0);
        }
    }

    switch (GCB_a) {
        /* Don't break between two regional indicators. */
        case MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR:
            if ( GCB_b == MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR )
                return 0;
            break;
        /* Don't break after Grapheme_Cluster_Break=Prepend ... */
        case MVM_UNICODE_PVALUE_GCB_PREPEND:
            /* ... unless a control character follows. */
            if (is_control_beyond_latin1(tc, b )) {
                return 1;
            }
            return 0;
        /* Don't break after ZWJ for E_Base_GAZ, ZWJ or Glue_After_ZWJ. */
        case MVM_UNICODE_PVALUE_GCB_ZWJ:
            switch (GCB_b) {
                case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ:
                case MVM_UNICODE_PVALUE_GCB_ZWJ:
                case MVM_UNICODE_PVALUE_GCB_GLUE_AFTER_ZWJ:
                    return 0;
            }
            /* Don't break after ZWJ if it's an Emoji Sequence. At the moment
             * FEMALE SIGN and MALE SIGN don't have different GCB properties,
             * or any special Emoji properties (Unicode 9.0), so we explicitly
             * check these codepoints here. */
            if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN )
                return 0;
            /* fallthrough */
            /* NOTE(review): there is no break here, so a ZWJ that reaches
             * this point also runs the E_Modifier check below. That check
             * cannot return for it (the sign test above already failed), so
             * the behavior is kept as it was; confirm it is intended. */
        case MVM_UNICODE_PVALUE_GCB_E_MODIFIER:
            if (MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_EMOJI_MODIFIER_BASE)) {
                if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN )
                    return 0;
            }
            break;
    }

    switch (GCB_b) {
        /* Don't break before extending chars. */
        case MVM_UNICODE_PVALUE_GCB_EXTEND:
            return 0;
        /* Don't break before ZWJ. */
        case MVM_UNICODE_PVALUE_GCB_ZWJ:
            return 0;
        /* Don't break between an emoji base and its modifier. A modifier
         * that follows an Extend inside an emoji sequence is not handled,
         * since no state is kept between calls. */
        case MVM_UNICODE_PVALUE_GCB_E_MODIFIER:
            switch (GCB_a) {
                case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ:
                case MVM_UNICODE_PVALUE_GCB_E_BASE:
                    return 0;
            }
            break;
        /* Don't break before spacing marks. */
        case MVM_UNICODE_PVALUE_GCB_SPACINGMARK:
            return 0;
    }

    /* Otherwise break. */
    return 1;
}
588
62.6k
/* Collapses the codepoints in n->buffer[from..to) into graphemes. Every run
 * of codepoints that should_break reports no boundary inside is handed to
 * MVM_nfg_codes_to_grapheme, and the grapheme that comes back is written over
 * the front of the range. Anything in the buffer from `to` onwards is then
 * shifted down to close the gap and buffer_end is reduced to match. A range
 * holding fewer than two codepoints is left untouched. */
static void grapheme_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
    MVMint32 run_start, write_at, cur;

    if (to - from < 2)
        return;

    run_start = from;
    write_at  = from;
    for (cur = from; cur < to; cur++) {
        MVMint32      after = cur + 1;
        MVMGrapheme32 grapheme;

        /* No boundary after this codepoint, so the current run continues.
         * The bounds test must come first so we never look at n->buffer[to]. */
        if (after != to && !should_break(tc, n->buffer[cur], n->buffer[after]))
            continue;

        /* End of a run (or of the range): turn it into a single grapheme. */
        grapheme = MVM_nfg_codes_to_grapheme(tc, n->buffer + run_start, after - run_start);
        if (n->translate_newlines && grapheme == MVM_nfg_crlf_grapheme(tc))
            grapheme = '\n';
        n->buffer[write_at++] = grapheme;

        /* The next run begins right after this one (harmless when we have
         * already reached the end of the range). */
        run_start = after;
    }

    /* Close the hole left between the graphemes we wrote and `to`. */
    memmove(n->buffer + write_at, n->buffer + to, (n->buffer_end - to) * sizeof(MVMCodepoint));
    n->buffer_end -= to - write_at;
}
613
614
/* Called when the very fast case of normalization fails (that is, when we get
615
 * any two codepoints in a row where at least one is greater than the first
616
 * significant codepoint identified by a quick check for the target form). We
617
 * may find the quick check itself is enough; if not, we have to do real work
618
 * compute the normalization. */
619
104k
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
620
104k
    MVMint64 qc_in, ccc_in;
621
104k
    int is_prepend = is_grapheme_prepend(tc, in);
622
104k
    /* If it's a control character (outside of the range we checked in the
623
104k
     * fast path) then it's a normalization terminator. */
624
104k
    if (in > 0xFF && is_control_beyond_latin1(tc, in) && !is_prepend) {
625
272
        return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
626
272
    }
627
104k
628
104k
    /* Do a quickcheck on the codepoint we got in and get its CCC. */
629
104k
    qc_in  = passes_quickcheck(tc, n, in);
630
104k
    ccc_in = ccc(tc, in);
631
104k
632
104k
    /* Fast cases when we pass quick check and what we got in has CCC = 0. */
633
104k
    if (qc_in && ccc_in == 0) {
634
103k
        if (MVM_NORMALIZE_COMPOSE(n->form)) {
635
103k
            /* We're composing. If we have exactly one thing in the buffer and
636
103k
             * it also passes the quick check, and both it and the thing in the
637
103k
             * buffer have a CCC of zero, we can hand back the first of the
638
103k
             * two - effectively replacing what's in the buffer with the new
639
103k
             * codepoint coming in. Note that the NFG quick-check property
640
103k
             * factors in grapheme extenders that don't have a CCC of zero,
641
103k
             * so we're safe. */
642
103k
            if (n->buffer_end - n->buffer_start == 1) {
643
47.5k
                MVMCodepoint maybe_result = n->buffer[n->buffer_start];
644
47.5k
                if (passes_quickcheck(tc, n, maybe_result) && ccc(tc, maybe_result) == 0) {
645
47.5k
                    *out = n->buffer[n->buffer_start];
646
47.5k
                    n->buffer[n->buffer_start] = in;
647
47.5k
                    return 1;
648
47.5k
                }
649
47.5k
            }
650
103k
        }
651
0
        else {
652
0
            /* We're only decomposing. There should probably be nothing in the
653
0
             * buffer in this case; if so we can simply return the codepoint. */
654
0
            if (n->buffer_start == n->buffer_end) {
655
0
                *out = in;
656
0
                return 1;
657
0
            }
658
0
        }
659
103k
    }
660
104k
661
104k
    /* If we didn't pass quick check... */
662
57.1k
    if (!qc_in) {
663
1.14k
        /* If we're composing, then decompose the last thing placed in the
664
1.14k
         * buffer, if any. We need to do this since it may have passed
665
1.14k
         * quickcheck, but having seen some character that does pass then we
666
1.14k
         * must make sure we decomposed the prior passing one too. */
667
1.14k
        if (MVM_NORMALIZE_COMPOSE(n->form) && n->buffer_end != n->buffer_norm_end && !is_prepend) {
668
194
            MVMCodepoint decomp = n->buffer[n->buffer_end - 1];
669
194
            n->buffer_end--;
670
194
            decomp_codepoint_to_buffer(tc, n, decomp);
671
194
        }
672
1.14k
673
1.14k
        /* Decompose this new character into the buffer. We'll need to see
674
1.14k
         * more before we can go any further. */
675
1.14k
        decomp_codepoint_to_buffer(tc, n, in);
676
1.14k
        return 0;
677
1.14k
    }
678
57.1k
679
57.1k
    /* Since anything we have at this point does pass quick check, add it to
680
57.1k
     * the buffer directly. */
681
56.0k
    add_codepoint_to_buffer(tc, n, in);
682
56.0k
683
56.0k
    /* If the codepoint has a CCC that is non-zero, it's not a starter so we
684
56.0k
     * should see more before normalizing. */
685
56.0k
    if (ccc_in > 0)
686
0
        return 0;
687
56.0k
688
56.0k
    /* If we don't have at least one codepoint in the buffer, it's too early
689
56.0k
     * to hand anything back. */
690
56.0k
    if (n->buffer_end - n->buffer_start <= 1)
691
55.8k
        return 0;
692
56.0k
693
56.0k
    /* Perform canonical sorting on everything from the start of the not yet
694
56.0k
     * normalized things in the buffer, up to but excluding the quick-check
695
56.0k
     * passing thing we just added. */
696
157
    canonical_sort(tc, n, n->buffer_norm_end, n->buffer_end - 1);
697
157
698
157
    /* Perform canonical composition and grapheme composition if needed. */
699
157
    if (MVM_NORMALIZE_COMPOSE(n->form)) {
700
157
        canonical_composition(tc, n, n->buffer_norm_end, n->buffer_end - 1);
701
157
        if (MVM_NORMALIZE_GRAPHEME(n->form))
702
157
            grapheme_composition(tc, n, n->buffer_norm_end, n->buffer_end - 1);
703
157
    }
704
157
705
157
    /* We've now normalized all except the latest, quick-check-passing
706
157
     * codepoint. */
707
157
    n->buffer_norm_end = n->buffer_end - 1;
708
157
709
157
    /* Hand back a codepoint, and flag how many more are available. */
710
157
    *out = n->buffer[n->buffer_start];
711
157
    return n->buffer_norm_end - n->buffer_start++;
712
56.0k
}
713
714
/* Push a number of codepoints into the "to normalize" buffer. */
715
98
void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints) {
716
98
    MVMint32 i;
717
294
    for (i = 0; i < num_codepoints; i++)
718
196
        decomp_codepoint_to_buffer(tc, n, in[i]);
719
98
}
720
721
/* Processes a codepoint that we regard as a "normalization terminator". These
722
 * never have a decomposition, and for all practical purposes will not have a
723
 * combiner on them. We treat them specially so we don't, during I/O, block on
724
 * seeing a codepoint after them, which for things like REPLs that need to see
725
 * input right after a \n makes for problems. */
726
20.5k
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
727
20.5k
    /* Add the codepoint into the buffer. */
728
20.5k
    add_codepoint_to_buffer(tc, n, in);
729
20.5k
730
20.5k
    /* Treat this as an "eof", which really means "normalize what ya got". */
731
20.5k
    MVM_unicode_normalizer_eof(tc, n);
732
20.5k
733
20.5k
    /* Hand back a normalized codepoint, and the number available (have to
734
20.5k
     * compensate for the one we steal for *out). */
735
20.5k
    *out = MVM_unicode_normalizer_get_codepoint(tc, n);
736
20.5k
    return 1 + MVM_unicode_normalizer_available(tc, n);
737
20.5k
}
738
739
/* Called when we are expecting no more codepoints. */
740
62.5k
void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n) {
741
62.5k
    /* Perform canonical ordering and, if needed, canonical composition on
742
62.5k
     * what remains. */
743
62.5k
    canonical_sort(tc, n, n->buffer_norm_end, n->buffer_end);
744
62.5k
    if (MVM_NORMALIZE_COMPOSE(n->form)) {
745
62.5k
        canonical_composition(tc, n, n->buffer_norm_end, n->buffer_end);
746
62.5k
        if (MVM_NORMALIZE_GRAPHEME(n->form))
747
62.5k
            grapheme_composition(tc, n, n->buffer_norm_end, n->buffer_end);
748
62.5k
    }
749
62.5k
750
62.5k
    /* We've now normalized all that remains. */
751
62.5k
    n->buffer_norm_end = n->buffer_end;
752
62.5k
}