Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/normalize.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
0
#define UNI_CP_MALE_SIGN             0x2642
3
0
#define UNI_CP_FEMALE_SIGN           0x2640
4
59.2k
#define UNI_CP_ZERO_WIDTH_JOINER     0x200D
5
59.2k
#define UNI_CP_ZERO_WIDTH_NON_JOINER 0x200C
6
7
/* Maps outside-world normalization form codes to our internal set, validating
8
 * that we got something valid. */
9
3
MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in) {
10
3
    switch (form_in) {
11
3
    case 1: return MVM_NORMALIZE_NFC;
12
0
    case 2: return MVM_NORMALIZE_NFD;
13
0
    case 3: return MVM_NORMALIZE_NFKC;
14
0
    case 4: return MVM_NORMALIZE_NFKD;
15
0
    default: MVM_exception_throw_adhoc(tc, "Invalid normalization form %d", (int)form_in);
16
3
    }
17
3
}
18
19
/* Takes two objects, which must be of VMArray representation and holding
20
 * 32-bit integers. Performs normalization to the specified form. */
21
8
static void assert_codepoint_array(MVMThreadContext *tc, const MVMObject *arr, char *error) {
22
8
    if (IS_CONCRETE(arr) && REPR(arr)->ID == MVM_REPR_ID_VMArray) {
23
8
        MVMuint8 slot_type = ((MVMArrayREPRData *)STABLE(arr)->REPR_data)->slot_type;
24
8
        if (slot_type == MVM_ARRAY_I32 || slot_type == MVM_ARRAY_U32)
25
8
            return;
26
8
    }
27
0
    MVM_exception_throw_adhoc(tc, "%s", error);
28
0
}
29
96
/* Ensures the result buffer has room for strictly more than `needed`
 * elements. The allocation count is raised in steps of 32 until it exceeds
 * `needed`, and the buffer is reallocated once to the new count. Nothing
 * happens if the current allocation already exceeds `needed`. */
MVM_STATIC_INLINE void maybe_grow_result(MVMCodepoint **result, MVMint64 *result_alloc, MVMint64 needed) {
    MVMint64 capacity = *result_alloc;

    if (needed < capacity)
        return;

    do {
        capacity += 32;
    } while (needed >= capacity);

    *result_alloc = capacity;
    *result       = MVM_realloc(*result, capacity * sizeof(MVMCodepoint));
}
36
1
/* Normalizes the codepoints stored in the array `in` to the requested form
 * and stores the outcome in the array `out`. Both objects are validated with
 * assert_codepoint_array first. The normalized codepoints are gathered in a
 * freshly allocated buffer which is then installed as the slot storage of
 * `out`. When `in` holds no elements the function returns without touching
 * `out`. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form) {
    MVMNormalizer  norm;
    MVMArray      *src;
    MVMArray      *dest;
    MVMCodepoint  *codes_in;
    MVMCodepoint  *buf;
    MVMint64       idx, num_in, used, capacity;
    MVMint32       pending;

    /* Both arrays must be native 32-bit integer arrays. */
    assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers");
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Locate the input codepoints; nothing to do for an empty input. */
    src      = (MVMArray *)in;
    codes_in = (MVMCodepoint *)src->body.slots.u32 + src->body.start;
    num_in   = src->body.elems;
    if (num_in == 0)
        return;

    /* Start with an output buffer as large as the input. */
    capacity = num_in;
    buf      = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    /* Feed every input codepoint through the normalizer, draining whatever
     * it reports as ready after each one. */
    MVM_unicode_normalizer_init(tc, &norm, form);
    for (idx = 0; idx < num_in; idx++) {
        MVMCodepoint first;
        pending = MVM_unicode_normalizer_process_codepoint(tc, &norm, codes_in[idx], &first);
        if (!pending)
            continue;
        maybe_grow_result(&buf, &capacity, used + pending);
        buf[used++] = first;
        while (--pending > 0)
            buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    }

    /* Signal end of input and collect everything still buffered. */
    MVM_unicode_normalizer_eof(tc, &norm);
    pending = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&buf, &capacity, used + pending);
    for (; pending != 0; pending--)
        buf[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Hand the buffer over to the output array. */
    dest                 = (MVMArray *)out;
    dest->body.slots.u32 = (MVMuint32 *) buf;
    dest->body.start     = 0;
    dest->body.elems     = used;
}
84
53
/* Builds an NFG string from a C array of cp_count codepoints. The codepoints
 * are run through a normalizer in MVM_NORMALIZE_NFG mode and the resulting
 * graphemes become the 32-bit grapheme storage of a newly allocated
 * MVMString. A count of zero yields the shared empty string constant. */
MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count) {
    MVMNormalizer  norm;
    MVMGrapheme32 *graphs;
    MVMint64       idx, used, capacity;
    MVMint32       pending;
    MVMString     *str;

    if (cp_count == 0)
        return tc->instance->str_consts.empty;

    /* Start with room for one grapheme per input codepoint.
     * NOTE(review): the element size used here (and in maybe_grow_result) is
     * that of MVMCodepoint although the buffer holds MVMGrapheme32 values;
     * presumably both are the same width -- confirm. */
    capacity = cp_count;
    graphs   = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    /* Feed every codepoint through the grapheme-level normalizer, draining
     * whatever it reports as ready after each one. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    for (idx = 0; idx < cp_count; idx++) {
        MVMGrapheme32 first;
        pending = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp_v[idx], &first);
        if (!pending)
            continue;
        maybe_grow_result(&graphs, &capacity, used + pending);
        graphs[used++] = first;
        while (--pending > 0)
            graphs[used++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    }

    /* Signal end of input and collect everything still buffered. */
    MVM_unicode_normalizer_eof(tc, &norm);
    pending = MVM_unicode_normalizer_available(tc, &norm);
    maybe_grow_result(&graphs, &capacity, used + pending);
    for (; pending != 0; pending--)
        graphs[used++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Wrap the graphemes up in a string object. */
    str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage.blob_32 = graphs;
    str->body.storage_type    = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs      = used;
    return str;
}
127
128
/* Takes an object, which must be of VMArray representation and holding
129
 * 32-bit integers. Treats them as Unicode codepoints, normalizes them at
130
 * Grapheme level, and returns the resulting NFG string. */
131
4
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, const MVMObject *codes) {
132
4
    MVMCodepoint  *input;
133
4
    MVMint64       input_codes;
134
4
135
4
    assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers");
136
4
137
4
    input       = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start;
138
4
    input_codes = ((MVMArray *)codes)->body.elems;
139
4
    return MVM_unicode_codepoints_c_array_to_nfg_string(tc, input, input_codes);
140
4
}
141
142
/* Takes an NFG string and populates the array out, which must be a 32-bit
143
 * integer array, with codepoints normalized according to the specified
144
 * normalization form. */
145
2
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
146
2
    MVMCodepoint     *result;
147
2
    MVMint64          result_pos, result_alloc;
148
2
    MVMCodepointIter  ci;
149
2
150
2
    /* Validate output array and set up result storage. */
151
2
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");
152
2
    result_alloc = s->body.num_graphs;
153
2
    result       = MVM_malloc(result_alloc * sizeof(MVMCodepoint));
154
2
    result_pos   = 0;
155
2
156
2
    /* Create codepoint iterator. */
157
2
    MVM_string_ci_init(tc, &ci, s, 0, 0);
158
2
159
2
    /* If we want NFC, just iterate, since NFG is constructed out of NFC. */
160
2
    if (form == MVM_NORMALIZE_NFC) {
161
6
        while (MVM_string_ci_has_more(tc, &ci)) {
162
4
            maybe_grow_result(&result, &result_alloc, result_pos + 1);
163
4
            result[result_pos++] = MVM_string_ci_get_codepoint(tc, &ci);
164
4
        }
165
2
    }
166
2
167
2
    /* Otherwise, need to feed it through a normalizer. */
168
0
    else {
169
0
        MVMNormalizer norm;
170
0
        MVMint32      ready;
171
0
        MVM_unicode_normalizer_init(tc, &norm, form);
172
0
        while (MVM_string_ci_has_more(tc, &ci)) {
173
0
            MVMCodepoint cp;
174
0
            ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &cp);
175
0
            if (ready) {
176
0
                maybe_grow_result(&result, &result_alloc, result_pos + ready);
177
0
                result[result_pos++] = cp;
178
0
                while (--ready > 0)
179
0
                    result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
180
0
            }
181
0
        }
182
0
        MVM_unicode_normalizer_eof(tc, &norm);
183
0
        ready = MVM_unicode_normalizer_available(tc, &norm);
184
0
        maybe_grow_result(&result, &result_alloc, result_pos + ready);
185
0
        while (ready--)
186
0
            result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
187
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
188
0
    }
189
2
190
2
    /* Put result into array body. */
191
2
    ((MVMArray *)out)->body.slots.u32 = (MVMuint32 *)result;
192
2
    ((MVMArray *)out)->body.start     = 0;
193
2
    ((MVMArray *)out)->body.elems     = result_pos;
194
2
}
195
196
/* Initialize the MVMNormalizer pointed to to perform the specified kind of
197
 * normalization. */
198
56.6k
void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization form) {
199
56.6k
    n->form               = form;
200
56.6k
    n->buffer_size        = 32;
201
56.6k
    n->buffer             = MVM_malloc(n->buffer_size * sizeof(MVMCodepoint));
202
56.6k
    n->buffer_start       = 0;
203
56.6k
    n->buffer_end         = 0;
204
56.6k
    n->buffer_norm_end    = 0;
205
56.6k
    n->translate_newlines = 0;
206
56.6k
    n->prepend_buffer     = 0;
207
56.6k
    n->regional_indicator = 0;
208
56.6k
    switch (n->form) {
209
2.77k
        case MVM_NORMALIZE_NFD:
210
2.77k
            n->first_significant    = MVM_NORMALIZE_FIRST_SIG_NFD;
211
2.77k
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFD_QC;
212
2.77k
            break;
213
0
        case MVM_NORMALIZE_NFKD:
214
0
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKD;
215
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFKD_QC;
216
0
            break;
217
1
        case MVM_NORMALIZE_NFC:
218
1
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;
219
1
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFC_QC;
220
1
            break;
221
0
        case MVM_NORMALIZE_NFKC:
222
0
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKC;
223
0
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFKC_QC;
224
0
            break;
225
53.9k
        case MVM_NORMALIZE_NFG:
226
53.9k
            n->quick_check_property = MVM_UNICODE_PROPERTY_NFG_QC;
227
53.9k
            n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;
228
53.9k
            break;
229
0
        default:
230
0
            abort();
231
56.6k
    }
232
56.6k
}
233
234
/* Enable translation of newlines from \r\n to \n by setting the normalizer's
 * translate_newlines flag. When the flag is set, grapheme_composition
 * replaces a composed grapheme equal to the CRLF grapheme with '\n'.
 * MVM_unicode_normalizer_init clears the flag, so this has to be called
 * after initialization. */
235
647
void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n) {
236
647
    n->translate_newlines = 1;
237
647
}
238
239
/* Cleanup an MVMNormalization once we're done normalizing. */
240
56.1k
void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n) {
241
56.1k
    free(n->buffer);
242
56.1k
}
243
244
/* Adds a codepoint into the buffer, making sure there's space. */
245
52.6k
static void add_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
246
52.6k
    if (n->buffer_end == n->buffer_size) {
247
73
        if (n->buffer_start != 0) {
248
73
            MVMint32 shuffle = n->buffer_start;
249
73
            MVMint32 to_move = n->buffer_end - n->buffer_start;
250
73
            memmove(n->buffer, n->buffer + n->buffer_start, to_move * sizeof(MVMCodepoint));
251
73
            n->buffer_start = 0;
252
73
            n->buffer_end -= shuffle;
253
73
            n->buffer_norm_end -= shuffle;
254
73
        }
255
0
        else {
256
0
            n->buffer_size *= 2;
257
0
            n->buffer = MVM_realloc(n->buffer, n->buffer_size * sizeof(MVMCodepoint));
258
0
        }
259
73
    }
260
52.6k
    n->buffer[n->buffer_end++] = cp;
261
52.6k
}
262
263
/* Constants for algorithmic Hangul syllable (de)composition, taken from
 * Unicode spec 3.12 and named as in the spec. */
static const int SBase  = 0xAC00;
static const int LBase  = 0x1100;
static const int VBase  = 0x1161;
static const int TBase  = 0x11A7;
static const int LCount = 19;
static const int VCount = 21;
static const int TCount = 28;
static const int NCount = 588;   /* VCount * TCount */
static const int SCount = 11172; /* LCount * NCount */
271
272
/* Decomposes a Hangul codepoint and add it into the buffer. */
273
0
static void decomp_hangul_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint s) {
274
0
    /* Algorithm from Unicode spec 3.12, following naming convention from spec. */
275
0
    int SIndex = s - SBase;
276
0
    if (SIndex < 0 || SIndex >= SCount) {
277
0
        add_codepoint_to_buffer(tc, n, s);
278
0
    }
279
0
    else {
280
0
        int L = LBase + SIndex / NCount;
281
0
        int V = VBase + (SIndex % NCount) / TCount;
282
0
        int T = TBase + SIndex % TCount;
283
0
        add_codepoint_to_buffer(tc, n, (MVMCodepoint)L);
284
0
        add_codepoint_to_buffer(tc, n, (MVMCodepoint)V);
285
0
        if (T != TBase)
286
0
            add_codepoint_to_buffer(tc, n, (MVMCodepoint)T);
287
0
    }
288
0
}
289
290
/* Decompose the codepoint and add it into the buffer. */
291
1.87k
static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
292
1.87k
    /* See if we actually need to decompose (can skip if the decomposition
293
1.87k
     * type is None, or we're only doing Canonical decomposition and it is
294
1.87k
     * anything except Canonical). */
295
1.87k
    MVMint16 cp_DT = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE);
296
1.87k
    MVMint64 decompose = 1;
297
1.87k
    if (cp_DT == MVM_UNICODE_PVALUE_DT_NONE)
298
1.75k
        decompose = 0;
299
119
    else if (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && cp_DT != MVM_UNICODE_PVALUE_DT_CANONICAL )
300
12
        decompose = 0;
301
1.87k
    if (decompose) {
302
107
        /* We need to decompose. Get the decomp spec and go over the things in
303
107
         * it; things without a decomp spec are presumably Hangul and need the
304
107
         * algorithmic treatment. */
305
107
        char *spec = (char *)MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMP_SPEC);
306
107
        if (spec && spec[0]) {
307
107
            char *end = spec + strlen(spec);
308
310
            while (spec < end) {
309
203
                /* Parse hex character code, and then recurse to do any further
310
203
                * decomposition on it; this recursion terminates when we find a
311
203
                * non-decomposable thing and add it to the buffer. */
312
203
                MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &spec, 16);
313
203
                decomp_codepoint_to_buffer(tc, n, decomp_char);
314
203
            }
315
107
        }
316
0
        else {
317
0
            decomp_hangul_to_buffer(tc, n, cp);
318
0
        }
319
107
    }
320
1.76k
    else {
321
1.76k
        /* Don't need to decompose; add it right into the buffer. */
322
1.76k
        add_codepoint_to_buffer(tc, n, cp);
323
1.76k
    }
324
1.87k
}
325
326
/* Checks if the specified character answers "yes" on the appropriate quick check. */
327
154k
static MVMint64 passes_quickcheck(MVMThreadContext *tc, const MVMNormalizer *n, MVMCodepoint cp) {
328
154k
    const char *pval = MVM_unicode_codepoint_get_property_cstr(tc, cp, n->quick_check_property);
329
154k
    return pval && pval[0] == 'Y';
330
154k
}
331
332
/* Gets the CCC (actual value) but is slower as it looks up with string properties
333
 * Exact values are not needed for normalization.
334
 * Returns 0 for Not_Reordered codepoints *and* CCC 0 codepoints */
335
0
static MVMint64 ccc_old(MVMThreadContext *tc, MVMCodepoint cp) {
336
0
    if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC) {
337
0
        return 0;
338
0
    }
339
0
    else {
340
0
        const char *ccc_str = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);
341
0
        return !ccc_str || strlen(ccc_str) > 3 ? 0 : fast_atoi(ccc_str);
342
0
    }
343
0
}
344
/* Gets the canonical combining class for a codepoint. Does a shortcut
345
 * since CCC is stored as a string property, though because they are all sorted
346
 * numerically it is ok to get the internal integer value as stored instead of
347
 * the string.
348
 * Returns 0 for Not_Reordered codepoints *and* CCC 0 codepoints */
349
170k
MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
350
170k
    if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC) {
351
51.9k
        return 0;
352
51.9k
    }
353
118k
    else {
354
118k
        int ccc_int = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS);
355
118k
        return ccc_int <= MVM_UNICODE_PVALUE_CCC_0 ? 0 : ccc_int - MVM_UNICODE_PVALUE_CCC_0;
356
118k
    }
357
170k
}
358
359
/* Checks if the thing we have is a control character (for the definition in
360
 * the Unicode Standard Annex #29). Full path. Fast path checks for controls
361
 * in the Latin-1 range. This works for those as well but needs a property lookup */
362
59.2k
MVMint32 MVM_string_is_control_full(MVMThreadContext *tc, MVMCodepoint in) {
363
59.2k
    /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are excluded because
364
59.2k
     * they are Cf but not Control's */
365
59.2k
    if (in != UNI_CP_ZERO_WIDTH_NON_JOINER && in != UNI_CP_ZERO_WIDTH_JOINER) {
366
59.2k
        /* Consider general property:
367
59.2k
         * Cc, Zl, Zp, and Cn which are also Default_Ignorable_Code_Point=True */
368
59.2k
        const char *genprop = MVM_unicode_codepoint_get_property_cstr(tc, in,
369
59.2k
            MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
370
59.2k
        switch (genprop[0]) {
371
3.11k
            case 'Z':
372
3.11k
                /* Line_Separator and Paragraph_Separator are controls. */
373
2.96k
                return genprop[1] == 'l' || genprop[1] == 'p';
374
310
            case 'C':
375
310
                /* Control, Surrogate are controls. */
376
310
                if (genprop[1] == 'c' || genprop[1] == 's') {
377
0
                    return 1;
378
0
                }
379
310
                if (genprop[1] == 'f' ) {
380
308
                    /* Format can have special properties (not control) */
381
308
                    return 0;
382
308
                }
383
310
                /* Unassigned is, but only for Default_Ignorable_Code_Point. */
384
2
                if (genprop[1] == 'n') {
385
0
                    return MVM_unicode_codepoint_get_property_int(tc, in,
386
0
                        MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0;
387
0
                }
388
59.2k
        }
389
59.2k
    }
390
55.8k
    return 0;
391
59.2k
}
392
393
/* Implements the Unicode Canonical Ordering algorithm (3.11, D109). */
394
53.2k
static void canonical_sort(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
395
53.2k
    /* Yes, this is the simplest possible thing. Key thing if you decide to
396
53.2k
     * replace it with something more optimal: it must not re-order code
397
53.2k
     * points with equal CCC. */
398
53.2k
    MVMint32 reordered = 1;
399
106k
    while (reordered) {
400
53.2k
        MVMint32 i = from;
401
53.2k
        reordered = 0;
402
55.6k
        while (i < to - 1) {
403
2.42k
            MVMint64 cccA = MVM_unicode_relative_ccc(tc, n->buffer[i]);
404
2.42k
            MVMint64 cccB = MVM_unicode_relative_ccc(tc, n->buffer[i + 1]);
405
2.42k
            if (cccA > cccB && cccB > 0) {
406
3
                MVMCodepoint tmp = n->buffer[i];
407
3
                n->buffer[i] = n->buffer[i + 1];
408
3
                n->buffer[i + 1] = tmp;
409
3
                reordered = 1;
410
3
            }
411
2.42k
            i++;
412
2.42k
        }
413
53.2k
    }
414
53.2k
}
415
416
/* Implements the Unicode Canonical Composition algorithm (3.11, D117). */
417
50.4k
static void canonical_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
418
50.4k
    MVMint32 c_idx = from + 1;
419
52.7k
    while (c_idx < to) {
420
2.32k
        /* Search for the last non-blocked starter. */
421
2.32k
        MVMint32 ss_idx = c_idx - 1;
422
2.32k
        MVMint32 c_ccc  = MVM_unicode_relative_ccc(tc, n->buffer[c_idx]);
423
2.32k
        while (ss_idx >= from) {
424
2.32k
            /* Make sure we don't go past a code point that blocks a starter
425
2.32k
             * from the current character we're considering. */
426
2.32k
            MVMint32 ss_ccc = MVM_unicode_relative_ccc(tc, n->buffer[ss_idx]);
427
2.32k
            if (ss_ccc >= c_ccc && ss_ccc != 0)
428
0
                break;
429
2.32k
430
2.32k
            /* Have we found a starter? */
431
2.32k
            if (ss_ccc == 0) {
432
2.32k
                /* See if there's a primary composite for the starter and the
433
2.32k
                 * current code point under consideration. */
434
2.32k
                MVMCodepoint pc = MVM_unicode_find_primary_composite(tc, n->buffer[ss_idx], n->buffer[c_idx]);
435
2.32k
                if (pc > 0) {
436
12
                    /* Replace the starter with the primary composite. */
437
12
                    n->buffer[ss_idx] = pc;
438
12
439
12
                    /* Move the rest of the buffer back one position. */
440
12
                    memmove(n->buffer + c_idx, n->buffer + c_idx + 1,
441
12
                        (n->buffer_end - (c_idx + 1)) * sizeof(MVMCodepoint));
442
12
                    n->buffer_end--;
443
12
444
12
                    /* Sync cc_idx and to with the change. */
445
12
                    c_idx--;
446
12
                    to--;
447
12
                }
448
2.32k
449
2.32k
                /* Don't look back beyond this starter; covers the ccc(B) = 0
450
2.32k
                 * case of D105. */
451
2.32k
                break;
452
2.32k
            }
453
1
            ss_idx--;
454
1
        }
455
2.32k
456
2.32k
        /* Move on to the next character. */
457
2.32k
        c_idx++;
458
2.32k
    }
459
50.4k
460
50.4k
    /* Make another pass for the Hangul special case. (A future optimization
461
50.4k
     * may be to incorporate this into the above loop.) */
462
50.4k
    c_idx = from;
463
52.7k
    while (c_idx < to - 1) {
464
2.31k
        /* Do we have a potential LPart? */
465
2.31k
        MVMCodepoint LPart = n->buffer[c_idx];
466
2.31k
        if (LPart >= LBase && LPart <= (LBase + LCount)) {
467
0
            /* Yes, now see if it's followed by a VPart (always safe to look
468
0
             * due to "to - 1" in loop condition above). */
469
0
            MVMCodepoint LIndex = LPart - LBase;
470
0
            MVMCodepoint VPart  = n->buffer[c_idx + 1];
471
0
            if (VPart >= VBase && VPart <= (VBase + VCount)) {
472
0
                /* Certainly something to compose; compute that. */
473
0
                MVMCodepoint VIndex = VPart - VBase;
474
0
                MVMCodepoint LVIndex = LIndex * NCount + VIndex * TCount;
475
0
                MVMCodepoint s = SBase + LVIndex;
476
0
                MVMint32 composed = 1;
477
0
478
0
                /* Is there a TPart too? */
479
0
                if (c_idx < to - 2) {
480
0
                    MVMCodepoint TPart  = n->buffer[c_idx + 2];
481
0
                    if (TPart >= TBase && TPart <= (TBase + TCount)) {
482
0
                        /* We need to compose 3 things. */
483
0
                        MVMCodepoint TIndex = TPart - TBase;
484
0
                        s += TIndex;
485
0
                        composed = 2;
486
0
                    }
487
0
                }
488
0
489
0
                /* Put composed codepoint into the buffer. */
490
0
                n->buffer[c_idx] = s;
491
0
492
0
                /* Shuffle codepoints after this in the buffer back. */
493
0
                memmove(n->buffer + c_idx + 1, n->buffer + c_idx + 1 + composed,
494
0
                        (n->buffer_end - (c_idx + 1 + composed)) * sizeof(MVMCodepoint));
495
0
                n->buffer_end -= composed;
496
0
497
0
                /* Sync to with updated buffer size. */
498
0
                to -= composed;
499
0
            }
500
0
        }
501
2.31k
        c_idx++;
502
2.31k
    }
503
50.4k
}
504
505
/* Performs grapheme composition (to get Normal Form Grapheme) on the range of
506
 * codepoints provided. This follows the algorithm in the Unicode Standard
507
 * Annex #29 on grapheme cluster boundaries. Note that we have already done
508
 * the handling of breaking around controls much earlier, so don't have to
509
 * consider that case. */
510
0
/* Coarse range test: returns 1 when cp lies in 0x1100..0x11FF or in
 * 0xA960..0xD7FB, and 0 otherwise. NOTE(review): going by the name these
 * ranges are meant to bracket the Hangul-related code points without a
 * property lookup -- confirm against the callers. */
static MVMint32 maybe_hangul(MVMCodepoint cp) {
    if (0x1100 <= cp && cp < 0x1200)
        return 1;
    return 0xA960 <= cp && cp < 0xD7FC;
}
513
0
/* Returns the integer value of the Grapheme_Extend property for cp. */
static MVMint32 is_grapheme_extend(MVMThreadContext *tc, MVMCodepoint cp) {
    MVMint32 extend = MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_GRAPHEME_EXTEND);
    return extend;
}
517
101k
/* Returns the integer value of the Prepended_Concatenation_Mark property for
 * cp. Note that this is the property consulted here, not
 * Grapheme_Cluster_Break=Prepend. */
static MVMint32 is_grapheme_prepend(MVMThreadContext *tc, MVMCodepoint cp) {
    MVMint32 prepend = MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_PREPENDED_CONCATENATION_MARK);
    return prepend;
}
521
/* Returns 0 if the two graphemes should be combined and returns 1 or 2 if
522
 * the graphemes should break. 2 is returned if more than the currenly seen
523
 * graphemes may be needed to determine the breaking (this is only needed if
524
 * we are checking two arbitrary codepoints. If we are normalizing linearly from
525
 * the start of the string this has no more significance than returning 1) */
526
8.87k
MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm) {
527
8.87k
    int GCB_a, GCB_b;
528
8.87k
529
8.87k
    /* Don't break between \r and \n, but otherwise break around \r. */
530
8.87k
    if (a == 0x0D && b == 0x0A)
531
359
        return 0;
532
8.51k
    if (a == 0x0D || b == 0x0D)
533
176
        return 1;
534
8.51k
    /* For utf8-c8 graphemes. These we can't request property values and act like
535
8.51k
     * control's */
536
8.34k
    if (a < 0 || b < 0) {
537
0
        if ((a < 0 && MVM_nfg_get_synthetic_info(tc, a)->is_utf8_c8) || (b < 0 && MVM_nfg_get_synthetic_info(tc, b)->is_utf8_c8))
538
0
            return 1;
539
0
540
0
        MVM_exception_throw_adhoc(tc, "Internal error: synthetic grapheme found when computing grapheme segmentation");
541
0
    }
542
8.34k
    GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
543
8.34k
    GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK);
544
8.34k
    switch (GCB_a) {
545
0
        case MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR:
546
0
            if (2 <= norm->regional_indicator) {
547
0
                norm->regional_indicator = 0;
548
0
                if (GCB_b == MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR)
549
0
                /* Return 2 here so is_concat_stable can know to run re_nfg */
550
0
                    return 2;
551
0
            }
552
0
            if (GCB_b == MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR) {
553
0
                if (!norm->regional_indicator)
554
0
                    norm->regional_indicator = 2;
555
0
                else
556
0
                    norm->regional_indicator++;
557
0
                return 0;
558
0
            }
559
0
            break;
560
0
        /* Don't break after Prepend Grapheme_Cluster_Break=Prepend */
561
0
        case MVM_UNICODE_PVALUE_GCB_PREPEND:
562
0
            /* If it's a control character remember to break */
563
0
            if (MVM_string_is_control_full(tc, b )) {
564
0
                return 1;
565
0
            }
566
0
            /* Otherwise don't break */
567
0
            return 0;
568
0
        /* Don't break after ZWJ for E_Base_GAZ or Glue_After_ZWJ */
569
3
        case MVM_UNICODE_PVALUE_GCB_ZWJ:
570
3
            switch (GCB_b) {
571
3
                case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ:
572
3
                case MVM_UNICODE_PVALUE_GCB_ZWJ:
573
3
                case MVM_UNICODE_PVALUE_GCB_GLUE_AFTER_ZWJ:
574
3
                    return 0;
575
3
            }
576
0
            if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN )
577
0
                return 0;
578
0
            /* Don't break after ZWJ for Emoji property characters that have
579
0
             * GCB=Other. This is *not* a unicode text segmentation rule but
580
0
             * is needed to not break inside Emoji sequences. As the rule to
581
0
             * not break in Emoji sequences is specified by Unicode to need
582
0
             * customization to perform properly. */
583
0
            if (GCB_b == MVM_UNICODE_PVALUE_GCB_OTHER
584
0
            && 127 < b /* Numbers and # have property Emoji. So make sure we're not in ASCII range */
585
0
            && MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_EMOJI) )
586
0
                return 0;
587
0
        case MVM_UNICODE_PVALUE_GCB_E_MODIFIER:
588
0
            if (MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_EMOJI_MODIFIER_BASE)) {
589
0
                /* Don't break after ZWJ if it's an Emoji Sequence.
590
0
                 * At the moment FEMALE SIGN and MALE SIGN don't have different
591
0
                 * GCB properties, or any special Emoji properties (Unicode 9.0),
592
0
                 * so we explictly check these codepoints here */
593
0
                if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN )
594
0
                    return 0;
595
0
            }
596
0
            break;
597
0
        case MVM_UNICODE_PVALUE_GCB_L:
598
0
            if (GCB_b == MVM_UNICODE_PVALUE_GCB_L  || GCB_b == MVM_UNICODE_PVALUE_GCB_V ||
599
0
                     GCB_b == MVM_UNICODE_PVALUE_GCB_LV || GCB_b == MVM_UNICODE_PVALUE_GCB_LVT)
600
0
                return 0;
601
0
            break;
602
0
        case MVM_UNICODE_PVALUE_GCB_LV:
603
0
        case MVM_UNICODE_PVALUE_GCB_V:
604
0
            if (GCB_b == MVM_UNICODE_PVALUE_GCB_V || GCB_b == MVM_UNICODE_PVALUE_GCB_T)
605
0
                return 0;
606
0
            break;
607
0
        case MVM_UNICODE_PVALUE_GCB_LVT:
608
0
        case MVM_UNICODE_PVALUE_GCB_T:
609
0
            if (GCB_b == MVM_UNICODE_PVALUE_GCB_T)
610
0
                return 0;
611
0
            break;
612
8.34k
    }
613
8.34k
    switch (GCB_b) {
614
8.34k
        /* Don't break before extending chars */
615
16
        case MVM_UNICODE_PVALUE_GCB_EXTEND:
616
16
            return 0;
617
16
        /* Don't break before ZWJ */
618
5
        case MVM_UNICODE_PVALUE_GCB_ZWJ:
619
5
            return 0;
620
3
        case MVM_UNICODE_PVALUE_GCB_E_MODIFIER:
621
3
            switch (GCB_a) {
622
0
                case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ:
623
0
                    return 0;
624
3
                case MVM_UNICODE_PVALUE_GCB_E_BASE:
625
3
                    return 0;
626
3
            }
627
0
            if (MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_EMOJI_MODIFIER_BASE)) {
628
0
                /* Not all emoji modifiers have E_BASE or E_BASE_GAZ, some cases we need to check the
629
0
                 * Emoji_Modifier_Base property */
630
0
                return 0;
631
0
            }
632
0
            break;
633
0
        /* Don't break before spacing marks. */
634
0
        case MVM_UNICODE_PVALUE_GCB_SPACINGMARK:
635
0
            return 0;
636
8.34k
    }
637
8.34k
638
8.34k
    /* Otherwise break. */
639
8.31k
    return 1;
640
8.34k
}
641
50.4k
/* Collapses the codepoints in n->buffer[from, to) into graphemes, in place.
 * Every run of codepoints that MVM_unicode_normalize_should_break does not
 * split is replaced by the single grapheme MVM_nfg_codes_to_grapheme returns
 * for it (with the CRLF grapheme turned into '\n' when translate_newlines is
 * set). Whatever follows `to` in the buffer is then moved down over the
 * slots that were freed, and buffer_end is reduced by the same amount.
 * Ranges holding fewer than two codepoints are left untouched. */
static void grapheme_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
    MVMint32 run_start, write_at, cur;

    /* With fewer than two codepoints there is nothing to combine. */
    if (to - from < 2)
        return;

    run_start = from;
    write_at  = from;
    for (cur = from; cur < to; cur++) {
        const MVMint32 after = cur + 1;
        MVMGrapheme32  grapheme;

        /* Keep extending the current run while we are not on the last
         * codepoint of the range and no break falls before the next one. */
        if (after != to && !MVM_unicode_normalize_should_break(tc, n->buffer[cur], n->buffer[after], n))
            continue;

        /* The run [run_start, after) is complete; turn it into a single
         * grapheme and store it at the write position. */
        grapheme = MVM_nfg_codes_to_grapheme(tc, n->buffer + run_start, after - run_start);
        if (n->translate_newlines && grapheme == MVM_nfg_crlf_grapheme(tc))
            grapheme = '\n';
        n->buffer[write_at] = grapheme;
        write_at++;

        /* The next run begins right after this one (harmless when we have
         * already reached the end of the range). */
        run_start = after;
    }

    /* Close the gap left by the collapsed codepoints. */
    memmove(n->buffer + write_at, n->buffer + to, (n->buffer_end - to) * sizeof(MVMCodepoint));
    n->buffer_end -= to - write_at;
}
666
667
/* Called when the very fast case of normalization fails (that is, when we get
668
 * any two codepoints in a row where at least one is greater than the first
669
 * significant codepoint identified by a quick check for the target form). We
670
 * may find the quick check itself is enough; if not, we have to do real work
671
 * compute the normalization. */
672
101k
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *norm, MVMCodepoint in, MVMCodepoint *out) {
673
101k
    MVMint64 qc_in, ccc_in;
674
101k
    int is_prepend = is_grapheme_prepend(tc, in);
675
101k
676
101k
    if (MVM_UNLIKELY(0 < norm->prepend_buffer))
677
0
        norm->prepend_buffer--;
678
101k
    if (MVM_UNLIKELY(is_prepend))
679
0
        norm->prepend_buffer = 2;
680
101k
681
101k
    /* If it's a control character (outside of the range we checked in the
682
101k
     * fast path) then it's a normalization terminator. */
683
101k
    if (in > 0xFF && MVM_string_is_control_full(tc, in) && !is_prepend) {
684
300
        return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, norm, in, out);
685
300
    }
686
101k
687
101k
    /* Do a quickcheck on the codepoint we got in and get its CCC. */
688
101k
    qc_in  = passes_quickcheck(tc, norm, in);
689
101k
    ccc_in = MVM_unicode_relative_ccc(tc, in);
690
101k
    /* Fast cases when we pass quick check and what we got in has CCC = 0,
691
101k
     * and it does not follow a prepend character. */
692
101k
    if (qc_in && ccc_in == 0 && norm->prepend_buffer == 0) {
693
100k
        if (MVM_NORMALIZE_COMPOSE(norm->form)) {
694
100k
            /* We're composing. If we have exactly one thing in the buffer and
695
100k
             * it also passes the quick check, and both it and the thing in the
696
100k
             * buffer have a CCC of zero, we can hand back the first of the
697
100k
             * two - effectively replacing what's in the buffer with the new
698
100k
             * codepoint coming in. Note that the NFG quick-check property
699
100k
             * factors in grapheme extenders that don't have a CCC of zero,
700
100k
             * so we're safe. */
701
100k
            if (norm->buffer_end - norm->buffer_start == 1) {
702
52.7k
                MVMCodepoint maybe_result = norm->buffer[norm->buffer_start];
703
52.7k
                if (passes_quickcheck(tc, norm, maybe_result) && MVM_unicode_relative_ccc(tc, maybe_result) == 0) {
704
52.7k
                    *out = norm->buffer[norm->buffer_start];
705
52.7k
                    norm->buffer[norm->buffer_start] = in;
706
52.7k
                    return 1;
707
52.7k
                }
708
52.7k
            }
709
100k
        }
710
0
        else {
711
0
            /* We're only decomposing. There should probably be nothing in the
712
0
             * buffer in this case; if so we can simply return the codepoint. */
713
0
            if (norm->buffer_start == norm->buffer_end) {
714
0
                *out = in;
715
0
                return 1;
716
0
            }
717
0
        }
718
100k
    }
719
101k
720
101k
    /* If we didn't pass quick check... */
721
48.6k
    if (!qc_in || 0 < norm->prepend_buffer) {
722
1.10k
        /* If we're composing, then decompose the last thing placed in the
723
1.10k
         * buffer, if any. We need to do this since it may have passed
724
1.10k
         * quickcheck, but having seen some character that does pass then we
725
1.10k
         * must make sure we decomposed the prior passing one too. */
726
1.10k
        if (MVM_NORMALIZE_COMPOSE(norm->form) && norm->buffer_end != norm->buffer_norm_end && !is_prepend) {
727
244
            MVMCodepoint decomp = norm->buffer[norm->buffer_end - 1];
728
244
            norm->buffer_end--;
729
244
            decomp_codepoint_to_buffer(tc, norm, decomp);
730
244
        }
731
1.10k
732
1.10k
        /* Decompose this new character into the buffer. We'll need to see
733
1.10k
         * more before we can go any further. */
734
1.10k
        decomp_codepoint_to_buffer(tc, norm, in);
735
1.10k
        return 0;
736
1.10k
    }
737
48.6k
738
48.6k
    /* Since anything we have at this point does pass quick check, add it to
739
48.6k
     * the buffer directly. */
740
47.5k
    add_codepoint_to_buffer(tc, norm, in);
741
47.5k
742
47.5k
    /* If the codepoint has a CCC that is non-zero, it's not a starter so we
743
47.5k
     * should see more before normalizing. */
744
47.5k
    if (ccc_in > 0)
745
0
        return 0;
746
47.5k
747
47.5k
    /* If we don't have at least one codepoint in the buffer, it's too early
748
47.5k
     * to hand anything back. */
749
47.5k
    if (norm->buffer_end - norm->buffer_start <= 1)
750
47.3k
        return 0;
751
47.5k
752
47.5k
    /* Perform canonical sorting on everything from the start of the not yet
753
47.5k
     * normalized things in the buffer, up to but excluding the quick-check
754
47.5k
     * passing thing we just added. */
755
189
    canonical_sort(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1);
756
189
757
189
    /* Perform canonical composition and grapheme composition if needed. */
758
189
    if (MVM_NORMALIZE_COMPOSE(norm->form)) {
759
189
        canonical_composition(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1);
760
189
        if (MVM_NORMALIZE_GRAPHEME(norm->form))
761
189
            grapheme_composition(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1);
762
189
    }
763
189
764
189
    /* We've now normalized all except the latest, quick-check-passing
765
189
     * codepoint. */
766
189
    norm->buffer_norm_end = norm->buffer_end - 1;
767
189
768
189
    /* Hand back a codepoint, and flag how many more are available. */
769
189
    *out = norm->buffer[norm->buffer_start];
770
189
    return norm->buffer_norm_end - norm->buffer_start++;
771
47.5k
}
772
773
/* Push a number of codepoints into the "to normalize" buffer. */
774
168
void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints) {
775
168
    MVMint32 i;
776
484
    for (i = 0; i < num_codepoints; i++)
777
316
        decomp_codepoint_to_buffer(tc, n, in[i]);
778
168
}
779
780
/* Processes a codepoint that we regard as a "normalization terminator". These
781
 * never have a decomposition, and for all practical purposes will not have a
782
 * combiner on them. We treat them specially so we don't, during I/O, block on
783
 * seeing a codepoint after them, which for things like REPLs that need to see
784
 * input right after a \n makes for problems. */
785
3.38k
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
786
3.38k
    /* Add the codepoint into the buffer. */
787
3.38k
    add_codepoint_to_buffer(tc, n, in);
788
3.38k
789
3.38k
    /* Treat this as an "eof", which really means "normalize what ya got". */
790
3.38k
    MVM_unicode_normalizer_eof(tc, n);
791
3.38k
792
3.38k
    /* Hand back a normalized codepoint, and the number available (have to
793
3.38k
     * compensate for the one we steal for *out). */
794
3.38k
    *out = MVM_unicode_normalizer_get_codepoint(tc, n);
795
3.38k
    return 1 + MVM_unicode_normalizer_available(tc, n);
796
3.38k
}
797
798
/* Called when we are expecting no more codepoints. */
799
53.0k
void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n) {
800
53.0k
    /* Perform canonical ordering and, if needed, canonical composition on
801
53.0k
     * what remains. */
802
53.0k
    canonical_sort(tc, n, n->buffer_norm_end, n->buffer_end);
803
53.0k
    if (MVM_NORMALIZE_COMPOSE(n->form)) {
804
50.2k
        canonical_composition(tc, n, n->buffer_norm_end, n->buffer_end);
805
50.2k
        if (MVM_NORMALIZE_GRAPHEME(n->form))
806
50.2k
            grapheme_composition(tc, n, n->buffer_norm_end, n->buffer_end);
807
50.2k
    }
808
53.0k
    /* Reset these two to ensure their value doesn't stick around */
809
53.0k
    n->prepend_buffer     = 0;
810
53.0k
    n->regional_indicator = 0;
811
53.0k
    /* We've now normalized all that remains. */
812
53.0k
    n->buffer_norm_end = n->buffer_end;
813
53.0k
}