Coverage Report

Created: 2018-06-21 18:56

/home/travis/build/MoarVM/MoarVM/src/strings/nfg.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Number of extra elements we add to the synthetics table each time we need
4
 * to grow it. */
5
300
#define MVM_SYNTHETIC_GROW_ELEMS 32
6
7
/* Finds the index of a given codepoint within a trie node. Returns it if
8
 * there is one, or negative if there is not (note 0 is a valid index). */
9
1.11k
static MVMint32 find_child_node_idx(MVMThreadContext *tc, const MVMNFGTrieNode *node, MVMCodepoint cp) {
10
1.11k
    if (node) {
11
813
        /* TODO: update this to do a binary search later on. */
12
813
        MVMint32 i;
13
897
        for (i = 0; i < node->num_entries; i++)
14
861
            if (node->next_codes[i].code == cp)
15
777
                return i;
16
813
    }
17
337
    return -1;
18
1.11k
}
19
20
/* Does a lookup in the trie for a synthetic for the specified codepoints. */
21
792
MVMNFGTrieNode * find_child_node(MVMThreadContext *tc, const MVMNFGTrieNode *node, MVMCodepoint cp) {
22
792
    MVMint32 idx = find_child_node_idx(tc, node, cp);
23
768
    return idx >= 0 ? node->next_codes[idx].node : NULL;
24
792
}
25
681
/* Walks the grapheme lookup trie along the given codepoint sequence. Returns
 * the grapheme stored on the node reached after consuming every codepoint,
 * or 0 if the walk runs off the trie before the sequence is exhausted. */
static MVMGrapheme32 lookup_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
    MVMNFGTrieNode *node = tc->instance->nfg->grapheme_lookup;
    MVMint32        left = num_codes;
    MVMint32        pos  = 0;

    /* Descend one level per codepoint until we run out of either. */
    while (left != 0 && node != NULL) {
        node = find_child_node(tc, node, codes[pos]);
        pos++;
        left--;
    }

    if (!node)
        return 0;
    return node->graph;
}
36
37
/* Recursive algorithm to add to the trie. Descends existing trie nodes so far
38
 * as we have them following the code points, then passes on a NULL for the
39
 * levels of current below that do not exist. Once we bottom out, makes a copy
40
 * of or creates a node for the synthetic. As we walk back up we create or
41
 * copy+tweak nodes until we have produced a new trie, re-using what we can of
42
 * the existing one. */
43
478
static MVMNFGTrieNode * twiddle_trie_node(MVMThreadContext *tc, MVMNFGTrieNode *current, MVMCodepoint *cur_code, MVMint32 codes_remaining, MVMGrapheme32 synthetic) {
44
478
    /* Make a new empty node, which we'll maybe copy some things from the
45
478
     * current node into. */
46
478
    MVMNFGTrieNode *new_node = MVM_fixed_size_alloc(tc, tc->instance->fsa, sizeof(MVMNFGTrieNode));
47
478
48
478
    /* If we've more codes remaining... */
49
478
    if (codes_remaining > 0) {
50
322
        /* Recurse, to get a new child node. */
51
322
        MVMint32 idx = find_child_node_idx(tc, current, *cur_code);
52
322
        MVMNFGTrieNode *new_child = twiddle_trie_node(tc,
53
9
            idx >= 0 ? current->next_codes[idx].node : NULL,
54
322
            cur_code + 1, codes_remaining - 1, synthetic);
55
322
56
322
        /* If we had an existing child node... */
57
322
        if (idx >= 0) {
58
9
            /* Make a copy of the next_codes list. */
59
9
            size_t the_size = current->num_entries * sizeof(MVMNFGTrieNodeEntry);
60
9
            MVMNFGTrieNodeEntry *new_next_codes = MVM_fixed_size_alloc(tc,
61
9
                tc->instance->fsa, the_size);
62
9
            memcpy(new_next_codes, current->next_codes, the_size);
63
9
64
9
            /* Update the copy to point to the new child. */
65
9
            new_next_codes[idx].node = new_child;
66
9
67
9
            /* Install the new next_codes list in the new node, and free the
68
9
             * existing child list at the next safe point. */
69
9
            new_node->num_entries = current->num_entries;
70
9
            new_node->next_codes  = new_next_codes;
71
9
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, the_size,
72
9
                current->next_codes);
73
9
        }
74
322
75
322
        /* Otherwise, we're going to need to insert the new child into a
76
322
         * (possibly existing) child list. */
77
313
        else {
78
313
            /* Calculate new child node list size and allocate it. */
79
301
            MVMint32 orig_entries = current ? current->num_entries : 0;
80
313
            MVMint32 new_entries  = orig_entries + 1;
81
313
            size_t new_size       = new_entries * sizeof(MVMNFGTrieNodeEntry);
82
313
            MVMNFGTrieNodeEntry *new_next_codes = MVM_fixed_size_alloc(tc,
83
313
                tc->instance->fsa, new_size);
84
313
85
313
            /* Go through original entries, copying those that are for a lower
86
313
             * code point than the one we're inserting a child for. */
87
313
            MVMint32 insert_pos = 0;
88
313
            MVMint32 orig_pos   = 0;
89
327
            while (orig_pos < orig_entries && current->next_codes[orig_pos].code < *cur_code)
90
14
                new_next_codes[insert_pos++] = current->next_codes[orig_pos++];
91
313
92
313
            /* Insert the new child. */
93
313
            new_next_codes[insert_pos].code = *cur_code;
94
313
            new_next_codes[insert_pos].node = new_child;
95
313
            insert_pos++;
96
313
97
313
            /* Copy the rest. */
98
316
            while (orig_pos < orig_entries)
99
3
                new_next_codes[insert_pos++] = current->next_codes[orig_pos++];
100
313
101
313
            /* Install the new next_codes list in the new node, and free any
102
313
             * existing child list at the next safe point. */
103
313
            new_node->num_entries = new_entries;
104
313
            new_node->next_codes  = new_next_codes;
105
313
            if (orig_entries)
106
10
                MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa,
107
10
                    orig_entries * sizeof(MVMNFGTrieNodeEntry),
108
10
                    current->next_codes);
109
313
        }
110
322
111
322
        /* Always need to copy synthetic set on the existing node also;
112
322
         * otherwise make sure to clear it. */
113
301
        new_node->graph = current ? current->graph : 0;
114
322
    }
115
478
116
478
    /* Otherwise, we reached the point where we need to install the synthetic.
117
478
     * If we already had a node here, we re-use the children of it. */
118
156
    else {
119
156
        new_node->graph = synthetic;
120
156
        if (current) {
121
0
            new_node->num_entries = current->num_entries;
122
0
            new_node->next_codes  = current->next_codes;
123
0
        }
124
156
        else {
125
156
            new_node->num_entries = 0;
126
156
            new_node->next_codes  = NULL;
127
156
        }
128
156
    }
129
478
130
478
    /* Free any existing node at next safe point, return the new one. */
131
478
    if (current)
132
21
        MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa,
133
21
            sizeof(MVMNFGTrieNode), current);
134
478
    return new_node;
135
478
}
136
156
/* Builds a new lookup trie that additionally maps the given codepoint
 * sequence to `synthetic`, then installs it as the instance's trie root. The
 * barrier sits between building the trie and publishing its root. */
static void add_synthetic_to_trie(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMGrapheme32 synthetic) {
    MVMNFGState    *state = tc->instance->nfg;
    MVMNFGTrieNode *root  = twiddle_trie_node(tc, state->grapheme_lookup,
        codes, num_codes, synthetic);

    MVM_barrier();
    state->grapheme_lookup = root;
}
142
143
/* Assumes that we are holding the lock that serializes updates, and already
144
 * checked that the synthetic does not exist. Adds it to the lookup trie and
145
 * synthetics table, making sure to do enough copy/free-at-safe-point work to
146
 * not upset other threads possibly doing concurrent reads. */
147
156
static MVMGrapheme32 add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
148
156
    MVMNFGState     *nfg = tc->instance->nfg;
149
156
    MVMNFGSynthetic *synth;
150
156
    MVMGrapheme32    result;
151
156
152
156
    /* Grow the synthetics table if needed. */
153
156
    if (nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS == 0) {
154
144
        size_t orig_size = nfg->num_synthetics * sizeof(MVMNFGSynthetic);
155
144
        size_t new_size  = (nfg->num_synthetics + MVM_SYNTHETIC_GROW_ELEMS) * sizeof(MVMNFGSynthetic);
156
144
        MVMNFGSynthetic *new_synthetics = MVM_fixed_size_alloc(tc, tc->instance->fsa, new_size);
157
144
        if (orig_size) {
158
0
            memcpy(new_synthetics, nfg->synthetics, orig_size);
159
0
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, orig_size, nfg->synthetics);
160
0
        }
161
144
        nfg->synthetics = new_synthetics;
162
144
    }
163
156
164
156
    /* Set up the new synthetic entry. */
165
156
    synth            = &(nfg->synthetics[nfg->num_synthetics]);
166
156
    synth->num_codes = num_codes;
167
156
    /* Find which codepoint is the base codepoint. It is always index 0 unless
168
156
     * there are Prepend codepoints */
169
156
    if (!utf8_c8 && MVM_unicode_codepoint_get_property_int(tc, codes[0], MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK)
170
153
        == MVM_UNICODE_PVALUE_GCB_PREPEND) {
171
0
        MVMint64 i = 0;
172
0
        MVMCodepoint cached = codes[i++];
173
0
        MVMint64 cached_GCB = MVM_UNICODE_PVALUE_GCB_PREPEND;
174
0
        while (i < num_codes) {
175
0
            /* If it's the same codepoint as before, don't need to request
176
0
             * the property value again */
177
0
            if (cached == codes[i] || MVM_UNICODE_PVALUE_GCB_PREPEND ==
178
0
                (cached_GCB = MVM_unicode_codepoint_get_property_int(tc, (cached = codes[i]),
179
0
                    MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK))) {
180
0
            }
181
0
            else {
182
0
                /* If we see an Extend then this is a degenerate without any
183
0
                 * base character, so set i to num_codes so base_index gets set
184
0
                 * to 0 */
185
0
                if (cached_GCB == MVM_UNICODE_PVALUE_GCB_EXTEND)
186
0
                    i = num_codes;
187
0
                break;
188
0
            }
189
0
            i++;
190
0
        }
191
0
        /* If all the codepoints were prepend then we need to set it to 0 */
192
0
        synth->base_index = num_codes == i ? 0 : i;
193
0
194
0
    }
195
156
    else {
196
156
        synth->base_index = 0;
197
156
    }
198
156
199
156
200
156
    synth->codes     = MVM_fixed_size_alloc(tc, tc->instance->fsa,
201
156
        num_codes * sizeof(MVMCodepoint));
202
156
    memcpy(synth->codes, codes, (synth->num_codes * sizeof(MVMCodepoint)));
203
156
    synth->case_uc    = 0;
204
156
    synth->case_lc    = 0;
205
156
    synth->case_tc    = 0;
206
156
    synth->case_fc    = 0;
207
156
    synth->is_utf8_c8 = utf8_c8;
208
156
209
156
    /* Memory barrier to make sure the synthetic is fully in place before we
210
156
     * bump the count. */
211
156
    MVM_barrier();
212
156
    nfg->num_synthetics++;
213
156
214
156
    /* Give the synthetic an ID by negating the new number of synthetics. */
215
156
    result = -(nfg->num_synthetics);
216
156
217
156
    /* Make an entry in the lookup trie for the new synthetic, so we can use
218
156
     * it in the future when seeing the same codepoint sequence. */
219
156
    add_synthetic_to_trie(tc, codes, num_codes, result);
220
156
221
156
    return result;
222
156
}
223
224
/* Does a lookup of a synthetic in the trie. If we find one, returns it. If
225
 * not, acquires the update lock, re-checks that we really are missing the
226
 * synthetic, and then adds it. */
227
525
static MVMGrapheme32 lookup_or_add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
228
525
    MVMGrapheme32 result = lookup_synthetic(tc, codes, num_codes);
229
525
    if (!result) {
230
156
        uv_mutex_lock(&tc->instance->nfg->update_mutex);
231
156
        result = lookup_synthetic(tc, codes, num_codes);
232
156
        if (!result)
233
156
            result = add_synthetic(tc, codes, num_codes, utf8_c8);
234
156
        uv_mutex_unlock(&tc->instance->nfg->update_mutex);
235
156
    }
236
525
    return result;
237
525
}
238
239
/* Takes one or more code points. If only one code point is passed, it is
240
 * returned as the grapheme. Otherwise, resolves it to a synthetic - either an
241
 * already existing one if we saw it before, or a new one if not.  Assumes
242
 * that the code points are already in NFC, and as such canonical ordering has
243
 * been applied. */
244
6.22k
MVMGrapheme32 MVM_nfg_codes_to_grapheme(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
245
6.22k
    if (num_codes == 1)
246
5.85k
        return codes[0];
247
373
    else if (num_codes < MVM_GRAPHEME_MAX_CODEPOINTS)
248
373
        return lookup_or_add_synthetic(tc, codes, num_codes, 0);
249
373
    else
250
0
        MVM_exception_throw_adhoc(tc, "Too many codepoints (%d) in grapheme", num_codes);
251
6.22k
}
252
253
/* Does the same as MVM_nfg_codes_to_grapheme, but flags the added grapheme as
254
 * being an UTF8-C8 synthetic. */
255
8
MVMGrapheme32 MVM_nfg_codes_to_grapheme_utf8_c8(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
256
8
    if (num_codes == 1)
257
0
        return codes[0];
258
8
    else
259
8
        return lookup_or_add_synthetic(tc, codes, num_codes, 1);
260
8
}
261
262
/* Gets the \r\n synthetic. */
263
1.40M
MVMGrapheme32 MVM_nfg_crlf_grapheme(MVMThreadContext *tc) {
264
1.40M
    return tc->instance->nfg->crlf_grapheme;
265
1.40M
}
266
267
/* Does a lookup of information held about a synthetic. The synth parameter
268
 * must be a synthetic codepoint (that is, negative). The memory returned is
269
 * not to be freed by the caller; it also is only valid until the next GC
270
 * safe point. */
271
54
MVMNFGSynthetic * MVM_nfg_get_synthetic_info(MVMThreadContext *tc, MVMGrapheme32 synth) {
272
54
    MVMNFGState *nfg       = tc->instance->nfg;
273
54
    MVMint32     synth_idx = -synth - 1;
274
54
    if (synth >= 0)
275
0
        MVM_oops(tc, "MVM_nfg_get_synthetic_info illegally called on a non-synthetic codepoint.\nRequested codepoint %i.", synth);
276
54
    if (synth_idx >= nfg->num_synthetics)
277
0
        MVM_oops(tc, "MVM_nfg_get_synthetic_info call requested a synthetic codepoint that does not exist.\nRequested synthetic %i when only %i have been created.", -synth, nfg->num_synthetics);
278
54
    return &(nfg->synthetics[synth_idx]);
279
54
}
280
281
/* Gets the cached case change if we already computed it, or computes it if
282
 * this is the first time we're using it. */
283
static MVMGrapheme32 CASE_UNCHANGED[1] = {0};
284
0
/* Computes the result of applying case change `case_` to a synthetic and
 * caches it on synth_info (result pointer plus grapheme count for that case
 * type). When the base character is unaffected, the CASE_UNCHANGED sentinel
 * is stored with a count of zero; otherwise a freshly allocated array of
 * graphemes is stored. An unknown case_ value panics. */
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth_g, MVMNFGSynthetic *synth_info, MVMint32 case_) {
    const MVMCodepoint  base_cp = synth_info->codes[synth_info->base_index];
    const MVMCodepoint *changed = NULL;
    MVMGrapheme32      *graphs  = NULL;
    MVMint32            num_graphs;
    MVMint32            unchanged;

    /* Case change only the base character. */
    MVMuint32 num_changed = MVM_unicode_get_case_change(tc, base_cp, case_, &changed);

    /* Either nothing came back, or exactly the base character came back. */
    unchanged = num_changed == 0
        || (num_changed == 1 && changed[0] == base_cp);

    if (unchanged) {
        /* The grapheme stays the same. A non-null sentinel marks the result
         * as computed, and a count of zero signals "no change". */
        graphs     = CASE_UNCHANGED;
        num_graphs = 0;
    }
    else {
        /* The case change may yield several codepoints, and the changed
         * base may even let the whole sequence normalize to something that
         * is no longer a synthetic, so everything is run through the
         * normalizer. Push order:
         *   1. any codepoints before the base (only happens with Prepend
         *      codepoints),
         *   2. the first codepoint of the case change result,
         *   3. the codepoints after the base (generally Extend codepoints),
         *      so that they attach to the first result codepoint,
         *   4. the remainder of the case change result.
         * This should do about the right thing both for results made of a
         * base and a combiner and for results made of two bases, since the
         * normalizer applies canonical combining class sorting. */
        MVMNormalizer norm;
        MVMint32      g;

        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);

        if (0 < synth_info->base_index)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                synth_info->codes,
                synth_info->base_index);

        MVM_unicode_normalizer_push_codepoints(tc, &norm, changed, 1);

        MVM_unicode_normalizer_push_codepoints(tc, &norm,
            synth_info->codes     + synth_info->base_index + 1,
            synth_info->num_codes - synth_info->base_index - 1);

        if (1 < num_changed)
            MVM_unicode_normalizer_push_codepoints(tc, &norm,
                changed     + 1,
                num_changed - 1);

        MVM_unicode_normalizer_eof(tc, &norm);

        /* Drain every grapheme the normalizer produced. */
        num_graphs = MVM_unicode_normalizer_available(tc, &norm);
        graphs     = MVM_malloc(num_graphs * sizeof(MVMGrapheme32));
        g = 0;
        while (g < num_graphs) {
            graphs[g] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
            g++;
        }

        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Store the result in the slot belonging to the requested case type. */
    switch (case_) {
        case MVM_unicode_case_change_type_upper:
            synth_info->case_uc        = graphs;
            synth_info->case_uc_graphs = num_graphs;
            break;
        case MVM_unicode_case_change_type_lower:
            synth_info->case_lc        = graphs;
            synth_info->case_lc_graphs = num_graphs;
            break;
        case MVM_unicode_case_change_type_title:
            synth_info->case_tc        = graphs;
            synth_info->case_tc_graphs = num_graphs;
            break;
        case MVM_unicode_case_change_type_fold:
            synth_info->case_fc        = graphs;
            synth_info->case_fc_graphs = num_graphs;
            break;
        default:
            MVM_panic(1, "NFG: invalid case change %d", case_);
    }
}
359
0
/* Fetches the case-changed form of a synthetic. The result for the requested
 * case type is computed on first use and cached on the synthetic's info
 * entry. Stores the cached grapheme array in *result and returns the cached
 * grapheme count for that case type. An unknown case_ value panics. */
MVMuint32 MVM_nfg_get_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMint32 case_, MVMGrapheme32 **result) {
    MVMNFGSynthetic *info = MVM_nfg_get_synthetic_info(tc, synth);

    if (case_ == MVM_unicode_case_change_type_upper) {
        if (!info->case_uc)
            compute_case_change(tc, synth, info, case_);
        *result = info->case_uc;
        return info->case_uc_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_lower) {
        if (!info->case_lc)
            compute_case_change(tc, synth, info, case_);
        *result = info->case_lc;
        return info->case_lc_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_title) {
        if (!info->case_tc)
            compute_case_change(tc, synth, info, case_);
        *result = info->case_tc;
        return info->case_tc_graphs;
    }
    else if (case_ == MVM_unicode_case_change_type_fold) {
        if (!info->case_fc)
            compute_case_change(tc, synth, info, case_);
        *result = info->case_fc;
        return info->case_fc_graphs;
    }

    /* Not one of the known case change types. */
    MVM_panic(1, "NFG: invalid case change %d", case_);
}
386
387
0
/* Returns 1 when the codepoint has a non-zero NFG_QC property and its
 * canonical combining class property is no greater than
 * MVM_UNICODE_PVALUE_CCC_0; returns 0 otherwise. */
MVM_STATIC_INLINE MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
    if (!MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NFG_QC))
        return 0;
    return MVM_unicode_codepoint_get_property_int(tc, cp,
        MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS) <= MVM_UNICODE_PVALUE_CCC_0;
}
392
/* Returns true for cps with Grapheme_Cluster_Break = Control */
393
0
MVM_STATIC_INLINE MVMint32 codepoint_GCB_Control (MVMThreadContext *tc, MVMCodepoint codepoint) {
394
0
    return MVM_unicode_codepoint_get_property_int(tc, codepoint,
395
0
        MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK)
396
0
    ==  MVM_UNICODE_PVALUE_GCB_CONTROL;
397
0
}
398
/* Returns non-zero if the result of concatenating the two strings will freely
399
 * leave us in NFG without any further effort. */
400
1.41M
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
401
1.41M
    MVMGrapheme32 last_a;
402
1.41M
    MVMGrapheme32 first_b;
403
1.41M
    MVMGrapheme32 crlf;
404
1.41M
405
1.41M
    /* If either string is empty, we're good. */
406
1.41M
    if (a->body.num_graphs == 0 || b->body.num_graphs == 0)
407
153
        return 1;
408
1.41M
409
1.41M
    /* Get first and last graphemes of the strings. */
410
1.41M
    last_a = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
411
1.41M
    first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);
412
1.41M
    /* Put the case where we are adding a lf or crlf line ending */
413
1.41M
    if (first_b == '\n')
414
1.41M
        /* If we see \r + \n we need to renormalize. Otherwise we're good */
415
13.9k
        return last_a == '\r' ? 0 : 1;
416
1.41M
417
1.39M
    crlf = MVM_nfg_crlf_grapheme(tc);
418
1.39M
    /* As a control code we are always going to break if we see one of these.
419
1.39M
     * Check first_b for speeding up line endings */
420
1.39M
    if (first_b == crlf || last_a == crlf)
421
13
        return 0;
422
1.39M
    /* If either is synthetic other than "\r\n", assume we'll have to re-normalize
423
1.39M
     * (this is an over-estimate, most likely). Note if you optimize this that it
424
1.39M
     * serves as a guard for what follows.
425
1.39M
     * TODO get the last codepoint of last_a and first codepoint of first_b and call
426
1.39M
     * MVM_unicode_normalize_should_break */
427
1.39M
    if (last_a < 0 || first_b < 0)
428
2
        return 0;
429
1.39M
430
1.39M
    /* If both less than the first significant char for NFC we are good */
431
1.39M
    if (last_a < MVM_NORMALIZE_FIRST_SIG_NFC && first_b < MVM_NORMALIZE_FIRST_SIG_NFC) {
432
1.39M
        return 1;
433
1.39M
    }
434
6.56k
    else {
435
6.56k
        /* Check if the two codepoints would be joined during normalization.
436
6.56k
         * Returns 1 if they would break and thus is safe under concat, or 0 if
437
6.56k
         * they would be joined. */
438
6.56k
        MVMNormalizer norm;
439
6.56k
        int rtrn;
440
6.56k
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
441
6.56k
        rtrn = MVM_unicode_normalize_should_break(tc, last_a, first_b, &norm);
442
6.56k
        MVM_unicode_normalizer_cleanup(tc, &norm);
443
6.56k
        /* If both CCC are non-zero then it may need to be reordered. For now return 0.
444
6.56k
         * This can be optimized. */
445
6.56k
        if (MVM_unicode_relative_ccc(tc, last_a) != 0 && MVM_unicode_relative_ccc(tc, first_b) != 0)
446
0
            return 0;
447
6.56k
        return rtrn;
448
6.56k
    }
449
1.39M
}
450
451
/* Initialize NFG subsystem. */
452
144
static void cache_crlf(MVMThreadContext *tc) {
453
144
    MVMCodepoint codes[2] = { '\r', '\n' };
454
144
    tc->instance->nfg->crlf_grapheme = lookup_or_add_synthetic(tc, codes, 2, 0);
455
144
}
456
144
/* Initializes the NFG subsystem for the VM instance: allocates the zeroed
 * NFG state, sets up the mutex that serializes updates to it, and caches
 * the "\r\n" synthetic. A failure of either the allocation or the mutex
 * setup is reported on stderr and terminates the process. */
void MVM_nfg_init(MVMThreadContext *tc) {
    int init_stat;
    MVMNFGState *nfg = calloc(1, sizeof(MVMNFGState));

    /* The state is dereferenced immediately below, so a failed allocation
     * must be caught here rather than crash on a NULL pointer. */
    if (!nfg) {
        fprintf(stderr, "MoarVM: Allocation of NFG state failed\n");
        exit(1);
    }
    tc->instance->nfg = nfg;

    if ((init_stat = uv_mutex_init(&(nfg->update_mutex))) < 0) {
        fprintf(stderr, "MoarVM: Initialization of NFG update mutex failed\n    %s\n",
            uv_strerror(init_stat));
        exit(1);
    }

    cache_crlf(tc);
}
466
467
/* Free all memory allocated to hold synthetic graphemes. These are global
468
 * to a VM instance. */
469
0
void MVM_nfg_destroy(MVMThreadContext *tc) {
470
0
    MVMNFGState *nfg = tc->instance->nfg;
471
0
    MVMint32 i;
472
0
473
0
    /* Free all synthetics. */
474
0
    if (nfg->synthetics) {
475
0
        size_t used_synths_in_block = nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS;
476
0
        size_t synths_to_free = used_synths_in_block
477
0
            ? nfg->num_synthetics + (MVM_SYNTHETIC_GROW_ELEMS - used_synths_in_block)
478
0
            : nfg->num_synthetics;
479
0
480
0
        for (i = 0; i < nfg->num_synthetics; i++) {
481
0
            MVM_fixed_size_free(tc, tc->instance->fsa,
482
0
                nfg->synthetics[i].num_codes * sizeof(MVMCodepoint),
483
0
                nfg->synthetics[i].codes);
484
0
            if (nfg->synthetics[i].case_uc != CASE_UNCHANGED)
485
0
                MVM_free(nfg->synthetics[i].case_uc);
486
0
            if (nfg->synthetics[i].case_lc != CASE_UNCHANGED)
487
0
                    MVM_free(nfg->synthetics[i].case_lc);
488
0
            if (nfg->synthetics[i].case_tc != CASE_UNCHANGED)
489
0
                MVM_free(nfg->synthetics[i].case_tc);
490
0
            if (nfg->synthetics[i].case_fc != CASE_UNCHANGED)
491
0
                MVM_free(nfg->synthetics[i].case_fc);
492
0
        }
493
0
494
0
        MVM_fixed_size_free(tc, tc->instance->fsa,
495
0
            synths_to_free * sizeof(MVMNFGSynthetic),
496
0
            nfg->synthetics);
497
0
    }
498
0
499
0
    MVM_free(nfg);
500
0
}