Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/nfg.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Number of extra elements we add to the synthetics table each time we need
4
 * to grow it. */
5
260
#define MVM_SYNTHETIC_GROW_ELEMS 32
6
7
/* Finds the index of a given codepoint within a trie node. Returns it if
8
 * there is one, or negative if there is not (note 0 is a valid index). */
9
426k
static MVMint32 find_child_node_idx(MVMThreadContext *tc, const MVMNFGTrieNode *node, MVMCodepoint cp) {
10
426k
    if (node) {
11
426k
        /* TODO: update this to do a binary search later on. */
12
426k
        MVMint32 i;
13
426k
        for (i = 0; i < node->num_entries; i++)
14
426k
            if (node->next_codes[i].code == cp)
15
426k
                return i;
16
426k
    }
17
260
    return -1;
18
426k
}
19
20
/* Does a lookup in the trie for a synthetic for the specified codepoints. */
21
426k
MVMNFGTrieNode * find_child_node(MVMThreadContext *tc, const MVMNFGTrieNode *node, MVMCodepoint cp) {
22
426k
    MVMint32 idx = find_child_node_idx(tc, node, cp);
23
426k
    return idx >= 0 ? node->next_codes[idx].node : NULL;
24
426k
}
25
213k
static MVMGrapheme32 lookup_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
26
213k
    MVMNFGTrieNode *cur_node        = tc->instance->nfg->grapheme_lookup;
27
213k
    MVMCodepoint   *cur_code        = codes;
28
213k
    MVMint32        codes_remaining = num_codes;
29
639k
    while (cur_node && codes_remaining) {
30
426k
        cur_node = find_child_node(tc, cur_node, *cur_code);
31
426k
        cur_code++;
32
426k
        codes_remaining--;
33
426k
    }
34
213k
    return cur_node ? cur_node->graph : 0;
35
213k
}
36
37
/* Recursive algorithm to add to the trie. Descends existing trie nodes so far
38
 * as we have them following the code points, then passes on a NULL for the
39
 * levels of current below that do not exist. Once we bottom out, makes a copy
40
 * of or creates a node for the synthetic. As we walk back up we create or
41
 * copy+tweak nodes until we have produced a new trie, re-using what we can of
42
 * the existing one. */
43
390
static MVMNFGTrieNode * twiddle_trie_node(MVMThreadContext *tc, MVMNFGTrieNode *current, MVMCodepoint *cur_code, MVMint32 codes_remaining, MVMGrapheme32 synthetic) {
44
390
    /* Make a new empty node, which we'll maybe copy some things from the
45
390
     * current node into. */
46
390
    MVMNFGTrieNode *new_node = MVM_fixed_size_alloc(tc, tc->instance->fsa, sizeof(MVMNFGTrieNode));
47
390
48
390
    /* If we've more codes remaining... */
49
390
    if (codes_remaining > 0) {
50
260
        /* Recurse, to get a new child node. */
51
260
        MVMint32 idx = find_child_node_idx(tc, current, *cur_code);
52
260
        MVMNFGTrieNode *new_child = twiddle_trie_node(tc,
53
0
            idx >= 0 ? current->next_codes[idx].node : NULL,
54
260
            cur_code + 1, codes_remaining - 1, synthetic);
55
260
56
260
        /* If we had an existing child node... */
57
260
        if (idx >= 0) {
58
0
            /* Make a copy of the next_codes list. */
59
0
            size_t the_size = current->num_entries * sizeof(MVMNFGTrieNodeEntry);
60
0
            MVMNFGTrieNodeEntry *new_next_codes = MVM_fixed_size_alloc(tc,
61
0
                tc->instance->fsa, the_size);
62
0
            memcpy(new_next_codes, current->next_codes, the_size);
63
0
64
0
            /* Update the copy to point to the new child. */
65
0
            new_next_codes[idx].node = new_child;
66
0
67
0
            /* Install the new next_codes list in the new node, and free the
68
0
             * existing child list at the next safe point. */
69
0
            new_node->num_entries = current->num_entries;
70
0
            new_node->next_codes  = new_next_codes;
71
0
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, the_size,
72
0
                current->next_codes);
73
0
        }
74
260
75
260
        /* Otherwise, we're going to need to insert the new child into a
76
260
         * (possibly existing) child list. */
77
260
        else {
78
260
            /* Calculate new child node list size and allocate it. */
79
260
            MVMint32 orig_entries = current ? current->num_entries : 0;
80
260
            MVMint32 new_entries  = orig_entries + 1;
81
260
            size_t new_size       = new_entries * sizeof(MVMNFGTrieNodeEntry);
82
260
            MVMNFGTrieNodeEntry *new_next_codes = MVM_fixed_size_alloc(tc,
83
260
                tc->instance->fsa, new_size);
84
260
85
260
            /* Go through original entries, copying those that are for a lower
86
260
             * code point than the one we're inserting a child for. */
87
260
            MVMint32 insert_pos = 0;
88
260
            MVMint32 orig_pos   = 0;
89
260
            while (orig_pos < orig_entries && current->next_codes[orig_pos].code < *cur_code)
90
0
                new_next_codes[insert_pos++] = current->next_codes[orig_pos++];
91
260
92
260
            /* Insert the new child. */
93
260
            new_next_codes[insert_pos].code = *cur_code;
94
260
            new_next_codes[insert_pos].node = new_child;
95
260
            insert_pos++;
96
260
97
260
            /* Copy the rest. */
98
260
            while (orig_pos < orig_entries)
99
0
                new_next_codes[insert_pos++] = current->next_codes[orig_pos++];
100
260
101
260
            /* Install the new next_codes list in the new node, and free any
102
260
             * existing child list at the next safe point. */
103
260
            new_node->num_entries = new_entries;
104
260
            new_node->next_codes  = new_next_codes;
105
260
            if (orig_entries)
106
0
                MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa,
107
0
                    orig_entries * sizeof(MVMNFGTrieNodeEntry),
108
0
                    current->next_codes);
109
260
        }
110
260
111
260
        /* Always need to copy synthetic set on the existing node also;
112
260
         * otherwise make sure to clear it. */
113
260
        new_node->graph = current ? current->graph : 0;
114
260
    }
115
390
116
390
    /* Otherwise, we reached the point where we need to install the synthetic.
117
390
     * If we already had a node here, we re-use the children of it. */
118
130
    else {
119
130
        new_node->graph = synthetic;
120
130
        if (current) {
121
0
            new_node->num_entries = current->num_entries;
122
0
            new_node->next_codes  = current->next_codes;
123
0
        }
124
130
        else {
125
130
            new_node->num_entries = 0;
126
130
            new_node->next_codes  = NULL;
127
130
        }
128
130
    }
129
390
130
390
    /* Free any existing node at next safe point, return the new one. */
131
390
    if (current)
132
0
        MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa,
133
0
            sizeof(MVMNFGTrieNode), current);
134
390
    return new_node;
135
390
}
136
130
static void add_synthetic_to_trie(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMGrapheme32 synthetic) {
137
130
    MVMNFGState    *nfg      = tc->instance->nfg;
138
130
    MVMNFGTrieNode *new_trie = twiddle_trie_node(tc, nfg->grapheme_lookup, codes, num_codes, synthetic);
139
130
    MVM_barrier();
140
130
    nfg->grapheme_lookup = new_trie;
141
130
}
142
143
/* Assumes that we are holding the lock that serializes updates, and already
144
 * checked that the synthetic does not exist. Adds it to the lookup trie and
145
 * synthetics table, making sure to do enough copy/free-at-safe-point work to
146
 * not upset other threads possibly doing concurrent reads. */
147
130
static MVMGrapheme32 add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
148
130
    MVMNFGState     *nfg = tc->instance->nfg;
149
130
    MVMNFGSynthetic *synth;
150
130
    MVMGrapheme32    result;
151
130
    size_t           comb_size;
152
130
153
130
    /* Grow the synthetics table if needed. */
154
130
    if (nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS == 0) {
155
130
        size_t orig_size = nfg->num_synthetics * sizeof(MVMNFGSynthetic);
156
130
        size_t new_size  = (nfg->num_synthetics + MVM_SYNTHETIC_GROW_ELEMS) * sizeof(MVMNFGSynthetic);
157
130
        MVMNFGSynthetic *new_synthetics = MVM_fixed_size_alloc(tc, tc->instance->fsa, new_size);
158
130
        if (orig_size) {
159
0
            memcpy(new_synthetics, nfg->synthetics, orig_size);
160
0
            MVM_fixed_size_free_at_safepoint(tc, tc->instance->fsa, orig_size, nfg->synthetics);
161
0
        }
162
130
        nfg->synthetics = new_synthetics;
163
130
    }
164
130
165
130
    /* Set up the new synthetic entry. */
166
130
    synth            = &(nfg->synthetics[nfg->num_synthetics]);
167
130
    synth->base      = *codes;
168
130
    synth->num_combs = num_codes - 1;
169
130
    comb_size        = synth->num_combs * sizeof(MVMCodepoint);
170
130
    synth->combs     = MVM_fixed_size_alloc(tc, tc->instance->fsa, comb_size);
171
130
    memcpy(synth->combs, codes + 1, comb_size);
172
130
    synth->case_uc    = 0;
173
130
    synth->case_lc    = 0;
174
130
    synth->case_tc    = 0;
175
130
    synth->case_fc    = 0;
176
130
    synth->is_utf8_c8 = utf8_c8;
177
130
178
130
    /* Memory barrier to make sure the synthetic is fully in place before we
179
130
     * bump the count. */
180
130
    MVM_barrier();
181
130
    nfg->num_synthetics++;
182
130
183
130
    /* Give the synthetic an ID by negating the new number of synthetics. */
184
130
    result = -nfg->num_synthetics;
185
130
186
130
    /* Make an entry in the lookup trie for the new synthetic, so we can use
187
130
     * it in the future when seeing the same codepoint sequence. */
188
130
    add_synthetic_to_trie(tc, codes, num_codes, result);
189
130
190
130
    return result;
191
130
}
192
193
/* Does a lookup of a synthetic in the trie. If we find one, returns it. If
194
 * not, acquires the update lock, re-checks that we really are missing the
195
 * synthetic, and then adds it. */
196
213k
static MVMGrapheme32 lookup_or_add_synthetic(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes, MVMint32 utf8_c8) {
197
213k
    MVMGrapheme32 result = lookup_synthetic(tc, codes, num_codes);
198
213k
    if (!result) {
199
130
        uv_mutex_lock(&tc->instance->nfg->update_mutex);
200
130
        result = lookup_synthetic(tc, codes, num_codes);
201
130
        if (!result)
202
130
            result = add_synthetic(tc, codes, num_codes, utf8_c8);
203
130
        uv_mutex_unlock(&tc->instance->nfg->update_mutex);
204
130
    }
205
213k
    return result;
206
213k
}
207
208
/* Takes one or more code points. If only one code point is passed, it is
209
 * returned as the grapheme. Otherwise, resolves it to a synthetic - either an
210
 * already existing one if we saw it before, or a new one if not.  Assumes
211
 * that the code points are already in NFC, and as such canonical ordering has
212
 * been applied. */
213
30.8k
MVMGrapheme32 MVM_nfg_codes_to_grapheme(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
214
30.8k
    if (num_codes == 1)
215
30.6k
        return codes[0];
216
179
    else if (num_codes < MVM_GRAPHEME_MAX_CODEPOINTS)
217
179
        return lookup_or_add_synthetic(tc, codes, num_codes, 0);
218
179
    else
219
0
        MVM_exception_throw_adhoc(tc, "Too many codepoints (%d) in grapheme", num_codes);
220
30.8k
}
221
222
/* Does the same as MVM_nfg_codes_to_grapheme, but flags the added grapheme as
223
 * being an UTF8-C8 synthetic. */
224
0
MVMGrapheme32 MVM_nfg_codes_to_grapheme_utf8_c8(MVMThreadContext *tc, MVMCodepoint *codes, MVMint32 num_codes) {
225
0
    if (num_codes == 1)
226
0
        return codes[0];
227
0
    else
228
0
        return lookup_or_add_synthetic(tc, codes, num_codes, 1);
229
0
}
230
231
/* Gets the \r\n synthetic. */
232
213k
MVMGrapheme32 MVM_nfg_crlf_grapheme(MVMThreadContext *tc) {
233
213k
    MVMCodepoint codes[2] = { '\r', '\n' };
234
213k
    return lookup_or_add_synthetic(tc, codes, 2, 0);
235
213k
}
236
237
/* Does a lookup of information held about a synthetic. The synth parameter
238
 * must be a synthetic codepoint (that is, negative). The memory returned is
239
 * not to be freed by the caller; it also is only valid until the next GC
240
 * safe point. */
241
18
MVMNFGSynthetic * MVM_nfg_get_synthetic_info(MVMThreadContext *tc, MVMGrapheme32 synth) {
242
18
    MVMNFGState *nfg       = tc->instance->nfg;
243
18
    MVMint32     synth_idx = -synth - 1;
244
18
    if (synth >= 0)
245
0
        MVM_panic(1, "MVM_nfg_get_synthetic_info illegally called on codepoint >= 0");
246
18
    if (synth_idx >= nfg->num_synthetics)
247
0
        MVM_panic(1, "MVM_nfg_get_synthetic_info called with out-of-range synthetic");
248
18
    return &(nfg->synthetics[synth_idx]);
249
18
}
250
251
/* Gets the cached case change if we already computed it, or computes it if
252
 * this is the first time we're using it. */
253
static MVMGrapheme32 CASE_UNCHANGED[1] = {0};
254
0
static void compute_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMNFGSynthetic *synth_info, MVMint32 case_) {
255
0
    MVMGrapheme32 *result;
256
0
    MVMint32 num_result_graphs;
257
0
258
0
    /* Transform the base character. */
259
0
    const MVMCodepoint *result_cps;
260
0
    MVMuint32     num_result_cps = MVM_unicode_get_case_change(tc, synth_info->base,
261
0
        case_, &result_cps);
262
0
    if (num_result_cps == 0 || *result_cps == synth_info->base) {
263
0
        /* Base character does not change, so grapheme stays the same. We
264
0
         * install a non-null sentinel for this case, and set the result
265
0
         * grapheme count to zero, which indicates no change. */
266
0
        result = CASE_UNCHANGED;
267
0
        num_result_graphs = 0;
268
0
    }
269
0
    else {
270
0
        /* We can potentially get multiple graphemes back. We may also get
271
0
         * into situations where we case change the base and suddenly we
272
0
         * can normalize the whole thing to a non-synthetic. So, we take
273
0
         * a trip through the normalizer. Note we push the first thing
274
0
         * we get back from the case change, then our combiners, and
275
0
         * finally anything else the case change produced. This should
276
0
         * do about the right thing for both case changes that produce a
277
0
         * base and a combiner, and those that produce a base and a base,
278
0
         * since the normalizer applies Unicode canonical sorting. */
279
0
        MVMNormalizer norm;
280
0
        MVMint32 i;
281
0
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
282
0
        MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, 1);
283
0
        MVM_unicode_normalizer_push_codepoints(tc, &norm, synth_info->combs,
284
0
            synth_info->num_combs);
285
0
        if (num_result_cps > 1)
286
0
            MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps + 1,
287
0
                num_result_cps - 1);
288
0
        MVM_unicode_normalizer_eof(tc, &norm);
289
0
290
0
        num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
291
0
        result = MVM_malloc(num_result_graphs * sizeof(MVMGrapheme32));
292
0
        for (i = 0; i < num_result_graphs; i++)
293
0
            result[i] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
294
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
295
0
    }
296
0
297
0
    switch (case_) {
298
0
    case MVM_unicode_case_change_type_upper:
299
0
        synth_info->case_uc = result;
300
0
        synth_info->case_uc_graphs = num_result_graphs;
301
0
        break;
302
0
    case MVM_unicode_case_change_type_lower:
303
0
        synth_info->case_lc = result;
304
0
        synth_info->case_lc_graphs = num_result_graphs;
305
0
        break;
306
0
    case MVM_unicode_case_change_type_title:
307
0
        synth_info->case_tc = result;
308
0
        synth_info->case_tc_graphs = num_result_graphs;
309
0
        break;
310
0
    case MVM_unicode_case_change_type_fold:
311
0
        synth_info->case_fc = result;
312
0
        synth_info->case_fc_graphs = num_result_graphs;
313
0
        break;
314
0
    default:
315
0
        MVM_panic(1, "NFG: invalid case change %d", case_);
316
0
    }
317
0
}
318
0
MVMuint32 MVM_nfg_get_case_change(MVMThreadContext *tc, MVMGrapheme32 synth, MVMint32 case_, MVMGrapheme32 **result) {
319
0
    MVMNFGSynthetic *synth_info = MVM_nfg_get_synthetic_info(tc, synth);
320
0
    switch (case_) {
321
0
    case MVM_unicode_case_change_type_upper:
322
0
        if (!synth_info->case_uc)
323
0
            compute_case_change(tc, synth, synth_info, case_);
324
0
        *result = synth_info->case_uc;
325
0
        return synth_info->case_uc_graphs;
326
0
    case MVM_unicode_case_change_type_lower:
327
0
        if (!synth_info->case_lc)
328
0
            compute_case_change(tc, synth, synth_info, case_);
329
0
        *result = synth_info->case_lc;
330
0
        return synth_info->case_lc_graphs;
331
0
    case MVM_unicode_case_change_type_title:
332
0
        if (!synth_info->case_tc)
333
0
            compute_case_change(tc, synth, synth_info, case_);
334
0
        *result = synth_info->case_tc;
335
0
        return synth_info->case_tc_graphs;
336
0
    case MVM_unicode_case_change_type_fold:
337
0
        if (!synth_info->case_fc)
338
0
            compute_case_change(tc, synth, synth_info, case_);
339
0
        *result = synth_info->case_fc;
340
0
        return synth_info->case_fc_graphs;
341
0
    default:
342
0
        MVM_panic(1, "NFG: invalid case change %d", case_);
343
0
    }
344
0
}
345
346
12.0k
MVM_STATIC_INLINE MVMint32 passes_quickcheck_and_zero_ccc(MVMThreadContext *tc, MVMCodepoint cp) {
347
12.0k
    return MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NFG_QC)
348
12.0k
    &&     MVM_unicode_codepoint_get_property_int(tc, cp,
349
12.0k
               MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS) <= MVM_UNICODE_PVALUE_CCC_0;
350
12.0k
}
351
/* Returns true for cps with Grapheme_Cluster_Break = Control */
352
12.0k
MVM_STATIC_INLINE MVMint32 codepoint_GCB_Control (MVMThreadContext *tc, MVMCodepoint codepoint) {
353
12.0k
    return MVM_unicode_codepoint_get_property_int(tc, codepoint,
354
12.0k
        MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK)
355
12.0k
    ==  MVM_UNICODE_PVALUE_GCB_CONTROL;
356
12.0k
}
357
/* Returns non-zero if the result of concatenating the two strings will freely
358
 * leave us in NFG without any further effort. */
359
182k
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
360
182k
    MVMGrapheme32 last_a;
361
182k
    MVMGrapheme32 first_b;
362
182k
    MVMGrapheme32 crlf;
363
182k
364
182k
    /* If either string is empty, we're good. */
365
182k
    if (a->body.num_graphs == 0 || b->body.num_graphs == 0)
366
142
        return 1;
367
182k
368
182k
    /* Get first and last graphemes of the strings. */
369
182k
    last_a = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
370
182k
    first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);
371
182k
372
182k
    crlf = MVM_nfg_crlf_grapheme(tc);
373
182k
374
182k
    /* If either is synthetic other than "\r\n", assume we'll have to re-normalize
375
182k
     * (this is an over-estimate, most likely). Note if you optimize this that it
376
182k
     * serves as a guard for what follows. */
377
182k
    if ((last_a != crlf && last_a < 0) || (first_b != crlf && first_b < 0))
378
0
        return 0;
379
182k
    /* If last_a is \r and first_b is \n then we need to renormalize */
380
182k
    if (last_a == '\r' && first_b == '\n')
381
33
        return 0;
382
182k
383
182k
    /* If both less than the first significant char for NFC we are good */
384
182k
    if (last_a < MVM_NORMALIZE_FIRST_SIG_NFC && first_b < MVM_NORMALIZE_FIRST_SIG_NFC)
385
175k
        return 1;
386
182k
387
182k
    /* If either fail quickcheck or have ccc > 0, and it does not have
388
182k
     * Grapheme_Cluster_Break=Control we have to re-normalize */
389
6.03k
    return (last_a == crlf || codepoint_GCB_Control(tc, last_a) || passes_quickcheck_and_zero_ccc(tc, last_a))
390
6.03k
        && (first_b == crlf || codepoint_GCB_Control(tc, first_b) || passes_quickcheck_and_zero_ccc(tc, first_b));
391
182k
}
392
393
/* Free all memory allocated to hold synthetic graphemes. These are global
394
 * to a VM instance. */
395
0
void MVM_nfg_destroy(MVMThreadContext *tc) {
396
0
    MVMNFGState *nfg = tc->instance->nfg;
397
0
    MVMint32 i;
398
0
399
0
    /* Free all synthetics. */
400
0
    if (nfg->synthetics) {
401
0
        size_t used_synths_in_block = nfg->num_synthetics % MVM_SYNTHETIC_GROW_ELEMS;
402
0
        size_t synths_to_free = used_synths_in_block
403
0
            ? nfg->num_synthetics + (MVM_SYNTHETIC_GROW_ELEMS - used_synths_in_block)
404
0
            : nfg->num_synthetics;
405
0
406
0
        for (i = 0; i < nfg->num_synthetics; i++) {
407
0
            MVM_fixed_size_free(tc, tc->instance->fsa,
408
0
                nfg->synthetics[i].num_combs * sizeof(MVMCodepoint),
409
0
                nfg->synthetics[i].combs);
410
0
            if (nfg->synthetics[i].case_uc != CASE_UNCHANGED)
411
0
                MVM_free(nfg->synthetics[i].case_uc);
412
0
            if (nfg->synthetics[i].case_lc != CASE_UNCHANGED)
413
0
                    MVM_free(nfg->synthetics[i].case_lc);
414
0
            if (nfg->synthetics[i].case_tc != CASE_UNCHANGED)
415
0
                MVM_free(nfg->synthetics[i].case_tc);
416
0
            if (nfg->synthetics[i].case_fc != CASE_UNCHANGED)
417
0
                MVM_free(nfg->synthetics[i].case_fc);
418
0
        }
419
0
420
0
        MVM_fixed_size_free(tc, tc->instance->fsa,
421
0
            synths_to_free * sizeof(MVMNFGSynthetic),
422
0
            nfg->synthetics);
423
0
    }
424
0
425
0
    MVM_free(nfg);
426
0
}