Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/ops.c
Line
Count
Source (jump to first uncovered line)
1
#include "platform/memmem.h"
2
#include "platform/memmem32.h"
3
#include "moar.h"
4
#define MVM_DEBUG_STRANDS 0
5
9.29k
#define MVM_string_KMP_max_pattern_length 8192
6
/* Max value possible for MVMuint32 MVMStringBody.num_graphs */
7
1.39M
#define MAX_GRAPHEMES     0xFFFFFFFFLL
8
9
#if MVM_DEBUG_STRANDS
10
static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) {
11
    MVMGraphemeIter gi;
12
    MVMuint32       len;
13
    MVM_string_gi_init(tc, &gi, s);
14
    len = 0;
15
    while (MVM_string_gi_has_more(tc, &gi)) {
16
        MVM_string_gi_get_grapheme(tc, &gi);
17
        len++;
18
    }
19
    if (len != MVM_string_graphs(tc, s))
20
        MVM_exception_throw_adhoc(tc,
21
            "Strand sanity check failed (strand length %d != num_graphs %d)",
22
            len, MVM_string_graphs(tc, s));
23
}
24
#define STRAND_CHECK(tc, s) check_strand_sanity(tc, s);
25
#else
26
#define STRAND_CHECK(tc, s)
27
#endif
28
29
static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in);
30
#if MVM_DEBUG_NFG
31
static char * NFG_check_make_debug_string (MVMThreadContext *tc, MVMGrapheme32 g) {
32
    char *result = NULL;
33
    char *picked = NULL;
34
    if (g == '\r')
35
        picked = "\\r";
36
    else if (g == '\n')
37
        picked = "\\n";
38
    else if (g == MVM_nfg_crlf_grapheme(tc))
39
        picked = "\\r\\n";
40
    else if (0 <= g && !MVM_string_is_control_full(tc, g))
41
        result = MVM_string_utf8_encode_C_string(tc, MVM_string_chr(tc, g));
42
    else if (g < 0) {
43
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
44
        char *format_str = " with num_codes = ";
45
        char *format_str2 = " first, second cp = ";
46
        char *synthtype_str = synth->is_utf8_c8 ?  "utf8-8 Synthetic" :  "Normal Synthetic";
47
        int this_len = strlen(format_str) + strlen(synthtype_str) + 6 + strlen(format_str2) + 11 + 1 + 11 + 1;
48
        result = MVM_malloc(this_len);
49
        if (2 <= synth->num_codes)
50
            sprintf(result, "%s%s%5i%s%.10"PRIi32",%.10"PRIi32"", synthtype_str, format_str, synth->num_codes, format_str2, synth->codes[0], synth->codes[1]);
51
        else
52
            sprintf(result, "WARNING synth has less than 2 codes");
53
        fprintf(stderr, "synth numcodes %i %"PRIi32"\n",
54
            MVM_nfg_get_synthetic_info(tc, synth->codes[1])->num_codes, MVM_nfg_get_synthetic_info(tc, synth->codes[1])->codes[0]
55
        );
56
    }
57
    else
58
        picked = "[Control]";
59
    if (picked) {
60
        result = MVM_malloc(sizeof(char) * (strlen(picked) + 1));
61
        strcpy(result, picked);
62
    }
63
    if (!result) {
64
        result = MVM_malloc(sizeof(char) * 1);
65
        result[0] = 0;
66
    }
67
    return result;
68
}
69
static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname);
70
void NFG_check (MVMThreadContext *tc, MVMString *orig, char *varname) {
71
    char *out = NFG_checker(tc, orig, varname);
72
    char *waste[2] = { out, NULL };
73
    if (!out)
74
        return;
75
    MVM_exception_throw_adhoc_free(tc, waste, "%s", out);
76
}
77
static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname) {
78
    MVMString *renorm = NULL;
79
    MVMStringIndex orig_graphs = MVM_string_graphs(tc, orig),
80
                   renorm_graphs = -1;
81
    MVMROOT2(tc, orig, renorm, {
82
        renorm = re_nfg(tc, orig);
83
        renorm_graphs = MVM_string_graphs(tc, renorm);
84
    });
85
    if (MVM_DEBUG_NFG_STRICT || orig_graphs != renorm_graphs) {
86
        MVMGraphemeIter orig_gi, renorm_gi;
87
        MVMint64 index = 0;
88
        MVM_string_gi_init(tc, &orig_gi, orig);
89
        MVM_string_gi_init(tc, &renorm_gi, renorm);
90
        while (MVM_string_gi_has_more(tc,  &orig_gi) && MVM_string_gi_has_more(tc,  &renorm_gi)) {
91
            MVMGrapheme32 orig_g   = MVM_string_gi_get_grapheme(tc, &orig_gi),
92
                          renorm_g = MVM_string_gi_get_grapheme(tc, &renorm_gi);
93
            if (orig_g != renorm_g) {
94
                char *orig_render   = NFG_check_make_debug_string(tc, orig_g),
95
                     *renorm_render = NFG_check_make_debug_string(tc, renorm_g);
96
                char *format = "NFG failure. Got different grapheme count of %s. "
97
                    "Got %i but after re_nfg got %i\n"
98
                        "Differing grapheme at index %"PRIi64"\n"
99
                            "orig: %"PRIi32"  (%s)  after re_nfg: %"PRIi32"  (%s)\n";
100
                int out_size = strlen(orig_render) + strlen(renorm_render)
101
                    + strlen(varname) + strlen(format) + (5 * 7) + 1;
102
                char *out = MVM_malloc(sizeof(char) * out_size);
103
                char *waste[] = {orig_render, renorm_render, NULL};
104
                char **w = waste;
105
                snprintf(out, out_size,
106
                    format,
107
                    varname,
108
                        orig_graphs, renorm_graphs,
109
                            index,
110
                                orig_g, orig_render, renorm_g, renorm_render);
111
                MVM_free(orig_render);
112
                MVM_free(renorm_render);
113
                return out;
114
            }
115
            index++;
116
        }
117
    }
118
    return NULL;
119
}
120
void NFG_check_concat (MVMThreadContext *tc, MVMString *result, MVMString *a, MVMString *b, char *varname) {
121
    char *a_out = NFG_checker(tc, a, "string ‘a’");
122
    char *b_out = NFG_checker(tc, b, "string ‘b’");
123
    char *out = NFG_checker(tc, result, varname);
124
    char *strings[] = { a_out, b_out, out };
125
    char *names[]   = { "\nconcat string ‘a’: ", "\nconcat string ‘b’: ", "\nconcat result: " };
126
    int i = 0, elems = 4;
127
    int rtrn = 0;
128
    char * empty = "";
129
    if (!a_out && !b_out && !out)
130
        return;
131
    else {
132
        MVMGrapheme32 last_a  =  MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1),
133
                      first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);
134
        char   *debug_a = NFG_check_make_debug_string(tc, last_a),
135
               *debug_b = NFG_check_make_debug_string(tc, first_b),
136
             *escaped_a = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, a)),
137
             *escaped_b = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, b)),
138
        *escaped_result = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, result));
139
        char *waste[] = { out, debug_a, debug_b, escaped_a, escaped_b, escaped_result, NULL };
140
        MVM_exception_throw_adhoc_free(tc, waste,
141
            "In concat: a graphs: %"PRIi32" b graphs: %"PRIi32"\n"
142
            "last_a: %"PRIi32" (%s)  first_b %"PRIi32"  (%s)\n"
143
            "a: “%s”\n"
144
            "b: “%s”\n"
145
            "result: “%s”\n"
146
            "%s%s%s%s%s%s",
147
            MVM_string_graphs(tc, a), MVM_string_graphs(tc, b),
148
            last_a, debug_a, first_b, debug_b,
149
            escaped_a,
150
            escaped_b,
151
            escaped_result,
152
            (a_out?names[0]:""), (a_out?a_out:""),
153
            (b_out?names[1]:""), (b_out?b_out:""),
154
            (out?names[2]:""), (out?out:""));
155
        }
156
157
158
}
159
#endif
160
161
MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic);
162
static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset);
163
164
/* Allocates strand storage. */
165
1.70M
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) {
166
1.70M
    return MVM_malloc(num_strands * sizeof(MVMStringStrand));
167
1.70M
}
168
169
/* Copies strands from one strand string to another. */
170
static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
171
1.36M
        MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
172
1.36M
    assert(from->body.storage_type == MVM_STRING_STRAND);
173
1.36M
    assert(to->body.storage_type == MVM_STRING_STRAND);
174
1.36M
    memcpy(
175
1.36M
        to->body.storage.strands + to_offset,
176
1.36M
        from->body.storage.strands + from_offset,
177
1.36M
        num_strands * sizeof(MVMStringStrand));
178
1.36M
}
179
/* Move strands inside the same strand string. */
180
static void move_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
181
26
        MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
182
26
    assert(from->body.storage_type == MVM_STRING_STRAND);
183
26
    assert(to->body.storage_type == MVM_STRING_STRAND);
184
26
    memmove(
185
26
        to->body.storage.strands + to_offset,
186
26
        from->body.storage.strands + from_offset,
187
26
        num_strands * sizeof(MVMStringStrand));
188
26
}
189
190
156k
#define can_fit_into_8bit(g) ((-128 <= (g) && (g) <= 127))
191
192
0
MVM_STATIC_INLINE int can_fit_into_ascii (MVMGrapheme32 g) {
193
0
    return 0 <= g && g <= 127;
194
0
}
195
/* If a string is currently using 32bit storage, turn it into using
196
 * 8 bit storage. Doesn't do any checks at all. */
197
561
static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) {
198
561
    MVMGrapheme32 *old_buf = str->body.storage.blob_32;
199
561
    MVMStringIndex i;
200
561
    MVMGrapheme8 *dest_buf = NULL;
201
561
    MVMStringIndex num_graphs = MVM_string_graphs_nocheck(tc, str);
202
561
    str->body.storage_type   = MVM_STRING_GRAPHEME_8;
203
561
    dest_buf = str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8));
204
561
    MVM_VECTORIZE_LOOP
205
6.79k
    for (i = 0; i < num_graphs; i++) {
206
6.22k
        dest_buf[i] = old_buf[i];
207
6.22k
    }
208
561
209
561
    MVM_free(old_buf);
210
561
}
211
/* Checks if the next num_graphs graphemes in the iterator can fit into 8 bits.
212
 * This was written to take advantage of SIMD vectorization, so we use a multiple
213
 * bitwise operations to check, and biwise OR it with val. Care must be taken
214
 * to not use any variables altered by the loop outside of the loop and to not
215
 * have any branching or funcion calls. `i` is not used outside the loop
216
 * `val` is allowed as biwise OR works with the vectorization well.
217
 * NOTE: GraphemeIter is not modified by this function. */
218
6.64k
static int string_can_be_8bit(MVMThreadContext *tc, MVMGraphemeIter *gi_orig, MVMStringIndex num_graphs) {
219
6.64k
    MVMStringIndex pos = 0;
220
6.64k
    MVMGraphemeIter gi;
221
6.64k
    memcpy(&gi, gi_orig, sizeof(MVMGraphemeIter));
222
28.2k
    while (1) {
223
28.2k
        MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, &gi);
224
28.2k
        MVMStringIndex togo = num_graphs - pos < strand_len
225
2.96k
            ? num_graphs - pos
226
25.2k
            : strand_len;
227
28.2k
        if (MVM_string_gi_blob_type(tc, &gi) == MVM_STRING_GRAPHEME_32) {
228
3.61k
            if (!MVM_string_buf32_can_fit_into_8bit(MVM_string_gi_active_blob_32_pos(tc, &gi), togo))
229
5
                return 0;
230
3.61k
        }
231
28.2k
        pos += togo;
232
28.2k
        if (num_graphs == pos || !MVM_string_gi_has_more_strands_rep(tc, &gi)) {
233
6.64k
            break;
234
6.64k
        }
235
21.5k
        MVM_string_gi_next_strand_rep(tc, &gi);
236
21.5k
    }
237
6.64k
    return 1;
238
6.64k
239
6.64k
}
240
/* Accepts an allocated string that should have body.num_graphs set but the blob
241
 * unallocated. This function will allocate the space for the blob and iterate
242
 * the supplied grapheme iterator for the length of body.num_graphs. Very fast
243
 * since compilers will convert them to SIMD vector operations. */
244
6.64k
static void iterate_gi_into_string(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *result, MVMString *orig, MVMStringIndex num) {
245
6.64k
    int result_pos = 0;
246
6.64k
    MVMGrapheme8   *result8   = NULL;
247
6.64k
    MVMGrapheme32 *result32   = NULL;
248
6.64k
    MVMStringIndex result_graphs = MVM_string_graphs_nocheck(tc, result);
249
6.64k
    if (!result_graphs)
250
0
        return;
251
6.64k
252
6.64k
    if (string_can_be_8bit(tc, gi, result_graphs)) {
253
6.64k
        MVMStringIndex result_pos = 0;
254
6.64k
        result->body.storage_type = MVM_STRING_GRAPHEME_8;
255
6.64k
        result8 = result->body.storage.blob_8 =
256
6.64k
            MVM_malloc(result_graphs * sizeof(MVMGrapheme8));
257
8.21k
        while (1) {
258
8.21k
            MVMStringIndex strand_len =
259
8.21k
                MVM_string_gi_graphs_left_in_strand(tc, gi);
260
8.21k
            MVMStringIndex to_copy = result_graphs - result_pos < strand_len
261
2.96k
                ? result_graphs - result_pos
262
5.25k
                : strand_len;
263
8.21k
            MVMGrapheme8  *result_blob8 = result8 + result_pos;
264
8.21k
            switch (MVM_string_gi_blob_type(tc, gi)) {
265
3.60k
            case MVM_STRING_GRAPHEME_32: {
266
3.60k
                MVMStringIndex i;
267
3.60k
                MVMGrapheme32 *active_blob =
268
3.60k
                    MVM_string_gi_active_blob_32_pos(tc, gi);
269
3.60k
                MVM_VECTORIZE_LOOP
270
7.92k
                for (i = 0; i < to_copy; i++) {
271
4.31k
                    result_blob8[i] = active_blob[i];
272
4.31k
                }
273
3.60k
                break;
274
3.60k
            }
275
4.61k
            case MVM_STRING_GRAPHEME_8:
276
4.61k
            case MVM_STRING_GRAPHEME_ASCII: {
277
4.61k
                memcpy(
278
4.61k
                    result_blob8,
279
4.61k
                    MVM_string_gi_active_blob_8_pos(tc, gi),
280
4.61k
                    to_copy * sizeof(MVMGrapheme8)
281
4.61k
                );
282
4.61k
                break;
283
4.61k
            }
284
0
            default:
285
0
                MVM_exception_throw_adhoc(tc,
286
0
                    "Internal error, string corruption in iterate_gi_into_string\n");
287
8.21k
            }
288
8.21k
            result_pos += to_copy;
289
8.21k
            if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) {
290
6.64k
                break;
291
6.64k
            }
292
1.57k
            MVM_string_gi_next_strand_rep(tc, gi);
293
1.57k
        }
294
6.64k
    }
295
5
    else {
296
5
        MVMStringIndex result_pos = 0;
297
5
        result->body.storage_type            = MVM_STRING_GRAPHEME_32;
298
5
        result32 = result->body.storage.blob_32 =
299
5
            result_graphs ? MVM_malloc(result_graphs * sizeof(MVMGrapheme32)) : NULL;
300
20.0k
        while (1) {
301
20.0k
            MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, gi);
302
20.0k
            MVMStringIndex to_copy = result_graphs - result_pos < strand_len
303
0
                ? result_graphs - result_pos
304
20.0k
                : strand_len;
305
20.0k
            switch (MVM_string_gi_blob_type(tc, gi)) {
306
20.0k
                case MVM_STRING_GRAPHEME_8:
307
20.0k
                case MVM_STRING_GRAPHEME_ASCII: {
308
20.0k
                    MVMGrapheme8  *active_blob =
309
20.0k
                        MVM_string_gi_active_blob_8_pos(tc, gi);
310
20.0k
                    MVMGrapheme32 *result_blob32 = result32 + result_pos;
311
20.0k
                    MVMStringIndex i;
312
20.0k
                    MVM_VECTORIZE_LOOP
313
540k
                    for (i = 0; i < to_copy; i++) {
314
520k
                        result_blob32[i] = active_blob[i];
315
520k
                    }
316
20.0k
                    break;
317
20.0k
                }
318
5
                case MVM_STRING_GRAPHEME_32: {
319
5
                    memcpy(
320
5
                        result32 + result_pos,
321
5
                        MVM_string_gi_active_blob_32_pos(tc, gi),
322
5
                        to_copy * sizeof(MVMGrapheme32)
323
5
                    );
324
5
                    break;
325
20.0k
                }
326
0
                default:
327
0
                    MVM_exception_throw_adhoc(tc,
328
0
                        "Internal error, string corruption in iterate_gi_into_string\n");
329
20.0k
            }
330
20.0k
            result_pos += to_copy;
331
20.0k
            if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) {
332
5
                break;
333
5
            }
334
20.0k
            MVM_string_gi_next_strand_rep(tc, gi);
335
20.0k
        }
336
5
    }
337
6.64k
}
338
#define copy_strands_memcpy(BLOB_TYPE, SIZEOF_TYPE, STORAGE_TYPE) { \
339
    result->body.storage.BLOB_TYPE = MVM_malloc(sizeof(SIZEOF_TYPE) * MVM_string_graphs_nocheck(tc, orig)); \
340
    for (i = 0; i < orig->body.num_strands; i++) { \
341
        size_t graphs_this_strand =  orig->body.storage.strands[i].end - orig->body.storage.strands[i].start; \
342
        /* If it's 8bit format and there's only one grapheme */ \
343
        if ((STORAGE_TYPE == MVM_STRING_GRAPHEME_ASCII || STORAGE_TYPE == MVM_STRING_GRAPHEME_8) && graphs_this_strand == 1) { \
344
            /* If there are not repetitions we can directly set the grapheme */ \
345
            if (!orig->body.storage.strands[i].repetitions) \
346
                result->body.storage.BLOB_TYPE[graphs_so_far] = orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start]; \
347
            /* Otherwise, use memset for the correct number of repetitions */ \
348
            else { \
349
                graphs_this_strand += orig->body.storage.strands[i].repetitions; \
350
                memset(graphs_so_far + result->body.storage.BLOB_TYPE, \
351
                    orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start], \
352
                    graphs_this_strand \
353
                ); \
354
            } \
355
            graphs_so_far += graphs_this_strand; \
356
        } \
357
        else { \
358
            int j = 0; \
359
            for (; j <= orig->body.storage.strands[i].repetitions; j++) { \
360
                memcpy(graphs_so_far + result->body.storage.BLOB_TYPE, \
361
                    orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE + orig->body.storage.strands[i].start, \
362
                    sizeof(SIZEOF_TYPE) * graphs_this_strand \
363
                ); \
364
                graphs_so_far += graphs_this_strand; \
365
            } \
366
        } \
367
    } \
368
}
369
/* Collapses a bunch of strands into a single blob string. */
370
22.0k
static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) {
371
22.0k
    MVMString      *result = NULL;
372
22.0k
    size_t graphs_so_far = 0;
373
22.0k
374
22.0k
    /* If it's not a strand, just return it */
375
22.0k
    if (orig->body.storage_type != MVM_STRING_STRAND)
376
0
        return orig;
377
22.0k
    /* If the original string is a STRAND and all the composite strands are
378
22.0k
     * of the same type, then we will collapse it using memcpy instead of
379
22.0k
     * using a grapheme iterator. */
380
22.0k
    else {
381
22.0k
        size_t i;
382
22.0k
        MVMint32 common_storage_type = orig->body.storage.strands[0].blob_string->body.storage_type;
383
22.0k
        MVMROOT(tc, orig, {
384
22.0k
            result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
385
22.0k
            result->body.num_graphs = MVM_string_graphs(tc, orig);
386
22.0k
            for (i = 1; i < orig->body.num_strands; i++) {
387
22.0k
                if (common_storage_type != orig->body.storage.strands[i].blob_string->body.storage_type) {
388
22.0k
                    common_storage_type = -1;
389
22.0k
                    break;
390
22.0k
                }
391
22.0k
            }
392
22.0k
            result->body.storage_type = common_storage_type;
393
22.0k
            switch (common_storage_type) {
394
22.0k
                case MVM_STRING_GRAPHEME_32:
395
22.0k
                    copy_strands_memcpy(blob_32, MVMGrapheme32, MVM_STRING_GRAPHEME_32);
396
22.0k
                    break;
397
22.0k
                case MVM_STRING_GRAPHEME_ASCII:
398
22.0k
                case MVM_STRING_GRAPHEME_8:
399
22.0k
                    copy_strands_memcpy(blob_8, MVMGrapheme8, MVM_STRING_GRAPHEME_8);
400
22.0k
                    break;
401
22.0k
                default: {
402
22.0k
                    MVMGraphemeIter gi;
403
22.0k
                    MVM_string_gi_init(tc, &gi, orig);
404
22.0k
                    iterate_gi_into_string(tc, &gi, result, orig, 0);
405
22.0k
                }
406
22.0k
            }
407
22.0k
        });
408
22.0k
    }
409
22.0k
#if (MVM_DEBUG_STRANDS || MVM_DEBUG_NFG)
410
    if (!MVM_string_equal(tc, result, orig))
411
        MVM_exception_throw_adhoc(tc, "result and original were not eq in collapse_strands");
412
#endif
413
22.0k
    return result;
414
22.0k
}
415
416
/* Takes a string that is no longer in NFG form after some concatenation-style
417
 * operation, and returns a new string that is in NFG. Note that we could do a
418
 * much, much, smarter thing in the future that doesn't involve all of this
419
 * copying and allocation and re-doing the whole string, but cases like this
420
 * should be fairly rare anyway, so go with simplicity for now. */
421
13
static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
422
13
    MVMNormalizer norm;
423
13
    MVMCodepointIter ci;
424
13
    MVMint32 ready;
425
13
    MVMString *out = NULL;
426
13
    MVMuint32 bufsize = in->body.num_graphs;
427
13
428
13
    /* Create the output buffer. We used to believe it can't ever be bigger
429
13
     * than the initial estimate, but utf8-c8 showed us otherwise. */
430
13
    MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
431
13
    MVMint64 out_pos = 0;
432
13
    /* Iterate codepoints and normalizer. */
433
13
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
434
13
    /* Codepoint iterator that passes back utf8-c8 graphemes unchanged */
435
13
    MVM_string_ci_init(tc, &ci, in, 0, 1);
436
107
    while (MVM_string_ci_has_more(tc, &ci)) {
437
94
        MVMGrapheme32 g;
438
94
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
439
94
        if (ready) {
440
65
            if (out_pos + ready > bufsize) {
441
0
                /* Doubling up the buffer size seems excessive, so just
442
0
                 * add a generous amount of storage */
443
0
                bufsize += ready + 32;
444
0
                out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
445
0
            }
446
65
            out_buffer[out_pos++] = g;
447
77
            while (--ready > 0) {
448
12
                out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
449
12
            }
450
65
        }
451
94
    }
452
13
    MVM_unicode_normalizer_eof(tc, &norm);
453
13
    ready = MVM_unicode_normalizer_available(tc, &norm);
454
13
    if (out_pos + ready > bufsize) {
455
0
        bufsize += ready + 1;
456
0
        out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
457
0
    }
458
18
    while (ready--) {
459
5
        out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
460
5
    }
461
13
    MVM_unicode_normalizer_cleanup(tc, &norm);
462
13
463
13
    /* Build result string. */
464
13
    out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
465
13
    out->body.storage.blob_32 = out_buffer;
466
13
    out->body.storage_type    = MVM_STRING_GRAPHEME_32;
467
13
    out->body.num_graphs      = out_pos;
468
13
    return out;
469
13
}
470
471
/* Returns nonzero if two substrings are equal, doesn't check bounds */
472
MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a,
473
16.9M
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
474
16.9M
    MVMint64 i;
475
16.9M
476
16.9M
    /* Fast paths when storage types are identical. */
477
16.9M
    switch (a->body.storage_type) {
478
1.68M
        case MVM_STRING_GRAPHEME_32:
479
1.68M
            if (b->body.storage_type == MVM_STRING_GRAPHEME_32)
480
34.9k
                return 0 == memcmp(
481
34.9k
                    a->body.storage.blob_32 + starta,
482
34.9k
                    b->body.storage.blob_32 + startb,
483
34.9k
                    length * sizeof(MVMGrapheme32));
484
1.65M
            break;
485
15.0M
        case MVM_STRING_GRAPHEME_ASCII:
486
15.0M
        case MVM_STRING_GRAPHEME_8:
487
15.0M
            if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII ||
488
15.0M
                    b->body.storage_type == MVM_STRING_GRAPHEME_8)
489
14.8M
                return 0 == memcmp(
490
14.8M
                    a->body.storage.blob_8 + starta,
491
14.8M
                    b->body.storage.blob_8 + startb,
492
14.8M
                    length);
493
179k
            break;
494
16.9M
    }
495
16.9M
496
16.9M
    /* If both are flat, use MVM_string_get_grapheme_at_nocheck on both for speed */
497
2.07M
    if (a->body.storage_type != MVM_STRING_STRAND && b->body.storage_type != MVM_STRING_STRAND) {
498
1.39M
        for (i = 0; i < length; i++)
499
1.09M
            if (MVM_string_get_grapheme_at_nocheck(tc, a, starta + i)
500
1.09M
             != MVM_string_get_grapheme_at_nocheck(tc, b, startb + i))
501
74.7k
                return 0;
502
302k
        return 1;
503
377k
    }
504
1.69M
    else if (a->body.storage_type == MVM_STRING_STRAND && b->body.storage_type == MVM_STRING_STRAND) {
505
77.5k
        MVMGraphemeIter gia, gib;
506
77.5k
        /* Normal path, for the rest of the time. */
507
77.5k
        MVM_string_gi_init(tc, &gia, a);
508
77.5k
        MVM_string_gi_init(tc, &gib, b);
509
77.5k
        /* Move the grapheme iterator if start is not 0 */
510
77.5k
        if (starta) MVM_string_gi_move_to(tc, &gia, starta);
511
77.5k
        if (startb) MVM_string_gi_move_to(tc, &gib, startb);
512
3.55M
        for (i = 0; i < length; i++)
513
3.47M
            if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib))
514
121
                return 0;
515
77.4k
        return 1;
516
77.5k
    }
517
1.61M
    else {
518
1.61M
        MVMGraphemeIter gi_y;
519
1.61M
        MVMString *y = NULL, *z = NULL;
520
1.61M
        MVMint64 starty, startz;
521
1.61M
        if (a->body.storage_type == MVM_STRING_STRAND) {
522
165k
                 y = a;           z = b;
523
165k
            starty = starta; startz = startb;
524
165k
        }
525
1.45M
        else {
526
1.45M
                 y = b;           z = a;
527
1.45M
            starty = startb; startz = starta;
528
1.45M
        }
529
1.61M
        MVM_string_gi_init(tc, &gi_y, y);
530
1.61M
        if (starty) MVM_string_gi_move_to(tc, &gi_y, starty);
531
5.72M
        for (i = 0; i < length; i++)
532
4.47M
            if (MVM_string_gi_get_grapheme(tc, &gi_y) != MVM_string_get_grapheme_at_nocheck(tc, z, startz + i))
533
372k
                return 0;
534
1.24M
        return 1;
535
1.61M
    }
536
2.07M
}
537
183k
static MVMint64 MVM_string_memmem_grapheme32 (MVMThreadContext *tc, MVMGrapheme32 *H_blob32, MVMGrapheme32 *n_blob32, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) {
538
183k
    MVMGrapheme32 * rtrn = memmem_uint32(H_blob32 + H_start, H_graphs - H_start, n_blob32, n_graphs);
539
130k
    return rtrn == NULL ? -1 : rtrn - H_blob32;
540
183k
}
541
22.3k
static MVMint64 MVM_string_memmem_grapheme32str (MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) {
542
22.3k
    MVMGrapheme32 *needle_buf = NULL;
543
22.3k
    MVMint64 rtrn;
544
22.3k
    if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) {
545
4.09k
        MVMStringIndex i;
546
4.09k
        MVMGraphemeIter n_gi;
547
4.09k
        needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme32));
548
4.09k
        if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) MVM_string_gi_init(tc, &n_gi, needle);
549
10.7k
        for (i = 0; i < needle->body.num_graphs; i++) {
550
6.60k
            needle_buf[i] = needle->body.storage_type == MVM_STRING_GRAPHEME_8 ? needle->body.storage.blob_8[i] : MVM_string_gi_get_grapheme(tc, &n_gi);
551
6.60k
        }
552
4.09k
    }
553
18.2k
    rtrn = MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, needle_buf ? needle_buf : needle->body.storage.blob_32, H_start, H_graphs, n_graphs);
554
22.3k
    if (needle_buf) MVM_free(needle_buf);
555
22.3k
    return rtrn;
556
22.3k
}
557
/* Returns the location of one string in another or -1  */
558
500k
MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
559
500k
    size_t index           = (size_t)start;
560
500k
    MVMStringIndex H_graphs, n_graphs;
561
500k
    MVM_string_check_arg(tc, Haystack, "index search target");
562
500k
    MVM_string_check_arg(tc,   needle, "index search term");
563
500k
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
564
500k
    n_graphs = MVM_string_graphs_nocheck(tc, needle);
565
500k
566
500k
    if (!n_graphs)
567
52
        return start <= H_graphs ? start : -1; /* the empty string is in any other string */
568
500k
569
500k
    if (!H_graphs)
570
41
        return -1;
571
500k
572
500k
    if (start < 0 || H_graphs <= start)
573
148
        return -1;
574
500k
575
499k
    if (H_graphs < n_graphs || n_graphs < 1)
576
3
        return -1;
577
499k
578
499k
    /* Fast paths when storage types are identical. Uses memmem function, which
579
499k
     * uses Knuth-Morris-Pratt algorithm on Linux and on others
580
499k
     * Crochemore+Perrin two-way string matching */
581
499k
    switch (Haystack->body.storage_type) {
582
22.3k
        case MVM_STRING_GRAPHEME_32:
583
22.3k
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_32 || needle->body.num_graphs < 100) {
584
22.3k
                return MVM_string_memmem_grapheme32str(tc, Haystack, needle, start, H_graphs, n_graphs);
585
22.3k
            }
586
0
            break;
587
438k
        case MVM_STRING_GRAPHEME_8:
588
438k
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_8 || needle->body.num_graphs < 100) {
589
438k
                void         *mm_return_8 = NULL;
590
438k
                MVMGrapheme8 *needle_buf  = NULL;
591
438k
                if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) {
592
9.01k
                    MVMStringIndex i;
593
9.01k
                    MVMGraphemeIter n_gi;
594
9.01k
                    needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme8));
595
9.01k
                    if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) MVM_string_gi_init(tc, &n_gi, needle);
596
18.0k
                    for (i = 0; i < needle->body.num_graphs; i++) {
597
9.01k
                        MVMGrapheme32 g = needle->body.storage_type == MVM_STRING_GRAPHEME_32
598
0
                            ? needle->body.storage.blob_32[i]
599
9.01k
                            : MVM_string_gi_get_grapheme(tc, &n_gi);
600
9.01k
                        /* Haystack is 8 bit, needle is 32 bit. if we encounter a non8bit grapheme
601
9.01k
                         * it's impossible to match */
602
9.01k
                        if (!can_fit_into_8bit(g)) {
603
0
                            MVM_free(needle_buf);
604
0
                            return -1;
605
0
                        }
606
9.01k
                        needle_buf[i] = g;
607
9.01k
                    }
608
9.01k
                }
609
438k
                mm_return_8 = MVM_memmem(
610
438k
                    Haystack->body.storage.blob_8 + start, /* start position */
611
438k
                    (H_graphs - start) * sizeof(MVMGrapheme8), /* length of Haystack from start position to end */
612
429k
                    needle_buf ? needle_buf : needle->body.storage.blob_8, /* needle start */
613
438k
                    n_graphs * sizeof(MVMGrapheme8) /* needle length */
614
438k
                );
615
438k
                if (needle_buf) MVM_free(needle_buf);
616
438k
                if (mm_return_8 == NULL)
617
366k
                    return -1;
618
438k
                else
619
71.9k
                    return (MVMGrapheme8*)mm_return_8 -  Haystack->body.storage.blob_8;
620
438k
            }
621
0
            break;
622
499k
    }
623
499k
    /* Minimal code version for needles of size 1 */
624
39.0k
    if (n_graphs == 1) {
625
29.7k
        MVMGraphemeIter H_gi;
626
29.7k
        MVMGrapheme32 n_g = MVM_string_get_grapheme_at_nocheck(tc, needle, 0);
627
29.7k
        MVM_string_gi_init(tc, &H_gi, Haystack);
628
29.7k
        if (index) MVM_string_gi_move_to(tc, &H_gi, index);
629
3.47M
        while (index < H_graphs) {
630
3.46M
            if (n_g == MVM_string_gi_get_grapheme(tc, &H_gi))
631
11.9k
                return (MVMint64)index;
632
3.44M
            index++;
633
3.44M
        }
634
29.7k
    }
635
9.29k
    else if (n_graphs <= MVM_string_KMP_max_pattern_length)
636
9.29k
        return knuth_morris_pratt_string_index(tc, needle, Haystack, start);
637
0
    else {
638
0
        int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0;
639
0
        void *Hs_or_gic = Haystack;
640
0
        /* If Haystack is a strand allocate space for a MVMGraphemeIter_cached
641
0
         * and initialize it */
642
0
        if (is_gic) {
643
0
            Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached));
644
0
            MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start);
645
0
        }
646
0
        /* For needles > MVM_string_KMP_max_pattern_length we must revert to brute force for now.
647
0
         * Eventually we can implement brute force after it matches the whole needle OR
648
0
         * allocate more space for the pattern on reaching the end of the pattern */
649
0
        while (index <= H_graphs - n_graphs) {
650
0
            if (string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle, index, H_graphs, n_graphs, 0, 0, is_gic) != -1)
651
0
                return (MVMint64)index;
652
0
            index++;
653
0
        }
654
0
    }
655
17.7k
    return -1;
656
39.0k
}
657
658
/* Returns the location of one string in another or -1  */
659
16
MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
660
16
    MVMint64 result = -1;
661
16
    size_t index;
662
16
    MVMStringIndex H_graphs, n_graphs;
663
16
664
16
    MVM_string_check_arg(tc, Haystack, "rindex search target");
665
16
    MVM_string_check_arg(tc, needle, "rindex search term");
666
16
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
667
16
    n_graphs = MVM_string_graphs_nocheck(tc, needle);
668
16
669
16
    if (!n_graphs) {
670
8
        if (0 <= start)
671
7
            return start <= H_graphs ? start : -1; /* the empty string is in any other string */
672
8
        else
673
1
            return H_graphs; /* no start, so return end */
674
8
    }
675
16
676
8
    if (!H_graphs)
677
0
        return -1;
678
8
679
8
    if (H_graphs < n_graphs || n_graphs < 1)
680
0
        return -1;
681
8
682
8
    if (start == -1)
683
2
        start = H_graphs - n_graphs;
684
8
685
8
    if (start < 0 || H_graphs <= start)
686
8
        /* maybe return -1 instead? */
687
0
        MVM_exception_throw_adhoc(tc, "index start offset out of range");
688
8
689
8
    index = start;
690
8
691
8
    if (H_graphs < index + n_graphs) {
692
1
        index = H_graphs - n_graphs;
693
1
    }
694
8
695
8
    /* brute force for now. horrible, yes. halp. */
696
19
    do {
697
19
        if (MVM_string_substrings_equal_nocheck(tc, needle, 0, n_graphs, Haystack, index)) {
698
6
            result = (MVMint64)index;
699
6
            break;
700
6
        }
701
13
    } while (0 < index--);
702
8
    return result;
703
8
}
704
705
/* Returns a substring of the given string */
706
343k
MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) {
707
343k
    MVMString *result;
708
343k
    MVMint64   start_pos, end_pos;
709
343k
710
343k
    MVMint64 agraphs;
711
343k
712
343k
    MVM_string_check_arg(tc, a, "substring");
713
343k
    /* convert to signed to avoid implicit arithmetic conversions */
714
343k
    agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a);
715
343k
716
343k
    /* -1 signifies go to the end of the string; anything less is a bug */
717
343k
    if (length < -1)
718
0
        MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length);
719
343k
720
343k
    /* negative offsets count from the end */
721
343k
    start_pos = offset < 0 ? offset + agraphs : offset;
722
303k
    end_pos   = length == -1 ? agraphs : start_pos + length;
723
343k
724
343k
    /* return an empty string if start_pos is out of bounds but positive */
725
343k
    if (agraphs < start_pos) {
726
0
        start_pos = 0;
727
0
        end_pos   = 0;
728
0
    }
729
343k
730
343k
    if (end_pos < 0)
731
0
        MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos);
732
343k
733
343k
    /* Ensure we're within bounds. */
734
343k
    if (start_pos < 0)
735
0
        start_pos = 0;
736
343k
    if (agraphs < end_pos)
737
841
        end_pos = agraphs;
738
343k
739
343k
    /* Check trivial cases: empty string and whole string. */
740
343k
    if (start_pos == end_pos)
741
11.6k
        return tc->instance->str_consts.empty;
742
331k
    if (start_pos == 0 && end_pos == agraphs)
743
19.6k
        return a;
744
331k
745
331k
    /* Construct a result; how we efficiently do so will vary based on the
746
331k
     * input string. */
747
312k
    MVMROOT(tc, a, {
748
312k
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
749
312k
        result->body.num_graphs = end_pos - start_pos;
750
312k
        if (a->body.storage_type != MVM_STRING_STRAND) {
751
312k
            /* It's some kind of buffer. Construct a strand view into it. */
752
312k
            result->body.storage_type    = MVM_STRING_STRAND;
753
312k
            result->body.storage.strands = allocate_strands(tc, 1);
754
312k
            result->body.num_strands     = 1;
755
312k
            result->body.storage.strands[0].blob_string = a;
756
312k
            result->body.storage.strands[0].start       = start_pos;
757
312k
            result->body.storage.strands[0].end         = end_pos;
758
312k
            result->body.storage.strands[0].repetitions = 0;
759
312k
        }
760
312k
        else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
761
312k
            /* Single strand string; quite possibly already a substring. We'll
762
312k
             * just produce an updated view. */
763
312k
            MVMStringStrand *orig_strand = &(a->body.storage.strands[0]);
764
312k
            result->body.storage_type    = MVM_STRING_STRAND;
765
312k
            result->body.storage.strands = allocate_strands(tc, 1);
766
312k
            result->body.num_strands     = 1;
767
312k
            result->body.storage.strands[0].blob_string = orig_strand->blob_string;
768
312k
            result->body.storage.strands[0].start       = orig_strand->start + start_pos;
769
312k
            result->body.storage.strands[0].end         = orig_strand->start + end_pos;
770
312k
            result->body.storage.strands[0].repetitions = 0;
771
312k
        }
772
312k
        else {
773
312k
            /* Produce a new blob string, collapsing the strands. */
774
312k
            MVMGraphemeIter gi;
775
312k
            MVM_string_gi_init(tc, &gi, a);
776
312k
            MVM_string_gi_move_to(tc, &gi, start_pos);
777
312k
            iterate_gi_into_string(tc, &gi, result, a, start_pos);
778
312k
        }
779
312k
    });
780
312k
781
312k
    STRAND_CHECK(tc, result);
782
312k
    return result;
783
331k
}
784
785
1
MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) {
786
1
    /* XXX this could probably be done more efficiently directly. */
787
1
    MVMString *first_part = NULL;
788
1
    MVMString *rest_part  = NULL;
789
1
    MVMString *result     = NULL;
790
1
791
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement);
792
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&original);
793
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part);
794
1
    first_part = MVM_string_substring(tc, original, 0, start);
795
1
796
1
    rest_part  = MVM_string_substring(tc, original, start + count, -1);
797
1
    rest_part  = MVM_string_concatenate(tc, replacement, rest_part);
798
1
    result     = MVM_string_concatenate(tc, first_part, rest_part);
799
1
800
1
801
1
    STRAND_CHECK(tc, result);
802
1
    NFG_CHECK(tc, result, "MVM_string_replace");
803
1
    MVM_gc_root_temp_pop_n(tc, 3);
804
1
    return result;
805
1
}
806
807
144
static MVMString * string_from_strand_at_index(MVMThreadContext *tc, MVMString *a, MVMuint16 index) {
808
144
    MVMStringStrand *ss = &(a->body.storage.strands[index]);
809
144
    return MVM_string_substring(tc, ss->blob_string, ss->start, ss->end - ss->start);
810
144
}
811
812
1.39M
static MVMuint16 final_strand_match_with_repetition_count(MVMThreadContext *tc, MVMString *a, MVMString *b) {
813
1.39M
    if (a->body.storage_type == MVM_STRING_STRAND) {
814
1.29M
        MVMStringStrand *sa = &(a->body.storage.strands[a->body.num_strands - 1]);
815
1.29M
        /* If the final strand of a eq b, we'll just increment the final strand of a's repetitions. */
816
1.29M
        if (sa->end - sa->start == MVM_string_graphs_nocheck(tc, b)) {
817
1.23M
            if (MVM_string_equal_at(tc, sa->blob_string, b, sa->start))
818
1.20M
                return 1;
819
1.23M
        }
820
1.29M
        /* If the final strand of a eq the first (and only) strand of b, we'll just add b's repetitions
821
1.29M
   * (plus 1 for the strand itself) to the final strand of a's repetitions. */
822
62.4k
        else if (b->body.storage_type == MVM_STRING_STRAND && b->body.num_strands == 1) {
823
8.38k
            MVMStringStrand *sb = &(b->body.storage.strands[0]);
824
8.38k
            if (sa->end - sa->start == sb->end - sb->start)
825
72
                if (MVM_string_equal(tc, string_from_strand_at_index(tc, a, a->body.num_strands - 1), string_from_strand_at_index(tc, b, 0)))
826
0
                    return b->body.storage.strands[0].repetitions + 1;
827
8.38k
  }
828
1.29M
    }
829
196k
    return 0;
830
1.39M
}
831
832
/* Append one string to another. */
833
1.40M
MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) {
834
1.40M
    MVMString *result = NULL, *renormalized_section = NULL;
835
1.40M
    int renormalized_section_graphs = 0, consumed_a = 0, consumed_b = 0;
836
1.40M
    MVMuint32  agraphs, bgraphs;
837
1.40M
    MVMuint64  total_graphs;
838
1.40M
    int lost_strands          = 0;
839
1.40M
    int is_concat_stable      = 0;
840
1.40M
    int index_ss_b;
841
1.40M
    MVMuint16 matching_repetition_count;
842
1.40M
    MVM_string_check_arg(tc, a, "concatenate");
843
1.40M
    MVM_string_check_arg(tc, b, "concatenate");
844
1.40M
845
1.40M
    /* Simple empty-string cases. */
846
1.40M
    agraphs = MVM_string_graphs_nocheck(tc, a);
847
1.40M
    if (agraphs == 0)
848
11.0k
        return b;
849
1.39M
    bgraphs = MVM_string_graphs_nocheck(tc, b);
850
1.39M
    if (bgraphs == 0)
851
356
        return a;
852
1.39M
853
1.39M
    is_concat_stable = MVM_nfg_is_concat_stable(tc, a, b);
854
1.39M
855
1.39M
    /* If is_concat_stable equals 0 and a and b are not repetitions. */
856
1.39M
    if (is_concat_stable == 0 && !(a->body.storage_type == MVM_STRING_STRAND && a->body.storage.strands[a->body.num_strands - 1].repetitions)
857
46
    && !(b->body.storage_type == MVM_STRING_STRAND && b->body.storage.strands[0].repetitions)) {
858
46
        MVMCodepoint last_a_first_b[2] = {
859
46
            MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1),
860
46
            MVM_string_get_grapheme_at_nocheck(tc, b, 0)
861
46
        };
862
46
        MVMROOT2(tc, a, b, {
863
46
        /* If both are not synthetics, we can can pass those values unchanged
864
46
         * instead of iterating by codepoint */
865
46
        if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) {
866
46
            renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2);
867
46
            consumed_a = 1; consumed_b = 1;
868
46
        }
869
46
        else {
870
46
            MVMCodepointIter last_a_ci;
871
46
            MVMCodepointIter first_b_ci;
872
46
            MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci,  last_a_first_b[0], 1);
873
46
            MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1], 1);
874
46
            /* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */
875
46
            MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint));
876
46
            MVMuint32 i = 0;
877
46
            for (; MVM_string_grapheme_ci_has_more(tc, &last_a_ci); i++) {
878
46
                last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &last_a_ci);
879
46
            }
880
46
            for (; MVM_string_grapheme_ci_has_more(tc, &first_b_ci); i++) {
881
46
                last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &first_b_ci);
882
46
            }
883
46
            renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b_codes, a_codes + b_codes);
884
46
            consumed_a = 1; consumed_b = 1;
885
46
        }
886
46
        });
887
46
        if (renormalized_section) {
888
46
            if (agraphs == consumed_a && bgraphs == consumed_b) {
889
17
                NFG_CHECK_CONCAT(tc, renormalized_section, a, b, "renormalized_section");
890
17
                return renormalized_section;
891
17
            }
892
29
            renormalized_section_graphs = MVM_string_graphs_nocheck(tc, renormalized_section);
893
29
        }
894
46
    }
895
1.39M
    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
896
1.39M
    total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs;
897
1.39M
    if (MAX_GRAPHEMES < total_graphs)
898
0
        MVM_exception_throw_adhoc(tc,
899
0
            "Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %lld",
900
0
             total_graphs, MAX_GRAPHEMES);
901
1.39M
902
1.39M
    /* Otherwise, we'll assemble a result string. */
903
1.39M
    MVMROOT4(tc, a, b, renormalized_section, result, {
904
1.39M
905
1.39M
        /* Allocate it. */
906
1.39M
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
907
1.39M
908
1.39M
        /* Total graphemes is trivial; just total up inputs. */
909
1.39M
        result->body.num_graphs = (MVMuint32)total_graphs;
910
1.39M
911
1.39M
        /* Result string will be made of strands. */
912
1.39M
        result->body.storage_type = MVM_STRING_STRAND;
913
1.39M
914
1.39M
        /* Detect the wonderful case where we're repeatedly concating the same
915
1.39M
         * string again and again, and thus can just bump a repetition. */
916
1.39M
        if (is_concat_stable == 1 && (matching_repetition_count = final_strand_match_with_repetition_count(tc, a, b))) {
917
1.39M
            /* We have it; just copy the strands to a new string and bump the
918
1.39M
             * repetitions count of the last one. */
919
1.39M
            result->body.storage.strands = allocate_strands(tc, a->body.num_strands);
920
1.39M
            copy_strands(tc, a, 0, result, 0, a->body.num_strands);
921
1.39M
            result->body.storage.strands[a->body.num_strands - 1].repetitions += matching_repetition_count;
922
1.39M
            result->body.num_strands = a->body.num_strands;
923
1.39M
        }
924
1.39M
925
1.39M
        /* Otherwise, construct a new strand string. */
926
1.39M
        else {
927
1.39M
            /* See if we have too many strands between the two. If so, we will
928
1.39M
             * collapse the biggest side. */
929
1.39M
            MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND
930
1.39M
                ? a->body.num_strands
931
1.39M
                : 1;
932
1.39M
            MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND
933
1.39M
                ? b->body.num_strands
934
1.39M
                : 1;
935
1.39M
            MVMString *effective_a = a;
936
1.39M
            MVMString *effective_b = b;
937
1.39M
            if (MVM_STRING_MAX_STRANDS < strands_a + strands_b) {
938
1.39M
                MVMROOT(tc, result, {
939
1.39M
                    if (strands_b <= strands_a) {
940
1.39M
                        effective_a = collapse_strands(tc, effective_a);
941
1.39M
                        strands_a   = 1;
942
1.39M
                    }
943
1.39M
                    else {
944
1.39M
                        effective_b = collapse_strands(tc, effective_b);
945
1.39M
                        strands_b   = 1;
946
1.39M
                    }
947
1.39M
                });
948
1.39M
            }
949
1.39M
            /* Assemble the result. */
950
1.39M
            result->body.num_strands = strands_a + strands_b + (renormalized_section_graphs ? 1 : 0);
951
1.39M
            result->body.storage.strands = allocate_strands(tc, result->body.num_strands);
952
1.39M
            /* START 1 */
953
1.39M
            if (effective_a->body.storage_type == MVM_STRING_STRAND) {
954
1.39M
                copy_strands(tc, effective_a, 0, result, 0, strands_a);
955
1.39M
            }
956
1.39M
            else {
957
1.39M
                int index_ss_a = 0;
958
1.39M
                MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
959
1.39M
                ss_a->blob_string = effective_a;
960
1.39M
                ss_a->start       = 0;
961
1.39M
                ss_a->end         = effective_a->body.num_graphs;
962
1.39M
                ss_a->repetitions = 0;
963
1.39M
            }
964
1.39M
            if (renormalized_section) {
965
1.39M
                int index_ss_re;
966
1.39M
                int index_ss_a = strands_a - 1;
967
1.39M
                /* Tweak the end of the last strand of string a. Since if a is made up of multiple strands, we can't just refer to index 0 and instead erfer to strands_a - 1 */
968
1.39M
                MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
969
1.39M
                MVMStringStrand *ss_re = NULL;
970
1.39M
                ss_a->end -= consumed_a;
971
1.39M
                /* If the strands ends up to be zero length we need to reduce the number of strand_index and also incease lost_strands so the next operation writes over it */
972
1.39M
                if (ss_a->start == ss_a->end)
973
1.39M
                    lost_strands++;
974
1.39M
            /* END 1 */
975
1.39M
            /* START 1.5 (only triggered in some cases) */
976
1.39M
                index_ss_re = strands_a - lost_strands;
977
1.39M
                ss_re = &(result->body.storage.strands[index_ss_re]);
978
1.39M
979
1.39M
                /* Add the renormalized section in as a strand */
980
1.39M
                ss_re->blob_string = renormalized_section;
981
1.39M
                ss_re->start       = 0;
982
1.39M
                ss_re->end         = renormalized_section->body.num_graphs;
983
1.39M
                ss_re->repetitions = 0;
984
1.39M
                if (ss_re->start == ss_re->end) {
985
1.39M
                    MVM_exception_throw_adhoc(tc, "Unexpected error in concatenation: renormalized_section is 0 graphemes.\n");
986
1.39M
                    /* renormalized_section should always be at least one grapheme
987
1.39M
                     * in length so throw if it does not (zero length is an error
988
1.39M
                     * we shouldn't lost_strands++ unlike the other strands */
989
1.39M
                }
990
1.39M
            /* END 1.5 */
991
1.39M
            }
992
1.39M
            /* START 2 */
993
1.39M
            index_ss_b = strands_a - lost_strands + (renormalized_section_graphs ? 1 : 0 );
994
1.39M
            if (effective_b->body.storage_type == MVM_STRING_STRAND) {
995
1.39M
                copy_strands(tc, effective_b, 0, result, index_ss_b, strands_b);
996
1.39M
            }
997
1.39M
            else {
998
1.39M
                MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
999
1.39M
                ss_b->blob_string = effective_b;
1000
1.39M
                ss_b->start       = 0;
1001
1.39M
                ss_b->end         = effective_b->body.num_graphs;
1002
1.39M
                ss_b->repetitions = 0;
1003
1.39M
            }
1004
1.39M
            if (renormalized_section_graphs) {
1005
1.39M
                /* Tweak the beginning of the first strand of string b */
1006
1.39M
                MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
1007
1.39M
                ss_b->start += consumed_b;
1008
1.39M
                if (ss_b->start == ss_b->end) {
1009
1.39M
                    lost_strands++;
1010
1.39M
                    move_strands(tc, result, index_ss_b + 1, result, index_ss_b, strands_b - 1);
1011
1.39M
                }
1012
1.39M
            /* END 2 */
1013
1.39M
            /* Adjust result->num_strands */
1014
1.39M
                if (lost_strands)
1015
1.39M
                    result->body.num_strands -= lost_strands;
1016
1.39M
                /* Adjust result->num_graphs */
1017
1.39M
                result->body.num_graphs += renormalized_section_graphs - consumed_b - consumed_a;
1018
1.39M
            }
1019
1.39M
        }
1020
1.39M
    STRAND_CHECK(tc, result);
1021
1.39M
    if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section))
1022
1.39M
        NFG_CHECK_CONCAT(tc, result, a, b, "'result'");
1023
1.39M
    });
1024
1.39M
    if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section))
1025
1.39M
        return result;
1026
1.39M
    /* If it's regional indicator (is_concat_stable == 2) */
1027
8
    return re_nfg(tc, result);
1028
1.39M
}
1029
1030
445
MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) {
1031
445
    MVMString *result = NULL;
1032
445
    MVMuint32  agraphs;
1033
445
    MVMuint64  total_graphs;
1034
445
1035
445
    MVM_string_check_arg(tc, a, "repeat");
1036
445
1037
445
    /* Validate count; handle common cases. */
1038
445
    if (count == 0)
1039
9
        return tc->instance->str_consts.empty;
1040
436
    if (count == 1)
1041
22
        return a;
1042
414
    if (count < 0)
1043
0
        MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be negative", count);
1044
414
    if (MAX_GRAPHEMES < count)
1045
0
        MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be greater than max allowed number of graphemes %lld", count, MAX_GRAPHEMES);
1046
414
1047
414
    /* If input string is empty, repeating it is empty. */
1048
414
    agraphs = MVM_string_graphs_nocheck(tc, a);
1049
414
    if (agraphs == 0)
1050
0
        return tc->instance->str_consts.empty;
1051
414
1052
414
    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
1053
414
    total_graphs = (MVMuint64)agraphs * (MVMuint64)count;
1054
414
    if (MAX_GRAPHEMES < total_graphs)
1055
0
        MVM_exception_throw_adhoc(tc,
1056
0
            "Can't repeat string, required number of graphemes (%"PRIu32" * %"PRIu64") greater than max allowed of %lld",
1057
0
             agraphs, count, MAX_GRAPHEMES);
1058
414
1059
414
    /* Now build a result string with the repetition set. */
1060
414
    MVMROOT(tc, a, {
1061
414
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1062
414
        result->body.num_graphs      = agraphs * count;
1063
414
        result->body.storage_type    = MVM_STRING_STRAND;
1064
414
        result->body.storage.strands = allocate_strands(tc, 1);
1065
414
        if (a->body.storage_type == MVM_STRING_STRAND) {
1066
414
            if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
1067
414
                copy_strands(tc, a, 0, result, 0, 1);
1068
414
            }
1069
414
            else {
1070
414
                MVMROOT(tc, result, {
1071
414
                    a = collapse_strands(tc, a);
1072
414
                });
1073
414
                result->body.storage.strands[0].blob_string = a;
1074
414
                result->body.storage.strands[0].start       = 0;
1075
414
                result->body.storage.strands[0].end         = agraphs;
1076
414
            }
1077
414
        }
1078
414
        else {
1079
414
            result->body.storage.strands[0].blob_string = a;
1080
414
            result->body.storage.strands[0].start       = 0;
1081
414
            result->body.storage.strands[0].end         = agraphs;
1082
414
        }
1083
414
        result->body.storage.strands[0].repetitions = count - 1;
1084
414
        result->body.num_strands = 1;
1085
414
    });
1086
414
    /* If string a is not stable under concatenation, we need to create a flat
1087
414
     * string and ensure it is normalized */
1088
414
    if (!MVM_nfg_is_concat_stable(tc, a, a))
1089
0
        result = re_nfg(tc, result);
1090
414
    STRAND_CHECK(tc, result);
1091
414
    return result;
1092
414
}
1093
1094
315
void MVM_string_say(MVMThreadContext *tc, MVMString *a) {
1095
315
    MVM_string_check_arg(tc, a, "say");
1096
315
    MVM_string_print(tc, MVM_string_concatenate(tc, a,
1097
315
        tc->instance->str_consts.platform_newline));
1098
315
}
1099
1100
316
void MVM_string_print(MVMThreadContext *tc, MVMString *a) {
1101
316
    MVMOSHandle *handle = (MVMOSHandle *)tc->instance->stdout_handle;
1102
316
    MVMuint64 encoded_size;
1103
316
    char *encoded;
1104
316
    MVM_string_check_arg(tc, a, "print");
1105
316
    encoded = MVM_string_utf8_encode(tc, a, &encoded_size, MVM_TRANSLATE_NEWLINE_OUTPUT);
1106
316
    MVM_io_write_bytes_c(tc, tc->instance->stdout_handle, encoded, encoded_size);
1107
316
    MVM_free(encoded);
1108
316
}
1109
/* Meant to be pased in a MVMNormalizer of type MVM_NORMALIZE_NFD */
1110
2.77k
static MVMGrapheme32 ord_getbasechar (MVMThreadContext *tc, MVMGrapheme32 g) {
1111
2.77k
    /* If we get a synthetic, extract the base codepoint and call ord_getbasechar again */
1112
2.77k
    if (g < 0) {
1113
0
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
1114
0
        return ord_getbasechar(tc, synth->codes[synth->base_index]);
1115
0
    }
1116
2.77k
    else {
1117
2.77k
        MVMGrapheme32 return_g;
1118
2.77k
        MVMint32 ready;
1119
2.77k
        MVMNormalizer norm;
1120
2.77k
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);
1121
2.77k
1122
2.77k
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &return_g);
1123
2.77k
        MVM_unicode_normalizer_eof(tc, &norm);
1124
2.77k
        if (!ready)
1125
91
            return_g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
1126
2.77k
        MVM_unicode_normalizer_cleanup(tc, &norm);
1127
2.77k
        return return_g;
1128
2.77k
    }
1129
2.77k
}
1130
/* Tests whether one string a has the other string b as a substring at that index */
1131
1.93M
MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) {
1132
1.93M
1133
1.93M
    MVMStringIndex agraphs, bgraphs;
1134
1.93M
1135
1.93M
    MVM_string_check_arg(tc, a, "equal_at");
1136
1.93M
    MVM_string_check_arg(tc, b, "equal_at");
1137
1.93M
1138
1.93M
    agraphs = MVM_string_graphs_nocheck(tc, a);
1139
1.93M
    bgraphs = MVM_string_graphs_nocheck(tc, b);
1140
1.93M
1141
1.93M
    if (offset < 0) {
1142
3
        offset += agraphs;
1143
3
        if (offset < 0)
1144
1
            offset = 0; /* XXX I think this is the right behavior here */
1145
3
    }
1146
1.93M
    if (agraphs - offset < bgraphs)
1147
9.07k
        return 0;
1148
1.92M
    return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0);
1149
1.93M
}
1150
/* Ensure return value can hold numbers at least 3x higher than MVMStringIndex.
1151
 * Theoretically if the string has all ffi ligatures and 1/3 the max size of
1152
 * MVMStringIndex in length, we could have some weird results. */
1153
1154
/* ignoremark is 0 for normal operation and 1 for ignoring diacritics */
1155
2.63k
MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic) {
1156
2.63k
    MVMuint32 H_fc_cps;
1157
2.63k
    /* An additional needle offset which is used only when codepoints expand
1158
2.63k
     * when casefolded. The offset is the number of additional codepoints that
1159
2.63k
     * have been seen so Haystack and needle stay aligned */
1160
2.63k
    MVMint64 n_offset = 0;
1161
2.63k
    MVMint64 i, j;
1162
2.63k
    MVMGrapheme32 H_g, n_g;
1163
3.48k
    for (i = 0; i + H_start < H_graphs && i + n_offset < n_fc_graphs; i++) {
1164
2.94k
        const MVMCodepoint* H_result_cps;
1165
2.35k
        H_g = is_gic ? MVM_string_gi_cached_get_grapheme(tc, Hs_or_gic, H_start + i) : MVM_string_get_grapheme_at_nocheck(tc, Hs_or_gic, H_start + i);
1166
2.94k
        if (!ignorecase) {
1167
112
            H_fc_cps = 0;
1168
112
        }
1169
2.82k
        else if (0 <= H_g) {
1170
2.82k
            /* For codeponits we can get the case change directly */
1171
2.82k
            H_fc_cps = MVM_unicode_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, &H_result_cps);
1172
2.82k
        }
1173
0
        else {
1174
0
            /* Synthetics must use this function */
1175
0
            H_fc_cps = MVM_nfg_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &H_result_cps);
1176
0
        }
1177
2.94k
        /* If we get 0 for the number that means the cp doesn't change when casefolded */
1178
2.94k
        if (H_fc_cps == 0) {
1179
2.59k
            n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
1180
2.59k
            if (ignoremark) {
1181
1.23k
                H_g = ord_getbasechar(tc, H_g);
1182
1.23k
                n_g = ord_getbasechar(tc, n_g);
1183
1.23k
            }
1184
2.59k
            if (H_g != n_g)
1185
2.02k
                return -1;
1186
2.59k
        }
1187
347
        else if (1 <= H_fc_cps) {
1188
770
            for (j = 0; j < H_fc_cps; j++) {
1189
488
                n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
1190
488
                H_g = H_result_cps[j];
1191
488
                if (ignoremark) {
1192
129
                    H_g = ord_getbasechar(tc, H_g);
1193
129
                    n_g = ord_getbasechar(tc, n_g);
1194
129
                }
1195
488
                if (H_g != n_g)
1196
65
                    return -1;
1197
423
                n_offset++;
1198
423
            }
1199
282
            n_offset--;
1200
282
        }
1201
2.94k
    }
1202
543
    return n_offset;
1203
2.63k
    /* We return -1 if the strings are not equal and 0 or more if they are equal
1204
2.63k
     * The return values from 0, 1 etc designate how many Haystack graphemes
1205
2.63k
     * were expanded.
1206
2.63k
     * This may seem like an odd arangement, but this extra information is needed
1207
2.63k
     * to determine the length of the Haystack which was traversed, as it can
1208
2.63k
     * differ from the length of the needle if there are expansions. */
1209
2.63k
}
1210
/* Checks if needle exists at the offset, but ignores case.
1211
 * Sometimes there is a difference in length of a string before and after foldcase,
1212
 * because of this we must compare this differently than just foldcasing both
1213
 * strings to ensure the offset is correct */
1214
333
static MVMint64 string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset, int ignoremark, int ignorecase) {
1215
333
    /* Foldcase version of needle */
1216
333
    MVMString *needle_fc = NULL;
1217
333
    MVMStringIndex H_graphs = MVM_string_graphs(tc, Haystack);
1218
333
    MVMStringIndex n_graphs = MVM_string_graphs(tc, needle);
1219
333
    MVMStringIndex n_fc_graphs;
1220
333
    /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
1221
333
    MVMint64 H_expansion;
1222
333
1223
333
    if (H_offset < 0) {
1224
3
        H_offset += H_graphs;
1225
3
        if (H_offset < 0)
1226
1
            H_offset = 0; /* XXX I think this is the right behavior here */
1227
3
    }
1228
333
    /* If the offset is greater or equal to the number of Haystack graphemes
1229
333
     * return 0. Since size of graphemes could change under casefolding, we
1230
333
     * can't assume too much. If optimizing this be careful */
1231
333
    if (H_graphs <= H_offset)
1232
1
        return 0;
1233
332
    MVMROOT(tc, Haystack, {
1234
332
        needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle;
1235
332
    });
1236
332
    n_fc_graphs = MVM_string_graphs(tc, needle_fc);
1237
332
    if (Haystack->body.storage_type == MVM_STRING_STRAND) {
1238
160
        MVMGraphemeIter_cached H_gic;
1239
160
        MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
1240
160
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, &H_gic, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 1);
1241
160
    }
1242
172
    else {
1243
172
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 0);
1244
172
    }
1245
332
    if (0 <= H_expansion)
1246
296
        return n_fc_graphs <= H_graphs + H_expansion - H_offset ? 1 : 0;
1247
36
    return 0;
1248
332
}
1249
/* Processes the pattern. The pattern must be able to store negative and positive
1250
 * numbers. It must be able to store at least 1/2 the length of the needle,
1251
 * though possibly more (though I am not sure it's possible for it to be more than
1252
 * 1/2). */
1253
9.29k
static void knuth_morris_pratt_process_pattern (MVMThreadContext *tc, MVMString *pat, MVMint16 *next, MVMStringIndex pat_graphs) {
1254
9.29k
    MVMint64 i = 0;
1255
9.29k
    MVMint64 j = next[0] = -1;
1256
32.4k
    while (i < pat_graphs) {
1257
23.1k
        if (j == -1 || MVM_string_get_grapheme_at_nocheck(tc, pat, i)
1258
20.7k
                    == MVM_string_get_grapheme_at_nocheck(tc, pat, j)) {
1259
20.7k
            i++; j++;
1260
20.7k
            next[i] = (i < pat_graphs
1261
11.4k
            && MVM_string_get_grapheme_at_nocheck(tc, pat, j)
1262
11.4k
            == MVM_string_get_grapheme_at_nocheck(tc, pat, i))
1263
9.04k
                ? next[j]
1264
11.6k
                : j;
1265
20.7k
        }
1266
2.39k
        else j = next[j];
1267
23.1k
    }
1268
9.29k
}
1269
1270
9.29k
static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset) {
1271
9.29k
    MVMint64 needle_offset = 0;
1272
9.29k
    MVMint64 text_offset   = H_offset;
1273
9.29k
    MVMStringIndex Haystack_graphs = MVM_string_graphs_nocheck(tc, Haystack);
1274
9.29k
    MVMStringIndex needle_graphs   = MVM_string_graphs_nocheck(tc, needle);
1275
9.29k
    MVMint16         *next = NULL;
1276
9.29k
    MVMString *flat_needle = NULL;
1277
9.29k
    size_t next_size = (1 + needle_graphs) * sizeof(MVMint16);
1278
9.29k
    int    next_is_malloced = 0;
1279
9.29k
    assert(needle_graphs <= MVM_string_KMP_max_pattern_length);
1280
9.29k
    /* Empty string is found at start of string */
1281
9.29k
    if (needle_graphs == 0)
1282
0
        return 0;
1283
9.29k
    /* Allocate max 8K onto the stack, otherwise malloc */
1284
9.29k
    if (next_size < 3000)
1285
9.29k
        next = alloca(next_size);
1286
0
    else {
1287
0
        next = MVM_malloc(next_size);
1288
0
        next_is_malloced = 1;
1289
0
    }
1290
9.29k
    /* If the needle is a strand, flatten it, otherwise use the original string */
1291
9.29k
    flat_needle = needle->body.storage_type == MVM_STRING_STRAND
1292
248
        ? collapse_strands(tc, needle)
1293
9.04k
        : needle;
1294
9.29k
    /* Process the needle into a jump table put into variable 'next' */
1295
9.29k
    knuth_morris_pratt_process_pattern(tc, flat_needle, next, needle_graphs);
1296
9.29k
    /* If the Haystack is a strand, use MVM_string_gi_cached_get_grapheme
1297
9.29k
     * since it retains its grapheme iterator over invocations unlike
1298
9.29k
     * MVM_string_get_grapheme_at_nocheck and caches the previous grapheme. It
1299
9.29k
     * is slower for flat Haystacks though. */
1300
9.29k
    #define MVM_kmp_loop(Haystack_function) {\
1301
668k
        while (text_offset < Haystack_graphs && needle_offset < needle_graphs) {\
1302
659k
            if (needle_offset == -1 || MVM_string_get_grapheme_at_nocheck(tc, flat_needle, needle_offset)\
1303
581k
                                    == (Haystack_function)) {\
1304
581k
                text_offset++; needle_offset++;\
1305
581k
                if (needle_offset == needle_graphs) {\
1306
258
                    if (next_is_malloced) MVM_free(next);\
1307
258
                    return text_offset - needle_offset;\
1308
258
                }\
1309
581k
            }\
1310
77.9k
            else needle_offset = next[needle_offset];\
1311
659k
        }\
1312
9.29k
    }
1313
9.29k
    if (Haystack->body.storage_type == MVM_STRING_STRAND) {
1314
9.29k
        MVMGraphemeIter_cached H_gic;
1315
9.29k
        MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
1316
9.29k
        MVM_kmp_loop(MVM_string_gi_cached_get_grapheme(tc, &H_gic, text_offset));
1317
9.03k
    }
1318
0
    else {
1319
0
        MVM_kmp_loop(MVM_string_get_grapheme_at_nocheck(tc, Haystack, text_offset));
1320
0
    }
1321
9.03k
    if (next_is_malloced) MVM_free(next);
1322
9.03k
    return -1;
1323
9.29k
}
1324
270
static MVMint64 string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start, int ignoremark, int ignorecase) {
1325
270
    /* Foldcase version of needle */
1326
270
    MVMString *needle_fc = NULL;
1327
270
    MVMStringIndex n_fc_graphs;
1328
270
1329
270
    size_t index           = (size_t)start;
1330
270
    MVMStringIndex H_graphs, n_graphs;
1331
270
    /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
1332
270
    MVMint64 H_expansion;
1333
270
    MVMint64 return_val = -1;
1334
270
    int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0;
1335
270
    void *Hs_or_gic = Haystack;
1336
142
    MVM_string_check_arg(tc, Haystack, ignoremark ? "index ignore case ignore mark search target" : "index ignore case search target");
1337
142
    MVM_string_check_arg(tc, needle,   ignoremark ? "index ignore case ignore mark search term"   : "index ignore case search term");
1338
270
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
1339
270
    n_graphs = MVM_string_graphs_nocheck(tc, needle);
1340
270
    if (!n_graphs)
1341
12
        return start <= H_graphs ? start : -1; /* Empty string is in any other string */
1342
258
    if (!H_graphs)
1343
0
        return -1;
1344
258
    if (start < 0 || H_graphs <= start)
1345
0
        return -1;
1346
258
    /* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check
1347
258
     * checks if it is at all possible for the needle grapheme number to be higher
1348
258
     * than the Haystack */
1349
258
    if (H_graphs * 3 < n_graphs)
1350
11
        return -1;
1351
258
1352
247
    if (n_graphs < 1)
1353
0
        return -1;
1354
247
1355
247
    MVMROOT(tc, Haystack, {
1356
247
        needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle;
1357
247
    });
1358
247
    n_fc_graphs = MVM_string_graphs(tc, needle_fc);
1359
247
    /* brute force for now. horrible, yes. halp. */
1360
247
    if (is_gic) {
1361
180
        Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached));
1362
180
        MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start);
1363
180
    }
1364
2.30k
    while (index <= H_graphs) {
1365
2.30k
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle_fc, index, H_graphs, n_fc_graphs, ignoremark, ignorecase, is_gic);
1366
2.30k
        if (0 <= H_expansion)
1367
247
            return n_fc_graphs <= H_graphs + H_expansion - index ? (MVMint64)index : -1;
1368
2.05k
        index++;
1369
2.05k
    }
1370
0
    return -1;
1371
247
}
1372
1373
221
MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
1374
221
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0, 1);
1375
221
}
1376
142
MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
1377
142
    return string_index_ignore_case(tc, Haystack, needle, start, 0, 1);
1378
142
}
1379
110
MVMint64 MVM_string_equal_at_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
1380
110
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 0);
1381
110
}
1382
4
MVMint64 MVM_string_index_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
1383
4
    return string_index_ignore_case(tc, Haystack, needle, start, 1, 0);
1384
4
}
1385
2
MVMint64 MVM_string_equal_at_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
1386
2
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 1);
1387
2
}
1388
124
MVMint64 MVM_string_index_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
1389
124
    return string_index_ignore_case(tc, Haystack, needle, start, 1, 1);
1390
124
}
1391
1392
932k
MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
1393
932k
    MVMStringIndex agraphs;
1394
932k
    MVMGrapheme32 g;
1395
932k
1396
932k
    MVM_string_check_arg(tc, s, "grapheme_at");
1397
932k
1398
932k
    agraphs = MVM_string_graphs(tc, s);
1399
932k
    if (offset < 0 || agraphs <= offset)
1400
0
        return -1;
1401
932k
1402
932k
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
1403
932k
1404
932k
    return 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
1405
932k
}
1406
1407
/* Gets the base character at a grapheme position, ignoring things like diacritics */
1408
54
MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
1409
54
    MVMStringIndex agraphs;
1410
54
    MVMint32 ready;
1411
54
1412
54
    MVM_string_check_arg(tc, s, "ord_basechar_at");
1413
54
1414
54
    agraphs = MVM_string_graphs_nocheck(tc, s);
1415
54
    if (offset < 0 || agraphs <= offset)
1416
0
        return -1;  /* fixes RT #126771 */
1417
54
1418
54
    return ord_getbasechar(tc, MVM_string_get_grapheme_at_nocheck(tc, s, offset));
1419
54
}
1420
1421
1422
/* Compares two strings for equality. */
1423
55.3M
MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) {
1424
55.3M
    MVMStringIndex agraphs, bgraphs;
1425
55.3M
1426
55.3M
    MVM_string_check_arg(tc, a, "equal");
1427
55.3M
    MVM_string_check_arg(tc, b, "equal");
1428
55.3M
1429
55.3M
    if (a == b)
1430
16.2M
        return 1;
1431
55.3M
1432
39.0M
    agraphs = MVM_string_graphs_nocheck(tc, a);
1433
39.0M
    bgraphs = MVM_string_graphs_nocheck(tc, b);
1434
39.0M
1435
39.0M
    if (agraphs != bgraphs)
1436
23.8M
        return 0;
1437
39.0M
    /* If we have cached hash codes from both a and b we can compare if they are identical.
1438
39.0M
     * If they don't match then we already know the two strings are not equal. */
1439
15.1M
    if (a->body.cached_hash_code && b->body.cached_hash_code && a->body.cached_hash_code != b->body.cached_hash_code)
1440
115k
        return 0;
1441
15.1M
1442
15.0M
    return MVM_string_substrings_equal_nocheck(tc, a, 0, bgraphs, b, 0);
1443
15.1M
}
1444
1445
/* more general form of has_at; compares two substrings for equality */
1446
MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a,
1447
0
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
1448
0
1449
0
    MVM_string_check_arg(tc, a, "have_at");
1450
0
    MVM_string_check_arg(tc, b, "have_at");
1451
0
1452
0
    if (starta < 0 || startb < 0)
1453
0
        return 0;
1454
0
    if (length == 0)
1455
0
        return 1;
1456
0
    if (MVM_string_graphs_nocheck(tc, a) < starta + length || MVM_string_graphs_nocheck(tc, b) < startb + length)
1457
0
        return 0;
1458
0
1459
0
    return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb);
1460
0
}
1461
1462
/* Returns the grapheme at a given index of the string */
1463
625
MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
1464
625
    MVMStringIndex agraphs;
1465
625
1466
625
    MVM_string_check_arg(tc, a, "grapheme_at");
1467
625
1468
625
    agraphs = MVM_string_graphs_nocheck(tc, a);
1469
625
1470
625
    if (index < 0 || agraphs <= index)
1471
0
        MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64,
1472
0
            agraphs - 1, index);
1473
625
1474
625
    return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index);
1475
625
}
1476
1477
/* Finds the location of a grapheme in a string.  Useful for small character class lookup */
1478
2.15M
MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) {
1479
2.15M
    size_t index = -1;
1480
2.15M
    MVMGraphemeIter gi;
1481
2.15M
1482
2.15M
    MVM_string_check_arg(tc, a, "string_index_of_grapheme");
1483
2.15M
1484
2.15M
    MVM_string_gi_init(tc, &gi, a);
1485
12.1M
    while (MVM_string_gi_has_more(tc, &gi)) {
1486
10.3M
        index++;
1487
10.3M
        if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme)
1488
296k
            return index;
1489
10.3M
    }
1490
1.86M
    return -1;
1491
2.15M
}
1492
1493
/* Case change functions. */
1494
MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g);
1495
13.2k
static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) {
1496
13.2k
    MVMint64 sgraphs;
1497
13.2k
    MVM_string_check_arg(tc, s, error);
1498
13.2k
    sgraphs = MVM_string_graphs_nocheck(tc, s);
1499
13.2k
    if (sgraphs) {
1500
11.7k
        MVMString *result;
1501
11.7k
        MVMGraphemeIter gi;
1502
11.7k
        MVMint64 result_graphs = sgraphs;
1503
11.7k
        MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32));
1504
11.7k
        MVMint32 changed = 0;
1505
11.7k
        MVMint64 i = 0;
1506
11.7k
        MVM_string_gi_init(tc, &gi, s);
1507
69.9k
        while (MVM_string_gi_has_more(tc, &gi)) {
1508
58.1k
            MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
1509
58.1k
          peeked:
1510
58.1k
            if (g == 0x03A3) {
1511
0
                /* Greek sigma needs special handling when lowercased. */
1512
0
                switch (type) {
1513
0
                    case MVM_unicode_case_change_type_upper:
1514
0
                    case MVM_unicode_case_change_type_title:
1515
0
                        result_buf[i++] = g;
1516
0
                        break;
1517
0
                    case MVM_unicode_case_change_type_lower:
1518
0
                        changed = 1;
1519
0
                        if (i == 0) {
1520
0
                            /* Start of string, so not final. */
1521
0
                            result_buf[i++] = 0x03C3;
1522
0
                        }
1523
0
                        else if (!MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) {
1524
0
                            /* Previous char is not a letter; not final (as has
1525
0
                             * to be at end of a word and not only thing in a
1526
0
                             * word). */
1527
0
                            result_buf[i++] = 0x03C3;
1528
0
                        }
1529
0
                        else if (!MVM_string_gi_has_more(tc, &gi)) {
1530
0
                            /* End of string. We only reach here if we have a
1531
0
                             * letter before us, so it must be final. */
1532
0
                            result_buf[i++] = 0x03C2;
1533
0
                        }
1534
0
                        else {
1535
0
                            /* Letter before us, something ahead of us. Need to
1536
0
                             * peek ahead to see if it's a letter, to decide if
1537
0
                             * we have final sigma or not. */
1538
0
                            g = MVM_string_gi_get_grapheme(tc, &gi);
1539
0
                            if (MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g))
1540
0
                                result_buf[i++] = 0x03C3;
1541
0
                            else
1542
0
                                result_buf[i++] = 0x03C2;
1543
0
                            goto peeked;
1544
0
                        }
1545
0
                        break;
1546
0
                    case MVM_unicode_case_change_type_fold:
1547
0
                        result_buf[i++] = 0x03C3;
1548
0
                        changed = 1;
1549
0
                        break;
1550
0
                }
1551
0
            }
1552
58.1k
            else if (0 <= g) {
1553
58.1k
                const MVMCodepoint *result_cps;
1554
58.1k
                MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
1555
58.1k
                    g, type, &result_cps);
1556
58.1k
                if (num_result_cps == 0) {
1557
40.7k
                    result_buf[i++] = g;
1558
40.7k
                }
1559
17.4k
                else if (num_result_cps == 1) {
1560
17.3k
                    result_buf[i++] = *result_cps;
1561
17.3k
                    changed = 1;
1562
17.3k
                }
1563
148
                else {
1564
148
                    /* To maintain NFG, we need to re-normalize when we get an
1565
148
                     * expansion. */
1566
148
                    MVMNormalizer norm;
1567
148
                    MVMint32 num_result_graphs;
1568
148
                    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
1569
148
                    MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps);
1570
148
                    MVM_unicode_normalizer_eof(tc, &norm);
1571
148
                    num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
1572
148
1573
148
                    /* Make space for any extra graphemes. */
1574
148
                    if (1 < num_result_graphs) {
1575
148
                        result_graphs += num_result_graphs - 1;
1576
148
                        result_buf = MVM_realloc(result_buf,
1577
148
                            result_graphs * sizeof(MVMGrapheme32));
1578
148
                    }
1579
148
1580
148
                    /* Copy resulting graphemes. */
1581
444
                    while (0 < num_result_graphs) {
1582
296
                        result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
1583
296
                        num_result_graphs--;
1584
296
                    }
1585
148
                    changed = 1;
1586
148
1587
148
                    /* Clean up normalizer (we could init one per transform
1588
148
                     * and keep it around in the future, if we find it's a
1589
148
                     * worthwhile gain). */
1590
148
                    MVM_unicode_normalizer_cleanup(tc, &norm);
1591
148
                }
1592
58.1k
            }
1593
0
            else {
1594
0
                MVMGrapheme32 *transformed;
1595
0
                MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed);
1596
0
                if (num_transformed == 0) {
1597
0
                    result_buf[i++] = g;
1598
0
                }
1599
0
                else if (num_transformed == 1) {
1600
0
                    result_buf[i++] = *transformed;
1601
0
                    changed = 1;
1602
0
                }
1603
0
                else {
1604
0
                    MVMuint32 j;
1605
0
                    result_graphs += num_transformed - 1;
1606
0
                    result_buf = MVM_realloc(result_buf,
1607
0
                        result_graphs * sizeof(MVMGrapheme32));
1608
0
                    MVM_VECTORIZE_LOOP
1609
0
                    for (j = 0; j < num_transformed; j++)
1610
0
                        result_buf[i++] = transformed[j];
1611
0
                    changed = 1;
1612
0
                }
1613
0
            }
1614
58.1k
        }
1615
11.7k
        if (changed) {
1616
3.62k
            result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1617
3.62k
            result->body.num_graphs      = result_graphs;
1618
3.62k
            result->body.storage_type    = MVM_STRING_GRAPHEME_32;
1619
3.62k
            result->body.storage.blob_32 = result_buf;
1620
3.62k
            return result;
1621
3.62k
        }
1622
8.12k
        else {
1623
8.12k
            MVM_free(result_buf);
1624
8.12k
        }
1625
11.7k
    }
1626
9.67k
    STRAND_CHECK(tc, s);
1627
9.67k
    return s;
1628
13.2k
}
1629
128
MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) {
1630
128
    return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc");
1631
128
}
1632
12.6k
MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) {
1633
12.6k
    return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc");
1634
12.6k
}
1635
12
MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) {
1636
12
    return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc");
1637
12
}
1638
469
MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) {
1639
469
    return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc");
1640
469
}
1641
1642
/* "Strict"ly (if possible) decodes a C buffer to an MVMString, dependent on the
1643
 * encoding type flag. Unlike MVM_string_decode, it will not pass through
1644
 * codepoints which have no official mapping. `config` can be set to 1 to indicate
1645
 * that you want to decode non-strict ("permissive"), which will try and decode
1646
 * as long as it's possible (For example codepoint 129 in windows-1252 is invalid,
1647
 * but is technically possible to use Unicode codepoint 129 instead (though it's
1648
 * most likely this means the input is actually *not* windows-1252).
1649
 * For now windows-1252 and windows-1251 are the only ones this makes a difference
1650
 * on. And it is mostly irrelevant for utf8/utf8-c8 encodings since they can
1651
 * already represent all codepoints below 0x10FFFF */
1652
MVMString * MVM_string_decode_config(MVMThreadContext *tc,
1653
        const MVMObject *type_object, char *Cbuf, MVMint64 byte_length,
1654
29
        MVMint64 encoding_flag, MVMString *replacement, MVMint64 config) {
1655
29
    switch(encoding_flag) {
1656
6
        case MVM_encoding_type_utf8:
1657
6
            return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length);
1658
0
        case MVM_encoding_type_ascii:
1659
0
            return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length);
1660
2
        case MVM_encoding_type_latin1:
1661
2
            return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length);
1662
1
        case MVM_encoding_type_utf16:
1663
1
            return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length);
1664
7
        case MVM_encoding_type_windows1252:
1665
7
            return MVM_string_windows1252_decode_config(tc, type_object, Cbuf, byte_length, replacement, config);
1666
0
        case MVM_encoding_type_windows1251:
1667
0
            return MVM_string_windows1251_decode_config(tc, type_object, Cbuf, byte_length, replacement, config);
1668
0
        case MVM_encoding_type_shiftjis:
1669
0
            return MVM_string_shiftjis_decode(tc, type_object, Cbuf, byte_length, replacement, config);
1670
13
        case MVM_encoding_type_utf8_c8:
1671
13
            return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length);
1672
0
        default:
1673
0
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
1674
29
    }
1675
29
}
1676
/* Strictly decodes a C buffer to an MVMString, dependent on the encoding type flag.
1677
 * See the comments above MVM_string_decode_config() above for more details. */
1678
MVMString * MVM_string_decode(MVMThreadContext *tc,
1679
4
        const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) {
1680
4
    return MVM_string_decode_config(tc, type_object, Cbuf, byte_length, encoding_flag, NULL, MVM_ENCODING_PERMISSIVE);
1681
4
}
1682
1683
/* Strictly encodes an MVMString to a C buffer, dependent on the encoding type flag.
1684
 * See comments for MVM_string_decode_config() above for more details. */
1685
char * MVM_string_encode_config(MVMThreadContext *tc, MVMString *s, MVMint64 start,
1686
        MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
1687
13.0k
        MVMString *replacement, MVMint32 translate_newlines, MVMuint8 config) {
1688
13.0k
    switch(encoding_flag) {
1689
13.0k
        case MVM_encoding_type_utf8:
1690
13.0k
            return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1691
6
        case MVM_encoding_type_ascii:
1692
6
            return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1693
2
        case MVM_encoding_type_latin1:
1694
2
            return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1695
1
        case MVM_encoding_type_utf16:
1696
1
            return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1697
9
        case MVM_encoding_type_windows1252:
1698
9
            return MVM_string_windows1252_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config);
1699
0
        case MVM_encoding_type_windows1251:
1700
0
            return MVM_string_windows1251_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config);
1701
0
        case MVM_encoding_type_shiftjis:
1702
0
            return MVM_string_shiftjis_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines, config);
1703
8
        case MVM_encoding_type_utf8_c8:
1704
8
            return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement);
1705
0
        default:
1706
0
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
1707
13.0k
    }
1708
13.0k
}
1709
char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start,
1710
        MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
1711
0
        MVMString *replacement, MVMint32 translate_newlines) {
1712
0
    return MVM_string_encode_config(tc, s, start, length, output_size, encoding_flag, replacement, translate_newlines, MVM_ENCODING_PERMISSIVE);
1713
0
}
1714
1715
/* Strictly encodes a string, and writes the encoding string into the supplied Buf
1716
 * instance, which should be an integer array with MVMArray REPR. */
1717
MVMObject * MVM_string_encode_to_buf_config(MVMThreadContext *tc, MVMString *s, MVMString *enc_name,
1718
13.0k
        MVMObject *buf, MVMString *replacement, MVMint64 config) {
1719
13.0k
    MVMuint64 output_size;
1720
13.0k
    MVMuint8 *encoded;
1721
13.0k
    MVMArrayREPRData *buf_rd;
1722
13.0k
    MVMuint8 elem_size = 0;
1723
13.0k
1724
13.0k
    /* Ensure the target is in the correct form. */
1725
13.0k
    MVM_string_check_arg(tc, s, "encode");
1726
13.0k
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
1727
0
        MVM_exception_throw_adhoc(tc, "encode requires a native array to write into");
1728
13.0k
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
1729
13.0k
    if (buf_rd) {
1730
13.0k
        switch (buf_rd->slot_type) {
1731
0
            case MVM_ARRAY_I64: elem_size = 8; break;
1732
0
            case MVM_ARRAY_I32: elem_size = 4; break;
1733
0
            case MVM_ARRAY_I16: elem_size = 2; break;
1734
0
            case MVM_ARRAY_I8:  elem_size = 1; break;
1735
0
            case MVM_ARRAY_U64: elem_size = 8; break;
1736
0
            case MVM_ARRAY_U32: elem_size = 4; break;
1737
1
            case MVM_ARRAY_U16: elem_size = 2; break;
1738
13.0k
            case MVM_ARRAY_U8:  elem_size = 1; break;
1739
13.0k
        }
1740
13.0k
    }
1741
13.0k
    if (!elem_size)
1742
0
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");
1743
13.0k
    if (((MVMArray *)buf)->body.slots.any)
1744
0
        MVM_exception_throw_adhoc(tc, "encode requires an empty array");
1745
13.0k
1746
13.0k
    /* At least find_encoding may allocate on first call, so root just
1747
13.0k
     * in case. */
1748
13.0k
    MVMROOT2(tc, buf, s, {
1749
13.0k
        const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name);
1750
13.0k
        encoded = (MVMuint8 *)MVM_string_encode_config(tc, s, 0, MVM_string_graphs_nocheck(tc, s), &output_size,
1751
13.0k
            encoding_flag, replacement, 0, config);
1752
13.0k
    });
1753
13.0k
1754
13.0k
    /* Stash the encoded data in the VMArray. */
1755
13.0k
    ((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded;
1756
13.0k
    ((MVMArray *)buf)->body.start    = 0;
1757
13.0k
    ((MVMArray *)buf)->body.ssize    = output_size / elem_size;
1758
13.0k
    ((MVMArray *)buf)->body.elems    = output_size / elem_size;
1759
13.0k
    return buf;
1760
13.0k
}
1761
MVMObject * MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name,
1762
13.0k
        MVMObject *buf, MVMString *replacement) {
1763
13.0k
    return MVM_string_encode_to_buf_config(tc, s, enc_name, buf, replacement, MVM_ENCODING_PERMISSIVE);
1764
13.0k
}
1765
/* Decodes a string using the data from the specified Buf. Decodes "strict" by
1766
 * default, but optionally can be "permissive". */
1767
MVMString * MVM_string_decode_from_buf_config(MVMThreadContext *tc, MVMObject *buf,
1768
25
        MVMString *enc_name, MVMString *replacement, MVMint64 config) {
1769
25
    MVMArrayREPRData *buf_rd;
1770
25
    MVMuint8 encoding_flag;
1771
25
    MVMuint8 elem_size = 0;
1772
25
1773
25
    /* Ensure the source is in the correct form. */
1774
25
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
1775
0
        MVM_exception_throw_adhoc(tc, "decode requires a native array to read from");
1776
25
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
1777
25
    if (buf_rd) {
1778
25
        switch (buf_rd->slot_type) {
1779
0
            case MVM_ARRAY_I64: elem_size = 8; break;
1780
0
            case MVM_ARRAY_I32: elem_size = 4; break;
1781
0
            case MVM_ARRAY_I16: elem_size = 2; break;
1782
1
            case MVM_ARRAY_I8:  elem_size = 1; break;
1783
0
            case MVM_ARRAY_U64: elem_size = 8; break;
1784
0
            case MVM_ARRAY_U32: elem_size = 4; break;
1785
1
            case MVM_ARRAY_U16: elem_size = 2; break;
1786
23
            case MVM_ARRAY_U8:  elem_size = 1; break;
1787
25
        }
1788
25
    }
1789
25
    if (!elem_size)
1790
0
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");
1791
25
1792
25
    /* Decode. */
1793
25
    MVMROOT(tc, buf, {
1794
25
        encoding_flag = MVM_string_find_encoding(tc, enc_name);
1795
25
    });
1796
25
    return MVM_string_decode_config(tc, tc->instance->VMString,
1797
25
        (char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start),
1798
25
        ((MVMArray *)buf)->body.elems * elem_size,
1799
25
        encoding_flag, replacement, config);
1800
25
}
1801
21
MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) {
1802
21
    return MVM_string_decode_from_buf_config(tc, buf, enc_name, NULL, MVM_ENCODING_PERMISSIVE);
1803
21
}
1804
1805
43.3k
MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) {
1806
43.3k
    MVMObject *result = NULL;
1807
43.3k
    MVMStringIndex start, end, sep_length;
1808
43.3k
    MVMHLLConfig *hll = MVM_hll_current(tc);
1809
43.3k
1810
43.3k
    MVM_string_check_arg(tc, separator, "split separator");
1811
43.3k
    MVM_string_check_arg(tc, input, "split input");
1812
43.3k
1813
43.3k
    MVMROOT3(tc, input, separator, result, {
1814
43.3k
        result = MVM_repr_alloc_init(tc, hll->slurpy_array_type);
1815
43.3k
        start = 0;
1816
43.3k
        end = MVM_string_graphs_nocheck(tc, input);
1817
43.3k
        sep_length = MVM_string_graphs_nocheck(tc, separator);
1818
43.3k
1819
43.3k
        while (start < end) {
1820
43.3k
            MVMString *portion;
1821
43.3k
            MVMStringIndex index;
1822
43.3k
            MVMStringIndex length;
1823
43.3k
1824
43.3k
            /* XXX make this use the dual-traverse iterator, but such that it
1825
43.3k
                can reset the index of what it's comparing... <!> */
1826
43.3k
            index = MVM_string_index(tc, input, separator, start);
1827
43.3k
            length = sep_length ? (index == -1 ? end : index) - start : 1;
1828
43.3k
            if (0 < length || (sep_length && length == 0)) {
1829
43.3k
                portion = MVM_string_substring(tc, input, start, length);
1830
43.3k
                MVMROOT(tc, portion, {
1831
43.3k
                    MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
1832
43.3k
                    MVM_repr_set_str(tc, pobj, portion);
1833
43.3k
                    MVM_repr_push_o(tc, result, pobj);
1834
43.3k
                });
1835
43.3k
            }
1836
43.3k
            start += length + sep_length;
1837
43.3k
            /* Gather an empty string if the delimiter is found at the end. */
1838
43.3k
            if (sep_length && start == end) {
1839
43.3k
                MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
1840
43.3k
                MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty);
1841
43.3k
                MVM_repr_push_o(tc, result, pobj);
1842
43.3k
            }
1843
43.3k
        }
1844
43.3k
    });
1845
43.3k
1846
43.3k
    return result;
1847
43.3k
}
1848
/* Used in the MVM_string_join function. Moved here to simplify the code */
1849
void copy_to_32bit (MVMThreadContext *tc, MVMString *source,
1850
28.9k
    MVMString *dest, MVMint64 *position, MVMGraphemeIter *gi) {
1851
28.9k
    /* Add source. */
1852
28.9k
    switch (source->body.storage_type) {
1853
636
        case MVM_STRING_GRAPHEME_32: {
1854
636
            memcpy(
1855
636
                dest->body.storage.blob_32 + *position,
1856
636
                source->body.storage.blob_32,
1857
636
                source->body.num_graphs * sizeof(MVMGrapheme32));
1858
636
            *position += source->body.num_graphs;
1859
636
            break;
1860
636
        }
1861
2.72k
        case MVM_STRING_GRAPHEME_ASCII:
1862
2.72k
        case MVM_STRING_GRAPHEME_8: {
1863
2.72k
            MVMStringIndex sindex = 0;
1864
8.37k
            while (sindex < source->body.num_graphs)
1865
5.65k
                dest->body.storage.blob_32[(*position)++] =
1866
5.65k
                    source->body.storage.blob_8[sindex++];
1867
2.72k
            break;
1868
2.72k
        }
1869
25.5k
        default:
1870
25.5k
            MVM_string_gi_init(tc, gi, source);
1871
787k
            while (MVM_string_gi_has_more(tc, gi))
1872
762k
                dest->body.storage.blob_32[(*position)++] =
1873
762k
                    MVM_string_gi_get_grapheme(tc, gi);
1874
25.5k
            break;
1875
28.9k
    }
1876
28.9k
}
1877
/* Used in MVM_string_join to check stability of adding the next piece */
1878
MVM_STATIC_INLINE void join_check_stability(MVMThreadContext *tc, MVMString *piece,
1879
15.3k
    MVMString *separator, MVMString **pieces, MVMint32 *concats_stable, MVMint64 num_pieces, MVMint64 sgraphs, MVMint64 piece_index) {
1880
15.3k
    if (!sgraphs) {
1881
15.0k
        /* If there's no separator and one piece is The Empty String we
1882
15.0k
         * have to be extra careful about concat stability */
1883
15.0k
        if (!MVM_string_graphs_nocheck(tc, piece)
1884
18
                && piece_index + 1 < num_pieces
1885
15
                && !MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], pieces[piece_index + 1])) {
1886
0
            *concats_stable = 0;
1887
0
        }
1888
15.0k
        /* Separator has no graphemes, so NFG stability check
1889
15.0k
         * should consider pieces. */
1890
15.0k
        else if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], piece))
1891
4
            *concats_stable = 0;
1892
15.0k
    }
1893
15.3k
    /* If we have a separator, check concat stability */
1894
306
    else {
1895
306
        if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], separator) /* Before */
1896
306
         || !MVM_nfg_is_concat_stable(tc, separator, piece)) { /* And after separator */
1897
0
            *concats_stable = 0;
1898
0
        }
1899
306
    }
1900
15.3k
}
1901
34.4k
MVM_STATIC_INLINE MVMString * join_get_str_from_pos(MVMThreadContext *tc, MVMObject *array, MVMint64 index, MVMint64 is_str_array) {
1902
34.4k
    if (is_str_array) {
1903
27.0k
        MVMString *piece = MVM_repr_at_pos_s(tc, array, index);
1904
27.0k
        if (piece)
1905
27.0k
            return piece;
1906
27.0k
    }
1907
7.44k
    else {
1908
7.44k
        MVMObject *item = MVM_repr_at_pos_o(tc, array, index);
1909
7.44k
        if (item && IS_CONCRETE(item))
1910
7.44k
            return MVM_repr_get_str(tc, item);
1911
7.44k
    }
1912
0
    return (MVMString*)NULL;
1913
34.4k
}
1914
19.2k
MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) {
1915
19.2k
    MVMString  *result = NULL;
1916
19.2k
    MVMString **pieces = NULL;
1917
19.2k
    MVMint64    elems, num_pieces, sgraphs, i, is_str_array, total_graphs;
1918
19.2k
    MVMuint16   sstrands, total_strands;
1919
19.2k
    MVMint32    concats_stable = 1, all_strands;
1920
19.2k
    size_t      bytes;
1921
19.2k
1922
19.2k
    MVM_string_check_arg(tc, separator, "join separator");
1923
19.2k
    if (!IS_CONCRETE(input))
1924
0
        MVM_exception_throw_adhoc(tc, "join needs a concrete array to join");
1925
19.2k
1926
19.2k
    /* See how many things we have to join; if the answer is "none" then we
1927
19.2k
     * can make a hasty escape. */
1928
19.2k
    elems = MVM_repr_elems(tc, input);
1929
19.2k
    if (elems == 0)
1930
171
        return tc->instance->str_consts.empty;
1931
19.0k
    bytes = elems * sizeof(MVMString *);
1932
19.0k
    is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc,
1933
19.0k
        STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR;
1934
19.0k
1935
19.0k
    /* If there's only one element to join, just return it. */
1936
19.0k
    if (elems == 1) {
1937
5.83k
        MVMString *piece = join_get_str_from_pos(tc, input, 0, is_str_array);
1938
5.83k
        if (piece)
1939
5.83k
            return piece;
1940
5.83k
    }
1941
19.0k
1942
19.0k
    /* Allocate result. */
1943
13.2k
    MVMROOT2(tc, separator, input, {
1944
13.2k
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1945
13.2k
    });
1946
13.2k
1947
13.2k
    /* Take a first pass through the string, counting up length and the total
1948
13.2k
     * number of strands we encounter as well as building a flat array of the
1949
13.2k
     * strings (so we only have to do the indirect calls once). */
1950
13.2k
    sgraphs  = MVM_string_graphs_nocheck(tc, separator);
1951
13.2k
    if (sgraphs)
1952
64
        sstrands = separator->body.storage_type == MVM_STRING_STRAND
1953
0
            ? separator->body.num_strands
1954
64
            : 1;
1955
13.2k
    else
1956
13.1k
        sstrands = 1;
1957
13.2k
    pieces        = MVM_fixed_size_alloc(tc, tc->instance->fsa, bytes);
1958
13.2k
    num_pieces    = 0;
1959
13.2k
    total_graphs  = 0;
1960
13.2k
    total_strands = 0;
1961
13.2k
    /* Is the separator a strand? */
1962
13.2k
    all_strands = separator->body.storage_type == MVM_STRING_STRAND;
1963
41.8k
    for (i = 0; i < elems; i++) {
1964
28.6k
        /* Get piece of the string. */
1965
28.6k
        MVMString *piece = join_get_str_from_pos(tc, input, i, is_str_array);
1966
28.6k
        MVMint64   piece_graphs;
1967
28.6k
        if (!piece)
1968
0
            continue;
1969
28.6k
1970
28.6k
        /* Check that all the pieces are strands. */
1971
28.6k
        if (all_strands)
1972
0
            all_strands = piece->body.storage_type == MVM_STRING_STRAND;
1973
28.6k
1974
28.6k
        /* If it wasn't the first piece, add separator here. */
1975
28.6k
        if (num_pieces) {
1976
15.4k
            total_strands += sstrands;
1977
15.4k
            total_graphs  += sgraphs;
1978
15.4k
        }
1979
28.6k
1980
28.6k
        /* Add on the piece's strands and graphs. */
1981
28.6k
        piece_graphs = MVM_string_graphs(tc, piece);
1982
28.6k
        if (piece_graphs) {
1983
28.4k
            total_strands += piece->body.storage_type == MVM_STRING_STRAND
1984
25.5k
                ? piece->body.num_strands
1985
2.91k
                : 1;
1986
28.4k
            total_graphs += piece_graphs;
1987
28.4k
        }
1988
28.6k
1989
28.6k
        /* Store piece. */
1990
28.6k
        pieces[num_pieces++] = piece;
1991
28.6k
    }
1992
13.2k
    /* This guards the joining by method of multiple concats, and will be faster
1993
13.2k
     * if we only end up with one piece after going through each element of the array */
1994
13.2k
    if (num_pieces == 1)
1995
0
        return pieces[0];
1996
13.2k
    /* We now know the total eventual number of graphemes. */
1997
13.2k
    if (total_graphs == 0) {
1998
10
        MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces);
1999
10
        return tc->instance->str_consts.empty;
2000
10
    }
2001
13.2k
    result->body.num_graphs = total_graphs;
2002
13.2k
2003
13.2k
    MVMROOT(tc, result, {
2004
13.2k
    /* If the separator and pieces are all strands, and there are
2005
13.2k
     * on average at least 16 graphemes in each of the strands. */
2006
13.2k
    if (all_strands && total_strands <  MVM_STRING_MAX_STRANDS
2007
13.2k
              &&  total_strands * 16 <= total_graphs) {
2008
13.2k
        MVMuint16 offset = 0;
2009
13.2k
        result->body.storage_type    = MVM_STRING_STRAND;
2010
13.2k
        result->body.storage.strands = allocate_strands(tc, total_strands);
2011
13.2k
        result->body.num_strands     = total_strands;
2012
13.2k
        for (i = 0; i < num_pieces; i++) {
2013
13.2k
            MVMString *piece = pieces[i];
2014
13.2k
            if (0 < i) {
2015
13.2k
                /* No more checks unless still stable */
2016
13.2k
                if (concats_stable)
2017
13.2k
                    join_check_stability(tc, piece, separator, pieces,
2018
13.2k
                        &concats_stable, num_pieces, sgraphs, i);
2019
13.2k
                copy_strands(tc, separator, 0, result, offset, separator->body.num_strands);
2020
13.2k
                offset += separator->body.num_strands;
2021
13.2k
            }
2022
13.2k
            copy_strands(tc, piece, 0, result, offset, piece->body.num_strands);
2023
13.2k
            offset += piece->body.num_strands;
2024
13.2k
        }
2025
13.2k
    }
2026
13.2k
    /* Doing multiple concats is only faster if we have about 300 graphemes per
2027
13.2k
       piece or if we have less than for pieces and more than 150 graphemes per piece */
2028
13.2k
    else if (total_strands <  MVM_STRING_MAX_STRANDS && (300 < num_pieces/total_graphs || (num_pieces < 4 && 150 < num_pieces/total_graphs))) {
2029
13.2k
        MVMString *result = NULL;
2030
13.2k
        MVMROOT(tc, result, {
2031
13.2k
            if (sgraphs) {
2032
13.2k
                i = 0;
2033
13.2k
                result = MVM_string_concatenate(tc, pieces[i++], separator);
2034
13.2k
                result = MVM_string_concatenate(tc, result, pieces[i++]);
2035
13.2k
                for (; i < num_pieces;) {
2036
13.2k
                    result = MVM_string_concatenate(tc, result, separator);
2037
13.2k
                    result = MVM_string_concatenate(tc, result, pieces[i++]);
2038
13.2k
                }
2039
13.2k
2040
13.2k
            }
2041
13.2k
            else {
2042
13.2k
                result = MVM_string_concatenate(tc, pieces[0], pieces[1]);
2043
13.2k
                i = 2;
2044
13.2k
                for (; i < num_pieces;) {
2045
13.2k
                    result = MVM_string_concatenate(tc, result, pieces[i++]);
2046
13.2k
                }
2047
13.2k
            }
2048
13.2k
        });
2049
13.2k
        return result;
2050
13.2k
    }
2051
13.2k
    else {
2052
13.2k
        /* We'll produce a single, flat string. */
2053
13.2k
        MVMint64        position = 0;
2054
13.2k
        MVMGraphemeIter gi;
2055
13.2k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
2056
13.2k
        result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32));
2057
13.2k
        for (i = 0; i < num_pieces; i++) {
2058
13.2k
            /* Get piece. */
2059
13.2k
            MVMString *piece = pieces[i];
2060
13.2k
2061
13.2k
            /* Add separator if needed. */
2062
13.2k
            if (0 < i) {
2063
13.2k
                /* No more checks unless still stable */
2064
13.2k
                if (concats_stable)
2065
13.2k
                    join_check_stability(tc, piece, separator, pieces,
2066
13.2k
                        &concats_stable, num_pieces, sgraphs, i);
2067
13.2k
                /* Add separator */
2068
13.2k
                if (sgraphs)
2069
13.2k
                    copy_to_32bit(tc, separator, result, &position, &gi);
2070
13.2k
            }
2071
13.2k
            /* Add piece */
2072
13.2k
            copy_to_32bit(tc, piece, result, &position, &gi);
2073
13.2k
        }
2074
13.2k
    }
2075
13.2k
2076
13.2k
    MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces);
2077
13.2k
    STRAND_CHECK(tc, result);
2078
13.2k
    /* if concat is stable and NFG_CHECK on, run a NFG_CHECK on it since it
2079
13.2k
     * should be properly constructed now */
2080
13.2k
    if (concats_stable)
2081
13.2k
        NFG_CHECK(tc, result, "MVM_string_join");
2082
13.2k
    });
2083
13.2k
    return concats_stable ? result : re_nfg(tc, result);
2084
13.2k
}
2085
2086
/* Returning nonzero means it found the char at the position specified in 'a' in 'Haystack'.
2087
 * For character enumerations in regexes. */
2088
307k
MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *Haystack) {
2089
307k
    MVMuint32     H_graphs;
2090
307k
    MVMGrapheme32 search;
2091
307k
2092
307k
    MVM_string_check_arg(tc, a, "char_at_in_string");
2093
307k
    MVM_string_check_arg(tc, Haystack, "char_at_in_string");
2094
307k
2095
307k
    /* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */
2096
307k
    if (offset < 0 || MVM_string_graphs_nocheck(tc, a) <= offset)
2097
12.8k
        return -2;
2098
307k
2099
294k
    search  = MVM_string_get_grapheme_at_nocheck(tc, a, offset);
2100
294k
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
2101
294k
    switch (Haystack->body.storage_type) {
2102
161k
    case MVM_STRING_GRAPHEME_32:
2103
161k
        return MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, &search, 0, H_graphs, 1);
2104
161k
2105
0
    case MVM_STRING_GRAPHEME_ASCII:
2106
0
        if (can_fit_into_ascii(search)) {
2107
0
            MVMStringIndex i;
2108
0
            for (i = 0; i < H_graphs; i++)
2109
0
                if (Haystack->body.storage.blob_ascii[i] == search)
2110
0
                    return i;
2111
0
        }
2112
0
        break;
2113
132k
    case MVM_STRING_GRAPHEME_8:
2114
132k
        if (can_fit_into_8bit(search)) {
2115
132k
            MVMStringIndex i;
2116
301k
            for (i = 0; i < H_graphs; i++)
2117
206k
                if (Haystack->body.storage.blob_8[i] == search)
2118
37.5k
                    return i;
2119
132k
        }
2120
95.3k
        break;
2121
0
    case MVM_STRING_STRAND: {
2122
0
        MVMGraphemeIter gi;
2123
0
        MVMStringIndex  i;
2124
0
        MVM_string_gi_init(tc, &gi, Haystack);
2125
0
        for (i = 0; i < H_graphs; i++)
2126
0
            if (MVM_string_gi_get_grapheme(tc, &gi) == search)
2127
0
                return i;
2128
0
    }
2129
294k
    }
2130
95.3k
    return -1;
2131
294k
}
2132
2133
8
MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) {
2134
8
    MVMGrapheme32 g;
2135
8
    MVMCodepoint  cp;
2136
8
2137
8
    MVM_string_check_arg(tc, s, "uniprop");
2138
8
2139
8
    if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
2140
0
        return 0;
2141
8
2142
8
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
2143
8
    if (g >= 0)
2144
8
        cp = (MVMCodepoint)g;
2145
8
    else
2146
0
        cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2147
8
    return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code);
2148
8
}
2149
2150
/* If the string is made up of strands, then produces a flattend string
2151
 * representing the exact same graphemes but without strands. Otherwise,
2152
 * returns the input string. Intended for strings that will be indexed
2153
 * into heavily (when evaluating regexes, for example). */
2154
22.8k
MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) {
2155
22.8k
    MVM_string_check_arg(tc, s, "indexingoptimized");
2156
22.8k
    if (s->body.storage_type == MVM_STRING_STRAND)
2157
21.8k
        return collapse_strands(tc, s);
2158
22.8k
    else
2159
1.04k
        return s;
2160
22.8k
}
2161
2162
/* Escapes a string, replacing various chars like \n with \\n. Can no doubt be
2163
 * further optimized. */
2164
561
MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) {
2165
561
    MVMString      *res     = NULL;
2166
561
    MVMStringIndex  spos    = 0;
2167
561
    MVMStringIndex  bpos    = 0;
2168
561
    MVMStringIndex  sgraphs, balloc;
2169
561
    MVMGrapheme32  *buffer  = NULL;
2170
561
    MVMGrapheme32   crlf;
2171
561
    MVMint8         string_can_fit_into_8bit = 1;
2172
561
2173
561
    MVM_string_check_arg(tc, s, "escape");
2174
561
2175
561
    sgraphs = MVM_string_graphs_nocheck(tc, s);
2176
561
    balloc  = sgraphs;
2177
561
    buffer  = MVM_malloc(sizeof(MVMGrapheme32) * balloc);
2178
561
2179
561
    crlf = MVM_nfg_crlf_grapheme(tc);
2180
561
2181
6.75k
    for (; spos < sgraphs; spos++) {
2182
6.19k
        MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
2183
6.19k
        MVMGrapheme32 esc   = 0;
2184
6.19k
        switch (graph) {
2185
11
            case '\\': esc = '\\'; break;
2186
0
            case 7:    esc = 'a';  break;
2187
1
            case '\b': esc = 'b';  break;
2188
4
            case '\n': esc = 'n';  break;
2189
3
            case '\r': esc = 'r';  break;
2190
13
            case '\t': esc = 't';  break;
2191
1
            case '\f': esc = 'f';  break;
2192
2
            case '"':  esc = '"';  break;
2193
1
            case 27:   esc = 'e';  break;
2194
6.19k
        }
2195
6.19k
        if (esc) {
2196
36
            if (bpos + 2 > balloc) {
2197
13
                balloc += 32;
2198
13
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
2199
13
            }
2200
36
            buffer[bpos++] = '\\';
2201
36
            buffer[bpos++] = esc;
2202
36
        }
2203
6.15k
        else if (graph == crlf) {
2204
1
            if (bpos + 4 > balloc) {
2205
1
                balloc += 32;
2206
1
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
2207
1
            }
2208
1
            buffer[bpos++] = '\\';
2209
1
            buffer[bpos++] = 'r';
2210
1
            buffer[bpos++] = '\\';
2211
1
            buffer[bpos++] = 'n';
2212
1
        }
2213
6.15k
        else {
2214
6.15k
            if (bpos + 1 > balloc) {
2215
11
                balloc += 32;
2216
11
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
2217
11
            }
2218
6.15k
            if (!can_fit_into_8bit(graph))
2219
0
                string_can_fit_into_8bit = 0;
2220
6.15k
            buffer[bpos++] = graph;
2221
6.15k
        }
2222
6.19k
    }
2223
561
2224
561
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
2225
561
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
2226
561
    res->body.storage.blob_32 = buffer;
2227
561
    res->body.num_graphs      = bpos;
2228
561
2229
561
    if (string_can_fit_into_8bit)
2230
561
        turn_32bit_into_8bit_unchecked(tc, res);
2231
561
2232
561
    STRAND_CHECK(tc, res);
2233
561
    return res;
2234
561
}
2235
2236
/* Takes a string and reverses its characters. */
2237
1
MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) {
2238
1
    MVMString      *res     = NULL;
2239
1
    MVMStringIndex  spos    = 0;
2240
1
    MVMStringIndex  sgraphs;
2241
1
    MVMStringIndex  rpos;
2242
1
2243
1
    MVM_string_check_arg(tc, s, "flip");
2244
1
    sgraphs = MVM_string_graphs_nocheck(tc, s);
2245
1
    rpos    = sgraphs;
2246
1
2247
1
    switch (s->body.storage_type) {
2248
1
    case MVM_STRING_GRAPHEME_ASCII:
2249
1
    case MVM_STRING_GRAPHEME_8: {
2250
1
        MVMGrapheme8   *rbuffer;
2251
1
        /* Copy the variables, so we can use them just for the loop. This way
2252
1
         * the loop will vectorize since we won't refer to their values except
2253
1
         * in the loop. Use size_t to coerce vectorization.  */
2254
1
        size_t spos_l = spos, rpos_l = rpos;
2255
1
        rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs);
2256
1
        MVM_VECTORIZE_LOOP
2257
4
        while (spos_l < s->body.num_graphs)
2258
3
            rbuffer[--rpos_l] = s->body.storage.blob_8[spos_l++];
2259
1
2260
1
        spos += sgraphs - spos;
2261
1
        rpos -= sgraphs - spos;
2262
1
2263
1
        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
2264
1
        res->body.storage_type    = s->body.storage_type;
2265
1
        res->body.storage.blob_8  = rbuffer;
2266
1
        break;
2267
1
    }
2268
0
    default: {
2269
0
        MVMGrapheme32  *rbuffer;
2270
0
        rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
2271
0
2272
0
        if (s->body.storage_type == MVM_STRING_GRAPHEME_32) {
2273
0
            size_t spos_l = spos, rpos_l = rpos;
2274
0
            MVM_VECTORIZE_LOOP
2275
0
            while (spos_l < s->body.num_graphs)
2276
0
                rbuffer[--rpos_l] = s->body.storage.blob_32[spos_l++];
2277
0
2278
0
            spos += sgraphs - spos;
2279
0
            rpos -= sgraphs - spos;
2280
0
        }
2281
0
        else
2282
0
            for (; spos < sgraphs; spos++)
2283
0
                rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
2284
0
2285
0
        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
2286
0
        res->body.storage_type    = MVM_STRING_GRAPHEME_32;
2287
0
        res->body.storage.blob_32 = rbuffer;
2288
0
    }}
2289
1
2290
1
    res->body.num_graphs      = sgraphs;
2291
1
2292
1
    STRAND_CHECK(tc, res);
2293
1
    return res;
2294
1
}
2295
2296
/* Compares two strings, returning -1, 0 or 1 to indicate less than,
2297
 * equal or greater than. */
2298
46.6k
MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
2299
46.6k
    MVMStringIndex alen, blen, i = 0, scanlen;
2300
46.6k
    MVMGraphemeIter gi_a, gi_b;
2301
46.6k
2302
46.6k
    MVM_string_check_arg(tc, a, "compare");
2303
46.6k
    MVM_string_check_arg(tc, b, "compare");
2304
46.6k
2305
46.6k
    /* Simple cases when one or both are zero length. */
2306
46.6k
    alen = MVM_string_graphs_nocheck(tc, a);
2307
46.6k
    blen = MVM_string_graphs_nocheck(tc, b);
2308
46.6k
    if (alen == 0)
2309
356
        return blen == 0 ? 0 : -1;
2310
46.3k
    if (blen == 0)
2311
10.8k
        return 1;
2312
46.3k
2313
46.3k
    /* Otherwise, need to scan them. */
2314
35.5k
    scanlen = blen < alen ? blen : alen;
2315
35.5k
2316
35.5k
    /* Short circuit a case where the other conditionals won't speed it up */
2317
35.5k
    if (a->body.storage_type == MVM_STRING_STRAND || b->body.storage_type == MVM_STRING_STRAND) {
2318
8.56k
2319
8.56k
    }
2320
26.9k
    else if ((a->body.storage_type == MVM_STRING_GRAPHEME_8 || a->body.storage_type == MVM_STRING_GRAPHEME_ASCII)
2321
22.4k
          && (b->body.storage_type == MVM_STRING_GRAPHEME_8 || b->body.storage_type == MVM_STRING_GRAPHEME_ASCII)) {
2322
22.3k
        MVMGrapheme8  *a_blob8 = a->body.storage.blob_8;
2323
22.3k
        MVMGrapheme8  *b_blob8 = b->body.storage.blob_8;
2324
31.4k
        while (i < scanlen && a_blob8[i] == b_blob8[i]) {
2325
9.18k
            i++;
2326
9.18k
        }
2327
22.3k
    }
2328
4.66k
    else if (a->body.storage_type == MVM_STRING_GRAPHEME_32 && b->body.storage_type == MVM_STRING_GRAPHEME_32) {
2329
4.43k
        MVMGrapheme32  *a_blob32 = a->body.storage.blob_32;
2330
4.43k
        MVMGrapheme32  *b_blob32 = b->body.storage.blob_32;
2331
6.28k
        while (i < scanlen && a_blob32[i] == b_blob32[i]) {
2332
1.85k
            i++;
2333
1.85k
        }
2334
4.43k
    }
2335
231
    else {
2336
231
        MVMGrapheme32 *blob32 = NULL;
2337
231
        MVMGrapheme8  *blob8  = NULL;
2338
231
        switch (a->body.storage_type) {
2339
162
            case MVM_STRING_GRAPHEME_8:
2340
162
            case MVM_STRING_GRAPHEME_ASCII:
2341
162
                blob8 = a->body.storage.blob_8;
2342
162
                break;
2343
69
            case MVM_STRING_GRAPHEME_32:
2344
69
                blob32 = a->body.storage.blob_32;
2345
69
                break;
2346
0
            default:
2347
0
                MVM_exception_throw_adhoc(tc,
2348
0
                    "String corruption in string compare. Unknown string type.");
2349
231
        }
2350
231
        switch (b->body.storage_type) {
2351
69
            case MVM_STRING_GRAPHEME_8:
2352
69
            case MVM_STRING_GRAPHEME_ASCII:
2353
69
                blob8 = b->body.storage.blob_8;
2354
69
                break;
2355
162
            case MVM_STRING_GRAPHEME_32:
2356
162
                blob32 = b->body.storage.blob_32;
2357
162
                break;
2358
0
            default:
2359
0
                MVM_exception_throw_adhoc(tc,
2360
0
                    "String corruption in string compare. Unknown string type.");
2361
231
        }
2362
343
        while (i < scanlen && blob32[i] == blob8[i]) {
2363
112
            i++;
2364
112
        }
2365
231
    }
2366
35.5k
    /* If one of the strings was a strand or we encountered a differing character
2367
35.5k
     * while scanning in the loops above. */
2368
35.5k
    if (i < scanlen) {
2369
31.5k
        MVM_string_gi_init(tc, &gi_a, a);
2370
31.5k
        MVM_string_gi_init(tc, &gi_b, b);
2371
31.5k
        if (i) {
2372
2.15k
            MVM_string_gi_move_to(tc, &gi_a, i);
2373
2.15k
            MVM_string_gi_move_to(tc, &gi_b, i);
2374
2.15k
        }
2375
31.5k
    }
2376
44.2k
    for (; i < scanlen; i++) {
2377
40.0k
        MVMGrapheme32 g_a = MVM_string_gi_get_grapheme(tc, &gi_a);
2378
40.0k
        MVMGrapheme32 g_b = MVM_string_gi_get_grapheme(tc, &gi_b);
2379
40.0k
        if (g_a != g_b) {
2380
31.3k
            MVMint64 rtrn;
2381
31.3k
            /* If one of the deciding graphemes is a synthetic then we need to
2382
31.3k
             * iterate the codepoints inside it */
2383
31.3k
            if (g_a < 0 || g_b < 0) {
2384
0
                MVMCodepointIter ci_a, ci_b;
2385
0
                MVM_string_grapheme_ci_init(tc, &ci_a, g_a, 0);
2386
0
                MVM_string_grapheme_ci_init(tc, &ci_b, g_b, 0);
2387
0
                while (MVM_string_grapheme_ci_has_more(tc, &ci_a) && MVM_string_grapheme_ci_has_more(tc, &ci_b)) {
2388
0
                    g_a = MVM_string_grapheme_ci_get_codepoint(tc, &ci_a);
2389
0
                    g_b = MVM_string_grapheme_ci_get_codepoint(tc, &ci_b);
2390
0
                    if (g_a != g_b)
2391
0
                        break;
2392
0
                }
2393
0
                rtrn = g_a < g_b ? -1 :
2394
0
                       g_b < g_a ?  1 :
2395
0
                                    0 ;
2396
0
                /* If we get here, all the codepoints in the synthetics have matched
2397
0
                 * so go based on which has more codepoints left in that grapheme */
2398
0
                if (!rtrn) {
2399
0
                    MVMint32 a_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_a),
2400
0
                             b_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_b);
2401
0
2402
0
                    return a_has_more < b_has_more ? -1 :
2403
0
                           b_has_more < a_has_more ?  1 :
2404
0
                                                      0 ;
2405
0
                }
2406
0
                return rtrn;
2407
0
            }
2408
31.3k
            return g_a < g_b ? -1 :
2409
24.6k
                   g_b < g_a ?  1 :
2410
0
                                0 ;
2411
31.3k
        }
2412
40.0k
    }
2413
35.5k
2414
35.5k
    /* All shared chars equal, so go on length. */
2415
4.16k
    return alen < blen ? -1 :
2416
4.02k
           blen < alen ?  1 :
2417
3.88k
                          0 ;
2418
35.5k
}
2419
42
#define nfg_ok(cp) ((cp) < MVM_NORMALIZE_FIRST_SIG_NFC)
2420
/* BITOP is the operator that is done to the two strings. GO_FULL_LEN determines
2421
 * if we do the operation for the longest strong. If GO_FULL_LEN == 0 then
2422
 * it only goes to the end of the shortest string (we do this for the string AND
2423
 * op, but the other ones go for the length of the longest string. */
2424
#define MVM_STRING_BITOP(BITOP, GO_FULL_LEN, OP_DESC) \
2425
3
    MVMString      *res    = NULL;\
2426
3
    MVMGrapheme32  *buffer = NULL;\
2427
3
    MVMStringIndex  alen, blen, sgraphs = 0;\
2428
3
    size_t buf_size;\
2429
3
    MVMCodepointIter ci_a, ci_b;\
2430
3
    int nfg_is_safe = 1;\
2431
3
    MVM_string_check_arg(tc, a, (OP_DESC));\
2432
3
    MVM_string_check_arg(tc, b, (OP_DESC));\
2433
3
\
2434
3
    alen = MVM_string_graphs_nocheck(tc, a);\
2435
3
    blen = MVM_string_graphs_nocheck(tc, b);\
2436
3
    buf_size = blen < alen ? alen : blen;\
2437
3
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * buf_size);\
2438
3
    MVM_string_ci_init(tc, &ci_a, a, 0, 0);\
2439
3
    MVM_string_ci_init(tc, &ci_b, b, 0, 0);\
2440
3
\
2441
3
    /* First, binary-or up to the length of the shortest string. */\
2442
24
    while (MVM_string_ci_has_more(tc, &ci_a) && MVM_string_ci_has_more(tc, &ci_b)) {\
2443
21
        const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\
2444
21
        const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\
2445
21
        buffer[sgraphs++] = g_a BITOP g_b;\
2446
21
        if (nfg_is_safe && (!nfg_ok(g_a) || !nfg_ok(g_b)))\
2447
1
            nfg_is_safe = 0;\
2448
21
        if (sgraphs == buf_size) {\
2449
0
            buf_size += 16;\
2450
0
            buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
2451
0
        }\
2452
21
    }\
2453
3
    if (GO_FULL_LEN) {\
2454
4
        while (MVM_string_ci_has_more(tc, &ci_a)) {\
2455
2
            const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\
2456
2
            buffer[sgraphs++] = g_a;\
2457
2
            if (nfg_is_safe && !nfg_ok(g_a))\
2458
0
                nfg_is_safe = 0;\
2459
2
            if (sgraphs == buf_size) {\
2460
2
                buf_size += 16;\
2461
2
                buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
2462
2
            }\
2463
2
        }\
2464
2
        while (MVM_string_ci_has_more(tc, &ci_b)) {\
2465
0
            const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\
2466
0
            buffer[sgraphs++] = g_b;\
2467
0
            if (nfg_is_safe && !nfg_ok(g_b))\
2468
0
                nfg_is_safe = 0;\
2469
0
            if (sgraphs == buf_size) {\
2470
0
                buf_size += 16;\
2471
0
                buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
2472
0
            }\
2473
0
        }\
2474
2
    }\
2475
3
\
2476
3
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);\
2477
3
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;\
2478
3
    res->body.storage.blob_32 = buffer;\
2479
3
    res->body.num_graphs      = sgraphs;\
2480
3
\
2481
2
    res = nfg_is_safe ? res : re_nfg(tc, res);\
2482
3
    STRAND_CHECK(tc, res);\
2483
3
    return res;\
2484
2485
/* Takes two strings and AND's their characters. */
2486
1
MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) {
2487
1
    MVM_STRING_BITOP( & , 0, "bitwise and")
2488
1
}
2489
2490
/* Takes two strings and OR's their characters. */
2491
1
MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
2492
1
    MVM_STRING_BITOP( | , 1, "bitwise or")
2493
1
}
2494
/* Takes two strings and XOR's their characters. */
2495
1
MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
2496
1
    MVM_STRING_BITOP( ^ , 1, "bitwise xor");
2497
0
}
2498
2499
/* Shortcuts for some unicode general category pvalues */
2500
218k
#define UPV_Nd MVM_UNICODE_PVALUE_GC_ND
2501
230
#define UPV_Lu MVM_UNICODE_PVALUE_GC_LU
2502
243
#define UPV_Ll MVM_UNICODE_PVALUE_GC_LL
2503
#define UPV_Lt MVM_UNICODE_PVALUE_GC_LT
2504
#define UPV_Lm MVM_UNICODE_PVALUE_GC_LM
2505
#define UPV_Lo MVM_UNICODE_PVALUE_GC_LO
2506
56
#define UPV_Zs MVM_UNICODE_PVALUE_GC_ZS
2507
#define UPV_Zl MVM_UNICODE_PVALUE_GC_ZL
2508
#define UPV_Pc MVM_UNICODE_PVALUE_GC_PC
2509
#define UPV_Pd MVM_UNICODE_PVALUE_GC_PD
2510
#define UPV_Ps MVM_UNICODE_PVALUE_GC_PS
2511
#define UPV_Pe MVM_UNICODE_PVALUE_GC_PE
2512
#define UPV_Pi MVM_UNICODE_PVALUE_GC_PI
2513
#define UPV_Pf MVM_UNICODE_PVALUE_GC_PF
2514
#define UPV_Po MVM_UNICODE_PVALUE_GC_PO
2515
2516
/* concatenating with "" ensures that only literal strings are accepted as argument. */
2517
#define STR_WITH_LEN(str)  ("" str ""), (sizeof(str) - 1)
2518
2519
#include "strings/unicode_prop_macros.h"
2520
/* Checks if the specified grapheme is in the given character class. */
2521
1.57M
MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) {
2522
1.57M
    /* If it's a synthetic, then grab the base codepoint. */
2523
1.57M
    MVMCodepoint cp;
2524
1.57M
    if (0 <= g)
2525
1.57M
        cp = (MVMCodepoint)g;
2526
1.57M
    else
2527
5
        cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2528
1.57M
2529
1.57M
    switch (cclass) {
2530
30.3k
        case MVM_CCLASS_ANY:
2531
30.3k
            return 1;
2532
30.3k
2533
230
        case MVM_CCLASS_UPPERCASE:
2534
230
            return MVM_unicode_codepoint_has_property_value(tc, cp,
2535
230
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu);
2536
30.3k
2537
243
        case MVM_CCLASS_LOWERCASE:
2538
243
            return MVM_unicode_codepoint_has_property_value(tc, cp,
2539
243
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll);
2540
30.3k
2541
659k
        case MVM_CCLASS_WORD:
2542
659k
            if (cp <= 'z') {  /* short circuit common case */
2543
658k
                if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9'))
2544
447k
                    return 1;
2545
658k
                else
2546
211k
                    return 0;
2547
658k
            }
2548
659k
            /* Deliberate fall-through; word is _ or digit or alphabetic. */
2549
659k
2550
1.07k
        case MVM_CCLASS_ALPHANUMERIC:
2551
1.07k
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
2552
5
                return 1;
2553
1.06k
            if (MVM_unicode_codepoint_has_property_value(tc, cp,
2554
1.06k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd))
2555
1
                return 1;
2556
1.06k
            /* Deliberate fall-through; alphanumeric is digit or alphabetic. */
2557
1.06k
2558
208k
        case MVM_CCLASS_ALPHABETIC:
2559
208k
            if (cp <= 'z') {  /* short circuit common case */
2560
205k
                if (cp >= 'a' || (cp >= 'A' && cp <= 'Z'))
2561
113k
                    return 1;
2562
205k
                else
2563
91.9k
                    return 0;
2564
205k
            }
2565
208k
            /* Property L covers Lo, Ll, Lu, Lt, Lm */
2566
2.93k
            return !!MVM_unicode_codepoint_get_property_int(tc, cp,
2567
2.93k
                MVM_UNICODE_PROPERTY_L);
2568
208k
            /* TODO: Maybe we want MVM_UNICODE_PROPERTY_ALPHABETIC instead? */
2569
208k
2570
345k
        case MVM_CCLASS_NUMERIC:
2571
345k
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
2572
128k
                return 1;
2573
217k
            return MVM_unicode_codepoint_has_property_value(tc, cp,
2574
217k
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd);
2575
345k
2576
185
        case MVM_CCLASS_HEXADECIMAL:
2577
185
            return MVM_unicode_codepoint_has_property_value(tc, cp,
2578
185
                MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1);
2579
345k
2580
145k
        case MVM_CCLASS_WHITESPACE:
2581
145k
            return MVM_CP_is_White_Space(cp);
2582
345k
2583
63
        case MVM_CCLASS_BLANK:
2584
63
            if (cp == '\t')
2585
7
                return 1;
2586
56
            return MVM_unicode_codepoint_has_property_value(tc, cp,
2587
56
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs);
2588
63
2589
60
        case MVM_CCLASS_CONTROL: {
2590
60
            return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160);
2591
63
        }
2592
63
2593
55
        case MVM_CCLASS_PRINTING: {
2594
55
            return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160));
2595
63
        }
2596
63
2597
9.15k
        case MVM_CCLASS_PUNCTUATION:
2598
9.15k
            return !!MVM_unicode_codepoint_get_property_int(tc, cp,
2599
9.15k
                MVM_UNICODE_PROPERTY_P);
2600
63
2601
172k
        case MVM_CCLASS_NEWLINE: {
2602
172k
            if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
2603
165k
                cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
2604
6.92k
                return 1;
2605
172k
        }
2606
172k
2607
165k
        default:
2608
165k
            return 0;
2609
1.57M
    }
2610
1.57M
}
2611
2612
/* Checks if the character at the specified offset is a member of the
2613
 * indicated character class. */
2614
237k
MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) {
2615
237k
    MVM_string_check_arg(tc, s, "is_cclass");
2616
237k
    if (MVM_UNLIKELY(offset < 0 || MVM_string_graphs_nocheck(tc, s) <= offset))
2617
18
        return 0;
2618
237k
    return MVM_string_grapheme_is_cclass(tc, cclass, MVM_string_get_grapheme_at_nocheck(tc, s, offset));
2619
237k
}
2620
2621
/* Searches for the next char that is in the specified character class. */
2622
18.0k
MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
2623
18.0k
    MVMGraphemeIter gi;
2624
18.0k
    MVMint64        length, end, pos;
2625
18.0k
2626
18.0k
    MVM_string_check_arg(tc, s, "find_cclass");
2627
18.0k
2628
18.0k
    length = MVM_string_graphs_nocheck(tc, s);
2629
18.0k
    end    = offset + count;
2630
17.2k
    end = length < end ? length : end;
2631
18.0k
    if (offset < 0 || offset >= length)
2632
149
        return end;
2633
18.0k
2634
17.8k
    MVM_string_gi_init(tc, &gi, s);
2635
17.8k
    MVM_string_gi_move_to(tc, &gi, offset);
2636
17.8k
    switch (cclass) {
2637
609
        case MVM_CCLASS_WHITESPACE:
2638
4.14k
            for (pos = offset; pos < end; pos++) {
2639
3.55k
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2640
3.55k
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2641
3.55k
                if (MVM_CP_is_White_Space(cp))
2642
20
                    return pos;
2643
3.55k
            }
2644
589
            break;
2645
17.1k
        case MVM_CCLASS_NEWLINE:
2646
563k
            for (pos = offset; pos < end; pos++) {
2647
563k
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2648
563k
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2649
563k
                if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
2650
546k
                    cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
2651
17.1k
                    return pos;
2652
563k
            }
2653
53
            break;
2654
120
        default:
2655
369
            for (pos = offset; pos < end; pos++) {
2656
345
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2657
345
                if (MVM_string_grapheme_is_cclass(tc, cclass, g) > 0)
2658
96
                    return pos;
2659
345
            }
2660
17.8k
    }
2661
17.8k
2662
666
    return end;
2663
17.8k
}
2664
2665
/* Searches for the next char that is not in the specified character class. */
2666
24.8k
MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
2667
24.8k
    MVMGraphemeIter gi;
2668
24.8k
    MVMint64        length, end, pos;
2669
24.8k
2670
24.8k
    MVM_string_check_arg(tc, s, "find_not_cclass");
2671
24.8k
2672
24.8k
    length = MVM_string_graphs_nocheck(tc, s);
2673
24.8k
    end    = offset + count;
2674
24.2k
    end = length < end ? length : end;
2675
24.8k
    if (offset < 0 || offset >= length)
2676
589
        return end;
2677
24.8k
2678
24.3k
    MVM_string_gi_init(tc, &gi, s);
2679
24.3k
    MVM_string_gi_move_to(tc, &gi, offset);
2680
24.3k
    switch (cclass) {
2681
663
        case MVM_CCLASS_WHITESPACE:
2682
735
            for (pos = offset; pos < end; pos++) {
2683
730
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2684
730
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2685
730
                if (!MVM_CP_is_White_Space(cp))
2686
658
                    return pos;
2687
730
            }
2688
5
            break;
2689
12
        case MVM_CCLASS_NEWLINE:
2690
13
            for (pos = offset; pos < end; pos++) {
2691
13
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2692
13
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
2693
13
                if (!(cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
2694
12
                    cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp)))
2695
12
                    return pos;
2696
13
            }
2697
0
            break;
2698
23.6k
        default:
2699
124k
            for (pos = offset; pos < end; pos++) {
2700
124k
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
2701
124k
                if (!MVM_string_grapheme_is_cclass(tc, cclass, g))
2702
23.6k
                    return pos;
2703
124k
            }
2704
24.3k
    }
2705
24.3k
2706
26
    return end;
2707
24.3k
}
2708
2709
static MVMint16   encoding_name_init         = 0;
2710
static MVMString *encoding_utf8_name         = NULL;
2711
static MVMString *encoding_ascii_name        = NULL;
2712
static MVMString *encoding_latin1_name       = NULL;
2713
static MVMString *encoding_utf16_name        = NULL;
2714
static MVMString *encoding_windows1252_name  = NULL;
2715
static MVMString *encoding_windows1251_name  = NULL;
2716
static MVMString *encoding_shiftjis_name     = NULL;
2717
static MVMString *encoding_utf8_c8_name      = NULL;
2718
13.7k
MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) {
2719
13.7k
    MVM_string_check_arg(tc, name, "find encoding");
2720
13.7k
    if (!encoding_name_init) {
2721
144
        MVM_gc_allocate_gen2_default_set(tc);
2722
144
        encoding_utf8_name        = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8");
2723
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name");
2724
144
        encoding_ascii_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii");
2725
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name");
2726
144
        encoding_latin1_name      = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1");
2727
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name");
2728
144
        encoding_utf16_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16");
2729
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name");
2730
144
        encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252");
2731
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name");
2732
144
        encoding_windows1251_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1251");
2733
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1251_name, "Encoding name");
2734
144
        encoding_shiftjis_name     = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-932");
2735
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_shiftjis_name, "Encoding name");
2736
144
        encoding_utf8_c8_name     = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8");
2737
144
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name");
2738
144
        encoding_name_init   = 1;
2739
144
        MVM_gc_allocate_gen2_default_clear(tc);
2740
144
    }
2741
13.7k
2742
13.7k
    if (MVM_string_equal(tc, name, encoding_utf8_name)) {
2743
13.6k
        return MVM_encoding_type_utf8;
2744
13.6k
    }
2745
48
    else if (MVM_string_equal(tc, name, encoding_ascii_name)) {
2746
7
        return MVM_encoding_type_ascii;
2747
7
    }
2748
41
    else if (MVM_string_equal(tc, name, encoding_latin1_name)) {
2749
5
        return MVM_encoding_type_latin1;
2750
5
    }
2751
36
    else if (MVM_string_equal(tc, name, encoding_windows1252_name)) {
2752
16
        return MVM_encoding_type_windows1252;
2753
16
    }
2754
20
    else if (MVM_string_equal(tc, name, encoding_windows1251_name)) {
2755
0
        return MVM_encoding_type_windows1251;
2756
0
    }
2757
20
    else if (MVM_string_equal(tc, name, encoding_utf16_name)) {
2758
2
        return MVM_encoding_type_utf16;
2759
2
    }
2760
18
    else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) {
2761
17
        return MVM_encoding_type_utf8_c8;
2762
17
    }
2763
1
    else if (MVM_string_equal(tc, name, encoding_shiftjis_name)) {
2764
0
        return MVM_encoding_type_shiftjis;
2765
0
    }
2766
1
    else {
2767
1
        char *c_name = MVM_string_utf8_encode_C_string(tc, name);
2768
1
        char *waste[] = { c_name, NULL };
2769
1
        MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'",
2770
1
            c_name);
2771
1
    }
2772
13.7k
}
2773
2774
/* Turns a codepoint into a string. If required uses the normalizer to ensure
2775
 * that we get a valid NFG string (NFG is a superset of NFC, and singleton
2776
 * decompositions exist). */
2777
8.53k
MVMString * MVM_string_chr(MVMThreadContext *tc, MVMint64 cp) {
2778
8.53k
    MVMString *s = NULL;
2779
8.53k
    MVMGrapheme32 g;
2780
8.53k
2781
8.53k
    if (cp < 0)
2782
0
        MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative");
2783
8.53k
    /* If the codepoint decomposes we may need to normalize it.
2784
8.53k
     * The first cp that decomposes is U+0340, but to be on the safe side
2785
8.53k
     * for now we go with the first significant character which at the time
2786
8.53k
     * of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */
2787
8.53k
    if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC
2788
220
        && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE)
2789
220
        != MVM_UNICODE_PVALUE_DT_NONE) {
2790
80
2791
80
        MVMNormalizer norm;
2792
80
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
2793
80
        if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) {
2794
80
            MVM_unicode_normalizer_eof(tc, &norm);
2795
80
            g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
2796
80
        }
2797
80
        MVM_unicode_normalizer_cleanup(tc, &norm);
2798
80
    }
2799
8.45k
    else {
2800
8.45k
        g = (MVMGrapheme32) cp;
2801
8.45k
    }
2802
8.53k
2803
8.53k
    s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString));
2804
8.53k
    if (can_fit_into_8bit(g)) {
2805
8.29k
        s->body.storage_type       = MVM_STRING_GRAPHEME_8;
2806
8.29k
        s->body.storage.blob_8     = MVM_malloc(sizeof(MVMGrapheme8));
2807
8.29k
        s->body.storage.blob_8[0]  = g;
2808
243
    } else {
2809
243
        s->body.storage_type       = MVM_STRING_GRAPHEME_32;
2810
243
        s->body.storage.blob_32    = MVM_malloc(sizeof(MVMGrapheme32));
2811
243
        s->body.storage.blob_32[0] = g;
2812
243
    }
2813
8.53k
    s->body.num_graphs         = 1;
2814
8.53k
    return s;
2815
8.53k
}
2816
2817
/* Takes a string and computes a hash code for it, storing it in the hash code
2818
 * cache field of the string. Hashing code is derived from the Jenkins hash
2819
 * implementation in uthash.h. */
2820
typedef union {
2821
    MVMint32 graphs[3];
2822
    unsigned char bytes[12];
2823
} MVMJenHashGraphemeView;
2824
2825
2.61M
MVM_STATIC_INLINE void MVM_hash_add_three (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv) {
2826
2.61M
    *hj_i  += hash_block->graphs[0];
2827
2.61M
    *hj_j  += hash_block->graphs[1];
2828
2.61M
    *hashv += hash_block->graphs[2];
2829
2.61M
    HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
2830
2.61M
}
2831
915k
MVM_STATIC_INLINE void MVM_hash_finish (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv, MVMStringIndex sgraphs, MVMStringIndex graphs_remaining) {
2832
915k
    /* Mix in key length (in bytes, not graphemes). */
2833
915k
    *hashv += sgraphs * sizeof(MVMGrapheme32);
2834
915k
2835
915k
    /* Now handle trailing graphemes (must be 2, 1, or 0). */
2836
915k
    /* NOTE: this is weird since it changes the order in different cases. This
2837
915k
     * is just replicating old functionality. */
2838
915k
    switch (graphs_remaining) {
2839
304k
        case 2:
2840
304k
            *hj_j += hash_block->graphs[0];
2841
304k
            *hj_i += hash_block->graphs[1];
2842
304k
            break;
2843
304k
        /* Fallthrough */
2844
296k
        case 1:
2845
296k
            *hj_i += hash_block->graphs[0];
2846
915k
    }
2847
915k
    HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
2848
915k
    /* Because we check if MVMString->body.cached_hash_code == 0 to tell if
2849
915k
     * we have not yet computed the hash code, ensure that hashv is never 0
2850
915k
     * by adding the length of the string to hashv iff hashv == 0. Since both
2851
915k
     * the hashv and MVMStringIndex are both uint32, there should never be any
2852
915k
     * overflow. Only problematic case is if the string is of length 0 and
2853
915k
     * hashv is zero, though this is very very unlikely (if possible at all)
2854
915k
     * and it should be very fast to calculate the hash so as to be negligible. */
2855
915k
    if (*hashv == 0) {
2856
0
        *hashv += sgraphs;
2857
0
    }
2858
915k
}
2859
915k
void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) {
2860
915k
    /* The hash algorithm works in bytes. Since we can represent strings in a
2861
915k
     * number of ways, and we want consistent hashing, then we'll read the
2862
915k
     * strings using the grapheme iterator in groups of 3, using 32-bit ints
2863
915k
     * for the graphemes no matter what the string really holds them as. Then
2864
915k
     * we'll use the bytes view of that in the hashing function. */
2865
915k
2866
915k
    MVMStringIndex graphs_remaining, sgraphs;
2867
915k
2868
915k
    /* Initialize hash state. */
2869
915k
    MVMhashv hashv = tc->instance->hashSecret;
2870
915k
    MVMuint32 hj_i, hj_j;
2871
915k
    hj_i = hj_j = 0x9e3779b9;
2872
915k
    graphs_remaining = sgraphs = MVM_string_graphs(tc, s);
2873
915k
2874
915k
    switch (s->body.storage_type) {
2875
673k
        case MVM_STRING_GRAPHEME_ASCII:
2876
673k
        case MVM_STRING_GRAPHEME_8: {
2877
673k
            int i;
2878
673k
            MVMJenHashGraphemeView hash_block;
2879
2.84M
            for (i = 0; 3 <= sgraphs - i; i += 3) {
2880
2.16M
                hash_block.graphs[0] = s->body.storage.blob_8[i];
2881
2.16M
                hash_block.graphs[1] = s->body.storage.blob_8[i+1];
2882
2.16M
                hash_block.graphs[2] = s->body.storage.blob_8[i+2];
2883
2.16M
                MVM_hash_add_three(
2884
2.16M
                    &hash_block,
2885
2.16M
                    &hj_i, &hj_j, &hashv);
2886
2.16M
            }
2887
673k
            graphs_remaining = sgraphs - i;
2888
673k
            switch (graphs_remaining) {
2889
217k
                case 1:
2890
217k
                    hash_block.graphs[0] = s->body.storage.blob_8[i];
2891
217k
                    break;
2892
222k
                case 2:
2893
222k
                    hash_block.graphs[0] = s->body.storage.blob_8[i];
2894
222k
                    hash_block.graphs[1] = s->body.storage.blob_8[i+1];
2895
222k
                    break;
2896
673k
            }
2897
673k
            MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
2898
673k
            break;
2899
673k
        }
2900
94.3k
        case MVM_STRING_GRAPHEME_32: {
2901
94.3k
            int i;
2902
218k
            for (i = 0; 3 <= sgraphs - i; i += 3) {
2903
123k
                MVM_hash_add_three(
2904
123k
                    (MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i),
2905
123k
                    &hj_i, &hj_j, &hashv);
2906
123k
            }
2907
94.3k
            graphs_remaining = sgraphs - i;
2908
94.3k
            MVM_hash_finish((MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i), &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
2909
94.3k
            break;
2910
673k
        }
2911
147k
        default: {
2912
147k
            MVMGraphemeIter gi;
2913
147k
            MVMJenHashGraphemeView hash_block;
2914
147k
            /* Work through the string 3 graphemes at a time. */
2915
147k
            MVM_string_gi_init(tc, &gi, s);
2916
471k
            while (3 <= graphs_remaining) {
2917
323k
                hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
2918
323k
                hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
2919
323k
                hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi);
2920
323k
                MVM_hash_add_three(
2921
323k
                    &hash_block,
2922
323k
                    &hj_i, &hj_j, &hashv);
2923
323k
                graphs_remaining -= 3;
2924
323k
            }
2925
147k
            /* Now handle trailing graphemes (must be 2, 1, or 0). */
2926
147k
            switch (graphs_remaining) {
2927
61.4k
                case 1:
2928
61.4k
                    hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
2929
61.4k
                    break;
2930
50.1k
                case 2:
2931
50.1k
                    hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
2932
50.1k
                    hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
2933
50.1k
                    break;
2934
147k
2935
147k
            }
2936
147k
            MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
2937
147k
        }
2938
915k
    }
2939
915k
    /* Store computed hash value. */
2940
915k
    s->body.cached_hash_code = hashv;
2941
915k
}