Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/ops.c
Line
Count
Source (jump to first uncovered line)
1
#include "platform/memmem.h"
2
#include "moar.h"
3
#define MVM_DEBUG_STRANDS 0
4
5
#if MVM_DEBUG_STRANDS
6
static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) {
7
    MVMGraphemeIter gi;
8
    MVMuint32       len;
9
    MVM_string_gi_init(tc, &gi, s);
10
    len = 0;
11
    while (MVM_string_gi_has_more(tc, &gi)) {
12
        MVM_string_gi_get_grapheme(tc, &gi);
13
        len++;
14
    }
15
    if (len != MVM_string_graphs(tc, s))
16
        MVM_exception_throw_adhoc(tc,
17
            "Strand sanity check failed (stand length %d != num_graphs %d)",
18
            len, MVM_string_graphs(tc, s));
19
}
20
#define STRAND_CHECK(tc, s) check_strand_sanity(tc, s);
21
#else
22
#define STRAND_CHECK(tc, s)
23
#endif
24
25
/* Allocates strand storage. */
26
693k
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) {
27
693k
    return MVM_malloc(num_strands * sizeof(MVMStringStrand));
28
693k
}
29
30
/* Copies strands from one strand string to another. */
31
static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
32
148k
        MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
33
148k
    assert(from->body.storage_type == MVM_STRING_STRAND);
34
148k
    assert(to->body.storage_type == MVM_STRING_STRAND);
35
148k
    memcpy(
36
148k
        to->body.storage.strands + to_offset,
37
148k
        from->body.storage.strands + from_offset,
38
148k
        num_strands * sizeof(MVMStringStrand));
39
148k
}
40
41
/* If a string is currently using 32bit storage, turn it into using
42
 * 8 bit storage. Doesn't do any checks at all. */
43
24.2k
static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) {
44
24.2k
    MVMGrapheme32 *old_buf = str->body.storage.blob_32;
45
24.2k
    MVMStringIndex i;
46
24.2k
    str->body.storage_type = MVM_STRING_GRAPHEME_8;
47
24.2k
    str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8));
48
24.2k
49
405k
    for (i = 0; i < str->body.num_graphs; i++) {
50
381k
        str->body.storage.blob_8[i] = old_buf[i];
51
381k
    }
52
24.2k
53
24.2k
    MVM_free(old_buf);
54
24.2k
}
55
56
/* Collapses a bunch of strands into a single blob string. */
57
6
static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) {
58
6
    MVMString       *result;
59
6
    MVMStringIndex   i;
60
6
    MVMuint32        ographs;
61
6
    MVMGraphemeIter  gi;
62
6
63
6
    MVMROOT(tc, orig, {
64
6
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
65
6
    });
66
6
    ographs                      = MVM_string_graphs(tc, orig);
67
6
    result->body.num_graphs      = ographs;
68
6
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
69
6
    result->body.storage.blob_32 = MVM_malloc(ographs * sizeof(MVMGrapheme32));
70
6
71
6
    MVM_string_gi_init(tc, &gi, orig);
72
493
    for (i = 0; i < ographs; i++) {
73
487
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
74
487
        result->body.storage.blob_32[i] = g;
75
487
        if (g < -127 || g > 127) {
76
0
            /* If we know we can't fit into 8 bits, enter a tighter loop for maximum speed */
77
0
            for (i++; i < ographs; i++) {
78
0
                result->body.storage.blob_32[i] = MVM_string_gi_get_grapheme(tc, &gi);
79
0
            }
80
0
            return result;
81
0
        }
82
487
    }
83
6
    /* If we get here, we didn't see any cp's lower than -127 or higher than 127
84
6
     * so turn it into an 8 bit string */
85
6
    turn_32bit_into_8bit_unchecked(tc, result);
86
6
    return result;
87
6
}
88
89
/* Takes a string that is no longer in NFG form after some concatenation-style
90
 * operation, and returns a new string that is in NFG. Note that we could do a
91
 * much, much, smarter thing in the future that doesn't involve all of this
92
 * copying and allocation and re-doing the whole string, but cases like this
93
 * should be fairly rare anyway, so go with simplicity for now. */
94
40
static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
95
40
    MVMNormalizer norm;
96
40
    MVMCodepointIter ci;
97
40
    MVMint32 ready;
98
40
    MVMString *out;
99
40
    MVMuint32 bufsize = in->body.num_graphs;
100
40
101
40
    /* Create the output buffer. We used to believe it can't ever be bigger
102
40
     * than the initial estimate, but utf8-c8 showed us otherwise. */
103
40
    MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
104
40
    MVMint64 out_pos = 0;
105
40
106
40
    /* Iterate codepoints and normalizer. */
107
40
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
108
40
    MVM_string_ci_init(tc, &ci, in, 0);
109
274
    while (MVM_string_ci_has_more(tc, &ci)) {
110
234
        MVMGrapheme32 g;
111
234
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
112
234
        if (ready) {
113
153
            if (out_pos + ready > bufsize) {
114
0
                /* Doubling up the buffer size seems excessive, so just
115
0
                 * add a generous amount of storage */
116
0
                bufsize += ready + 32;
117
0
                out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
118
0
            }
119
153
            out_buffer[out_pos++] = g;
120
189
            while (--ready > 0) {
121
36
                out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
122
36
            }
123
153
        }
124
234
    }
125
40
    MVM_unicode_normalizer_eof(tc, &norm);
126
40
    ready = MVM_unicode_normalizer_available(tc, &norm);
127
40
    if (out_pos + ready > bufsize) {
128
0
        bufsize += ready + 1;
129
0
        out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
130
0
    }
131
50
    while (ready--) {
132
10
        out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
133
10
    }
134
40
    MVM_unicode_normalizer_cleanup(tc, &norm);
135
40
136
40
    /* Build result string. */
137
40
    out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
138
40
    out->body.storage.blob_32 = out_buffer;
139
40
    out->body.storage_type    = MVM_STRING_GRAPHEME_32;
140
40
    out->body.num_graphs      = out_pos;
141
40
    return out;
142
40
}
143
144
/* Returns nonzero if two substrings are equal, doesn't check bounds */
145
MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a,
146
15.9M
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
147
15.9M
    MVMGraphemeIter gia;
148
15.9M
    MVMGraphemeIter gib;
149
15.9M
    MVMint64 i;
150
15.9M
151
15.9M
    /* Fast paths when storage types are identical. */
152
15.9M
    switch (a->body.storage_type) {
153
395k
    case MVM_STRING_GRAPHEME_32:
154
395k
        if (b->body.storage_type == MVM_STRING_GRAPHEME_32)
155
142k
            return 0 == memcmp(
156
142k
                a->body.storage.blob_32 + starta,
157
142k
                b->body.storage.blob_32 + startb,
158
142k
                length * sizeof(MVMGrapheme32));
159
253k
        break;
160
12.3M
    case MVM_STRING_GRAPHEME_ASCII:
161
12.3M
    case MVM_STRING_GRAPHEME_8:
162
12.3M
        if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII ||
163
12.3M
                b->body.storage_type == MVM_STRING_GRAPHEME_8)
164
11.9M
            return 0 == memcmp(
165
11.9M
                a->body.storage.blob_8 + starta,
166
11.9M
                b->body.storage.blob_8 + startb,
167
11.9M
                length);
168
380k
        break;
169
15.9M
    }
170
15.9M
171
15.9M
    /* Normal path, for the rest of the time. */
172
3.86M
    MVM_string_gi_init(tc, &gia, a);
173
3.86M
    MVM_string_gi_init(tc, &gib, b);
174
3.86M
    MVM_string_gi_move_to(tc, &gia, starta);
175
3.86M
    MVM_string_gi_move_to(tc, &gib, startb);
176
6.14M
    for (i = 0; i < length; i++)
177
5.62M
        if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib))
178
3.34M
            return 0;
179
518k
    return 1;
180
3.86M
}
181
182
/* Returns the codepoint without doing checks, for internal VM use only. */
183
10.4M
MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
184
10.4M
    switch (a->body.storage_type) {
185
8.73M
    case MVM_STRING_GRAPHEME_32:
186
8.73M
        return a->body.storage.blob_32[index];
187
0
    case MVM_STRING_GRAPHEME_ASCII:
188
0
        return a->body.storage.blob_ascii[index];
189
1.49M
    case MVM_STRING_GRAPHEME_8:
190
1.49M
        return a->body.storage.blob_8[index];
191
234k
    case MVM_STRING_STRAND: {
192
234k
        MVMGraphemeIter gi;
193
234k
        MVM_string_gi_init(tc, &gi, a);
194
234k
        MVM_string_gi_move_to(tc, &gi, index);
195
234k
        return MVM_string_gi_get_grapheme(tc, &gi);
196
8.73M
    }
197
0
    default:
198
0
        MVM_exception_throw_adhoc(tc, "String corruption detected: bad storage type");
199
10.4M
    }
200
10.4M
}
201
202
/* Returns the location of one string in another or -1  */
203
405k
MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) {
204
405k
    size_t index           = (size_t)start;
205
405k
    MVMStringIndex hgraphs = MVM_string_graphs(tc, haystack), ngraphs = MVM_string_graphs(tc, needle);
206
405k
    MVM_string_check_arg(tc, haystack, "index search target");
207
405k
    MVM_string_check_arg(tc, needle, "index search term");
208
405k
209
405k
    if (!ngraphs)
210
48
        return start <= hgraphs ? start : -1; /* the empty string is in any other string */
211
405k
212
405k
    if (!hgraphs)
213
41
        return -1;
214
405k
215
405k
    if (start < 0 || start >= hgraphs)
216
148
        return -1;
217
405k
218
405k
    if (ngraphs > hgraphs || ngraphs < 1)
219
3
        return -1;
220
405k
221
405k
    /* Fast paths when storage types are identical. Uses memmem function, which
222
405k
     * uses Knuth-Morris-Pratt algorithm on Linux and on others
223
405k
     * Crochemore+Perrin two-way string matching */
224
405k
    switch (haystack->body.storage_type) {
225
18.1k
        case MVM_STRING_GRAPHEME_32:
226
18.1k
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_32) {
227
15.3k
                void *start_ptr = haystack->body.storage.blob_32 + start;
228
15.3k
                void *mm_return_32;
229
15.3k
                void *end_ptr = (char*)start_ptr + sizeof(MVMGrapheme32) * (hgraphs - start);
230
15.3k
                do {
231
15.3k
                    /* Keep as void* to not lose precision */
232
15.3k
                    mm_return_32 = MVM_memmem(
233
15.3k
                        start_ptr, /* start position */
234
15.3k
                        (char*)end_ptr - (char*)start_ptr, /* length of haystack from start position to end */
235
15.3k
                        needle->body.storage.blob_32, /* needle start */
236
15.3k
                        ngraphs * sizeof(MVMGrapheme32) /* needle length */
237
15.3k
                    );
238
15.3k
                    if (mm_return_32 == NULL)
239
25
                        return -1;
240
15.3k
                } /* If we aren't on a 32 bit boundary then continue from where we left off (unlikely but possible) */
241
15.3k
                while ( ( (char*)mm_return_32 - (char*)haystack->body.storage.blob_32) % sizeof(MVMGrapheme32)
242
0
                    && ( start_ptr = mm_return_32 ) /* Set the new start pointer at where we left off */
243
0
                    && ( end_ptr > start_ptr ) /* Check we aren't past the end of the string just in case */
244
15.3k
                );
245
15.3k
246
15.3k
                return (MVMGrapheme32*)mm_return_32 - haystack->body.storage.blob_32;
247
15.3k
            }
248
2.80k
            break;
249
354k
        case MVM_STRING_GRAPHEME_8:
250
354k
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_8) {
251
347k
                void *mm_return_8 = MVM_memmem(
252
347k
                    haystack->body.storage.blob_8 + start, /* start position */
253
347k
                    (hgraphs - start) * sizeof(MVMGrapheme8), /* length of haystack from start position to end */
254
347k
                    needle->body.storage.blob_8, /* needle start */
255
347k
                    ngraphs * sizeof(MVMGrapheme8) /* needle length */
256
347k
                );
257
347k
                if (mm_return_8 == NULL)
258
296k
                    return -1;
259
347k
                else
260
51.0k
                    return (MVMGrapheme8*)mm_return_8 -  haystack->body.storage.blob_8;
261
347k
            }
262
6.90k
            break;
263
405k
    }
264
405k
265
405k
    /* brute force for now. horrible, yes. halp. */
266
2.98M
    while (index <= hgraphs - ngraphs) {
267
2.96M
        if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
268
19.3k
            return (MVMint64)index;
269
19.3k
        }
270
2.94M
        index++;
271
2.94M
    }
272
23.1k
    return -1;
273
42.5k
}
274
275
/* Returns the location of one string in another or -1  */
276
12
MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) {
277
12
    MVMint64 result = -1;
278
12
    size_t index;
279
12
    MVMStringIndex hgraphs, ngraphs;
280
12
281
12
    MVM_string_check_arg(tc, haystack, "rindex search target");
282
12
    MVM_string_check_arg(tc, needle, "rindex search term");
283
12
    hgraphs = MVM_string_graphs_nocheck(tc, haystack);
284
12
    ngraphs = MVM_string_graphs_nocheck(tc, needle);
285
12
286
12
    if (!ngraphs) {
287
4
        if (start >= 0)
288
3
            return start <= hgraphs ? start : -1; /* the empty string is in any other string */
289
4
        else
290
1
            return hgraphs; /* no start, so return end */
291
4
    }
292
12
293
8
    if (!hgraphs)
294
0
        return -1;
295
8
296
8
    if (ngraphs > hgraphs || ngraphs < 1)
297
0
        return -1;
298
8
299
8
    if (start == -1)
300
2
        start = hgraphs - ngraphs;
301
8
302
8
    if (start < 0 || start >= hgraphs)
303
8
        /* maybe return -1 instead? */
304
0
        MVM_exception_throw_adhoc(tc, "index start offset out of range");
305
8
306
8
    index = start;
307
8
308
8
    if (index + ngraphs > hgraphs) {
309
1
        index = hgraphs - ngraphs;
310
1
    }
311
8
312
8
    /* brute force for now. horrible, yes. halp. */
313
19
    do {
314
19
        if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) {
315
6
            result = (MVMint64)index;
316
6
            break;
317
6
        }
318
13
    } while (index-- > 0);
319
8
    return result;
320
8
}
321
322
/* Returns a substring of the given string */
323
563k
MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) {
324
563k
    MVMString *result;
325
563k
    MVMint64   start_pos, end_pos;
326
563k
327
563k
    MVMint64 agraphs;
328
563k
329
563k
    MVM_string_check_arg(tc, a, "substring");
330
563k
    /* convert to signed to avoid implicit arithmetic conversions */
331
563k
    agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a);
332
563k
333
563k
    /* -1 signifies go to the end of the string; anything less is a bug */
334
563k
    if (length < -1)
335
0
        MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length);
336
563k
337
563k
    /* negative offsets count from the end */
338
563k
    start_pos = offset < 0 ? offset + agraphs : offset;
339
530k
    end_pos   = length == -1 ? agraphs : start_pos + length;
340
563k
341
563k
    /* return an empty string if start_pos is out of bounds but positive */
342
563k
    if (start_pos > agraphs) {
343
0
        start_pos = 0;
344
0
        end_pos   = 0;
345
0
    }
346
563k
347
563k
    if (end_pos < 0)
348
0
        MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos);
349
563k
350
563k
    /* Ensure we're within bounds. */
351
563k
    if (start_pos < 0)
352
0
        start_pos = 0;
353
563k
    if (end_pos > agraphs)
354
7.02k
        end_pos = agraphs;
355
563k
356
563k
    /* Check trivial cases: empty string and whole string. */
357
563k
    if (start_pos == end_pos)
358
9.98k
        return tc->instance->str_consts.empty;
359
553k
    if (start_pos == 0 && end_pos == agraphs)
360
22.9k
        return a;
361
553k
362
553k
    /* Construct a result; how we efficiently do so will vary based on the
363
553k
     * input string. */
364
530k
    MVMROOT(tc, a, {
365
530k
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
366
530k
        result->body.num_graphs = end_pos - start_pos;
367
530k
        if (a->body.storage_type != MVM_STRING_STRAND) {
368
530k
            /* It's some kind of buffer. Construct a strand view into it. */
369
530k
            result->body.storage_type    = MVM_STRING_STRAND;
370
530k
            result->body.storage.strands = allocate_strands(tc, 1);
371
530k
            result->body.num_strands     = 1;
372
530k
            result->body.storage.strands[0].blob_string = a;
373
530k
            result->body.storage.strands[0].start       = start_pos;
374
530k
            result->body.storage.strands[0].end         = end_pos;
375
530k
            result->body.storage.strands[0].repetitions = 0;
376
530k
        }
377
530k
        else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
378
530k
            /* Single strand string; quite possibly already a substring. We'll
379
530k
             * just produce an updated view. */
380
530k
            MVMStringStrand *orig_strand = &(a->body.storage.strands[0]);
381
530k
            result->body.storage_type    = MVM_STRING_STRAND;
382
530k
            result->body.storage.strands = allocate_strands(tc, 1);
383
530k
            result->body.num_strands     = 1;
384
530k
            result->body.storage.strands[0].blob_string = orig_strand->blob_string;
385
530k
            result->body.storage.strands[0].start       = orig_strand->start + start_pos;
386
530k
            result->body.storage.strands[0].end         = orig_strand->start + end_pos;
387
530k
            result->body.storage.strands[0].repetitions = 0;
388
530k
        }
389
530k
        else {
390
530k
            /* Produce a new blob string, collapsing the strands. */
391
530k
            MVMGraphemeIter gi;
392
530k
            MVMint32 i;
393
530k
            MVMint8  can_fit_into_8 = 1;
394
530k
            result->body.storage_type    = MVM_STRING_GRAPHEME_32;
395
530k
            result->body.storage.blob_32 = MVM_malloc(result->body.num_graphs * sizeof(MVMGrapheme32));
396
530k
            MVM_string_gi_init(tc, &gi, a);
397
530k
            MVM_string_gi_move_to(tc, &gi, start_pos);
398
530k
            for (i = 0; i < result->body.num_graphs; i++) {
399
530k
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
400
530k
                if (g < -128 || g >= 128)
401
530k
                    can_fit_into_8 = 0;
402
530k
                result->body.storage.blob_32[i] = g;
403
530k
            }
404
530k
            if (can_fit_into_8)
405
530k
                turn_32bit_into_8bit_unchecked(tc, result);
406
530k
        }
407
530k
    });
408
530k
409
530k
    STRAND_CHECK(tc, result);
410
530k
    return result;
411
553k
}
412
413
1
MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) {
414
1
    /* XXX this could probably be done more efficiently directly. */
415
1
    MVMString *first_part;
416
1
    MVMString *rest_part;
417
1
    MVMString *result;
418
1
419
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement);
420
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&original);
421
1
    first_part = MVM_string_substring(tc, original, 0, start);
422
1
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part);
423
1
424
1
    rest_part  = MVM_string_substring(tc, original, start + count, -1);
425
1
    rest_part  = MVM_string_concatenate(tc, replacement, rest_part);
426
1
    result     = MVM_string_concatenate(tc, first_part, rest_part);
427
1
428
1
    MVM_gc_root_temp_pop_n(tc, 3);
429
1
430
1
    STRAND_CHECK(tc, result);
431
1
    return result;
432
1
}
433
434
/* Append one string to another. */
435
168k
static MVMint32 final_strand_matches(MVMThreadContext *tc, MVMString *a, MVMString *b) {
436
168k
    if (a->body.storage_type == MVM_STRING_STRAND) {
437
88.2k
        MVMStringStrand *ss = &(a->body.storage.strands[a->body.num_strands - 1]);
438
88.2k
        if (ss->end - ss->start == MVM_string_graphs(tc, b))
439
26.5k
            if (MVM_string_equal_at(tc, ss->blob_string, b, ss->start))
440
143
                return 1;
441
88.2k
    }
442
168k
    return 0;
443
168k
}
444
177k
MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) {
445
177k
    MVMString *result;
446
177k
    MVMuint32  agraphs, bgraphs;
447
177k
    MVMuint64  total_graphs;
448
177k
449
177k
    MVM_string_check_arg(tc, a, "concatenate");
450
177k
    MVM_string_check_arg(tc, b, "concatenate");
451
177k
452
177k
    /* Simple empty-string cases. */
453
177k
    agraphs = MVM_string_graphs_nocheck(tc, a);
454
177k
    if (agraphs == 0)
455
8.91k
        return b;
456
168k
    bgraphs = MVM_string_graphs(tc, b);
457
168k
    if (bgraphs == 0)
458
159
        return a;
459
168k
460
168k
    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
461
168k
    total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs;
462
168k
    if (total_graphs > 0xFFFFFFFF)
463
0
        MVM_exception_throw_adhoc(tc,
464
0
            "Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %u",
465
0
             total_graphs, 0xFFFFFFFF);
466
168k
467
168k
    /* Otherwise, we'll assemble a result string. */
468
168k
    MVMROOT(tc, a, {
469
168k
    MVMROOT(tc, b, {
470
168k
        /* Allocate it. */
471
168k
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
472
168k
473
168k
        /* Total graphemes is trivial; just total up inputs. */
474
168k
        result->body.num_graphs = (MVMuint32)total_graphs;
475
168k
476
168k
        /* Result string will be made of strands. */
477
168k
        result->body.storage_type = MVM_STRING_STRAND;
478
168k
479
168k
        /* Detect the wonderful case where we're repeatedly concating the same
480
168k
         * string again and again, and thus can just bump a repetition. */
481
168k
        if (final_strand_matches(tc, a, b)) {
482
168k
            /* We have it; just copy the strands to a new string and bump the
483
168k
             * repetitions count of the last one. */
484
168k
            result->body.storage.strands = allocate_strands(tc, a->body.num_strands);
485
168k
            copy_strands(tc, a, 0, result, 0, a->body.num_strands);
486
168k
            result->body.storage.strands[a->body.num_strands - 1].repetitions++;
487
168k
            result->body.num_strands = a->body.num_strands;
488
168k
        }
489
168k
490
168k
        /* Otherwise, construct a new strand string. */
491
168k
        else {
492
168k
            /* See if we have too many strands between the two. If so, we will
493
168k
             * collapse the biggest side. */
494
168k
            MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND
495
168k
                ? a->body.num_strands
496
168k
                : 1;
497
168k
            MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND
498
168k
                ? b->body.num_strands
499
168k
                : 1;
500
168k
            MVMString *effective_a = a;
501
168k
            MVMString *effective_b = b;
502
168k
            if (strands_a + strands_b > MVM_STRING_MAX_STRANDS) {
503
168k
                MVMROOT(tc, result, {
504
168k
                    if (strands_a >= strands_b) {
505
168k
                        effective_a = collapse_strands(tc, effective_a);
506
168k
                        strands_a   = 1;
507
168k
                    }
508
168k
                    else {
509
168k
                        effective_b = collapse_strands(tc, effective_b);
510
168k
                        strands_b   = 1;
511
168k
                    }
512
168k
                });
513
168k
            }
514
168k
515
168k
            /* Assemble the result. */
516
168k
            result->body.num_strands = strands_a + strands_b;
517
168k
            result->body.storage.strands = allocate_strands(tc, strands_a + strands_b);
518
168k
            if (effective_a->body.storage_type == MVM_STRING_STRAND) {
519
168k
                copy_strands(tc, effective_a, 0, result, 0, strands_a);
520
168k
            }
521
168k
            else {
522
168k
                MVMStringStrand *ss = &(result->body.storage.strands[0]);
523
168k
                ss->blob_string = effective_a;
524
168k
                ss->start       = 0;
525
168k
                ss->end         = effective_a->body.num_graphs;
526
168k
                ss->repetitions = 0;
527
168k
            }
528
168k
            if (effective_b->body.storage_type == MVM_STRING_STRAND) {
529
168k
                copy_strands(tc, effective_b, 0, result, strands_a, strands_b);
530
168k
            }
531
168k
            else {
532
168k
                MVMStringStrand *ss = &(result->body.storage.strands[strands_a]);
533
168k
                ss->blob_string = effective_b;
534
168k
                ss->start       = 0;
535
168k
                ss->end         = effective_b->body.num_graphs;
536
168k
                ss->repetitions = 0;
537
168k
            }
538
168k
        }
539
168k
    });
540
168k
    });
541
168k
542
168k
    STRAND_CHECK(tc, result);
543
168k
    return MVM_nfg_is_concat_stable(tc, a, b) ? result : re_nfg(tc, result);
544
168k
}
545
546
304
MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) {
547
304
    MVMString *result;
548
304
    MVMuint32  agraphs;
549
304
    MVMuint64  total_graphs;
550
304
551
304
    MVM_string_check_arg(tc, a, "repeat");
552
304
553
304
    /* Validate count; handle common cases. */
554
304
    if (count == 0)
555
7
        return tc->instance->str_consts.empty;
556
297
    if (count == 1)
557
15
        return a;
558
282
    if (count < 0)
559
0
        MVM_exception_throw_adhoc(tc, "repeat count (%"PRId64") cannot be negative", count);
560
282
    if (count > (1 << 30))
561
0
        MVM_exception_throw_adhoc(tc, "repeat count > %d arbitrarily unsupported...", (1 << 30));
562
282
563
282
    /* If input string is empty, repeating it is empty. */
564
282
    agraphs = MVM_string_graphs_nocheck(tc, a);
565
282
    if (agraphs == 0)
566
0
        return tc->instance->str_consts.empty;
567
282
568
282
    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
569
282
    total_graphs = (MVMuint64)agraphs * (MVMuint64)count;
570
282
    if (total_graphs > 0xFFFFFFFF)
571
0
        MVM_exception_throw_adhoc(tc,
572
0
            "Can't repeat string, required number of graphemes %"PRIu64" > max allowed of %u",
573
0
             total_graphs, 0xFFFFFFFF);
574
282
575
282
    /* Now build a result string with the repetition set. */
576
282
    MVMROOT(tc, a, {
577
282
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
578
282
        result->body.num_graphs      = agraphs * count;
579
282
        result->body.storage_type    = MVM_STRING_STRAND;
580
282
        result->body.storage.strands = allocate_strands(tc, 1);
581
282
        if (a->body.storage_type == MVM_STRING_STRAND) {
582
282
            if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
583
282
                copy_strands(tc, a, 0, result, 0, 1);
584
282
            }
585
282
            else {
586
282
                MVMROOT(tc, result, {
587
282
                    a = collapse_strands(tc, a);
588
282
                });
589
282
                result->body.storage.strands[0].blob_string = a;
590
282
                result->body.storage.strands[0].start       = 0;
591
282
                result->body.storage.strands[0].end         = agraphs;
592
282
            }
593
282
        }
594
282
        else {
595
282
            result->body.storage.strands[0].blob_string = a;
596
282
            result->body.storage.strands[0].start       = 0;
597
282
            result->body.storage.strands[0].end         = agraphs;
598
282
        }
599
282
        result->body.storage.strands[0].repetitions = count - 1;
600
282
        result->body.num_strands = 1;
601
282
    });
602
282
603
282
    STRAND_CHECK(tc, result);
604
282
    return result;
605
282
}
606
607
11.3k
void MVM_string_say(MVMThreadContext *tc, MVMString *a) {
608
11.3k
    MVM_string_check_arg(tc, a, "say");
609
11.3k
    MVM_io_write_string(tc, tc->instance->stdout_handle, a, 1);
610
11.3k
}
611
612
47
void MVM_string_print(MVMThreadContext *tc, MVMString *a) {
613
47
    MVM_string_check_arg(tc, a, "print");
614
47
    MVM_io_write_string(tc, tc->instance->stdout_handle, a, 0);
615
47
}
616
617
/* Tests whether one string a has the other string b as a substring at that index */
618
12.9M
MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) {
619
12.9M
620
12.9M
    MVMStringIndex agraphs, bgraphs;
621
12.9M
622
12.9M
    MVM_string_check_arg(tc, a, "equal_at");
623
12.9M
    MVM_string_check_arg(tc, b, "equal_at");
624
12.9M
625
12.9M
    agraphs = MVM_string_graphs_nocheck(tc, a);
626
12.9M
    bgraphs = MVM_string_graphs_nocheck(tc, b);
627
12.9M
628
12.9M
    if (offset < 0) {
629
2
        offset += agraphs;
630
2
        if (offset < 0)
631
1
            offset = 0; /* XXX I think this is the right behavior here */
632
2
    }
633
12.9M
    if (agraphs - offset < bgraphs)
634
1.37k
        return 0;
635
12.9M
    return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0);
636
12.9M
}
637
1.17k
MVM_STATIC_INLINE MVMint32 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, MVMString *haystack, MVMString *needle_fc, MVMint64 h_start, MVMint64 h_graphs, MVMint64 n_graphs) {
638
1.17k
    MVMuint32 h_fc_cps;
639
1.17k
    /* An additional needle offset which is used only when codepoints expand
640
1.17k
     * when casefolded. The offset is the number of additional codepoints that
641
1.17k
     * have been seen so haystack and needle stay aligned */
642
1.17k
    MVMint64 n_offset = 0;
643
1.17k
    MVMint64 i, j;
644
1.17k
    MVMGrapheme32 h_g, n_g;
645
1.62k
    for (i = 0; i + h_start < h_graphs && i + n_offset < n_graphs; i++) {
646
1.33k
        const MVMCodepoint* h_result_cps;
647
1.33k
        h_g = MVM_string_get_grapheme_at_nocheck(tc, haystack, h_start + i);
648
1.33k
        if (h_g >= 0 ) {
649
1.33k
            /* For codeponits we can get the case change directly */
650
1.33k
            h_fc_cps = MVM_unicode_get_case_change(tc, h_g, MVM_unicode_case_change_type_fold, &h_result_cps);
651
1.33k
        }
652
0
        else {
653
0
            /* Synthetics must use this function */
654
0
            h_fc_cps = MVM_nfg_get_case_change(tc, h_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &h_result_cps);
655
0
        }
656
1.33k
        /* If we get 0 for the number that means the cp doesn't change when casefolded */
657
1.33k
        if (h_fc_cps == 0) {
658
1.07k
            n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
659
1.07k
            if (h_g != n_g)
660
841
                return 0;
661
1.07k
        }
662
258
        else if (h_fc_cps >= 1) {
663
573
            for (j = 0; j < h_fc_cps; j++) {
664
352
                n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
665
352
                h_g = h_result_cps[j];
666
352
                if (h_g != n_g)
667
37
                    return 0;
668
315
                n_offset++;
669
315
            }
670
221
            n_offset--;
671
221
        }
672
1.33k
    }
673
296
    return 1;
674
1.17k
}
675
/* Checks if needle exists at the offset, but ignores case.
676
 * Sometimes there is a difference in length of a string before and after foldcase,
677
 * because of this we must compare this differently than just foldcasing both
678
 * strings to ensure the offset is correct */
679
215
MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 h_offset) {
680
215
    /* Foldcase version of needle */
681
215
    MVMString *needle_fc;
682
215
    MVMStringIndex h_graphs = MVM_string_graphs(tc, haystack);
683
215
    MVMStringIndex n_graphs = MVM_string_graphs(tc, needle);
684
215
685
215
    if (h_offset < 0) {
686
0
        h_offset += h_graphs;
687
0
        if (h_offset < 0)
688
0
            h_offset = 0; /* XXX I think this is the right behavior here */
689
0
    }
690
215
    /* If the offset is greater or equal to the number of haystack graphemes
691
215
     * return 0. Since size of graphemes could change under casefolding, we
692
215
     * can't assume too much. If optimizing this be careful */
693
215
    if (h_offset >= h_graphs)
694
8
        return 0;
695
215
696
207
    MVMROOT(tc, haystack, {
697
207
        needle_fc = MVM_string_fc(tc, needle);
698
207
    });
699
207
700
207
    return string_equal_at_ignore_case_INTERNAL_loop(tc, haystack, needle_fc, h_offset, h_graphs, n_graphs);
701
215
}
702
123
MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) {
703
123
    /* Foldcase version of needle */
704
123
    MVMString *needle_fc;
705
123
    MVMStringIndex n_fc_graphs;
706
123
707
123
    size_t index           = (size_t)start;
708
123
    MVMStringIndex hgraphs, ngraphs;
709
123
    MVMint64 return_val = -1;
710
123
    MVM_string_check_arg(tc, haystack, "index search target");
711
123
    MVM_string_check_arg(tc, needle, "index search term");
712
123
    hgraphs = MVM_string_graphs(tc, haystack);
713
123
    ngraphs = MVM_string_graphs(tc, needle);
714
123
    if (!ngraphs)
715
0
        return start <= hgraphs ? start : -1; /* Empty string is in any other string */
716
123
    if (!hgraphs)
717
0
        return -1;
718
123
    if (start < 0 || start >= hgraphs)
719
7
        return -1;
720
123
    /* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check
721
123
     * checks if it is at all possible for the needle grapheme number to be higher
722
123
     * than the haystack */
723
116
    if (ngraphs > hgraphs * 3)
724
1
        return -1;
725
116
726
115
    if (ngraphs < 1)
727
0
        return -1;
728
115
729
115
    MVMROOT(tc, haystack, {
730
115
        needle_fc = MVM_string_fc(tc, needle);
731
115
    });
732
115
    n_fc_graphs = MVM_string_graphs(tc, needle_fc);
733
115
734
115
    /* brute force for now. horrible, yes. halp. */
735
967
    while (index <= hgraphs) {
736
967
        if (string_equal_at_ignore_case_INTERNAL_loop(tc, haystack, needle_fc, index, hgraphs, n_fc_graphs))
737
115
            return (MVMint64)index;
738
967
739
852
        index++;
740
852
    }
741
0
    return -1;
742
115
}
743
155k
MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
744
155k
    MVMStringIndex agraphs;
745
155k
    MVMGrapheme32 g;
746
155k
747
155k
    MVM_string_check_arg(tc, s, "grapheme_at");
748
155k
749
155k
    agraphs = MVM_string_graphs(tc, s);
750
155k
    if (offset < 0 || offset >= agraphs)
751
0
        return -1;
752
155k
753
155k
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
754
155k
755
155k
    return g >= 0 ? g : MVM_nfg_get_synthetic_info(tc, g)->base;
756
155k
}
757
758
0
MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
759
0
    MVMStringIndex agraphs;
760
0
    MVMGrapheme32 g;
761
0
    MVMNormalizer norm;
762
0
    MVMint32 ready;
763
0
764
0
    MVM_string_check_arg(tc, s, "ord_basechar_at");
765
0
766
0
    agraphs = MVM_string_graphs(tc, s);
767
0
    if (offset < 0 || offset >= agraphs)
768
0
        return -1;  /* fixes RT #126771 */
769
0
770
0
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
771
0
772
0
    if (g < 0) {
773
0
        MVMNFGSynthetic *si = MVM_nfg_get_synthetic_info(tc, g);
774
0
        g = si->base;
775
0
    }
776
0
    else {
777
0
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);
778
0
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &g);
779
0
        MVM_unicode_normalizer_eof(tc, &norm);
780
0
        if (!ready)
781
0
            g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
782
0
783
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
784
0
    }
785
0
786
0
    return g;
787
0
}
788
789
/* Compares two strings for equality. */
790
59.5M
MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) {
791
59.5M
    MVM_string_check_arg(tc, a, "equal");
792
59.5M
    MVM_string_check_arg(tc, b, "equal");
793
59.5M
    if (a == b)
794
11.1M
        return 1;
795
48.4M
    if (MVM_string_graphs(tc, a) != MVM_string_graphs(tc, b))
796
35.6M
        return 0;
797
12.8M
    return MVM_string_equal_at(tc, a, b, 0);
798
48.4M
}
799
800
/* more general form of has_at; compares two substrings for equality */
801
MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a,
802
0
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
803
0
804
0
    MVM_string_check_arg(tc, a, "have_at");
805
0
    MVM_string_check_arg(tc, b, "have_at");
806
0
807
0
    if (starta < 0 || startb < 0)
808
0
        return 0;
809
0
    if (length == 0)
810
0
        return 1;
811
0
    if (starta + length > MVM_string_graphs(tc, a) || startb + length > MVM_string_graphs(tc, b))
812
0
        return 0;
813
0
814
0
    return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb);
815
0
}
816
817
/* Returns the grapheme at a given index of the string */
818
597k
MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
819
597k
    MVMStringIndex agraphs;
820
597k
821
597k
    MVM_string_check_arg(tc, a, "grapheme_at");
822
597k
823
597k
    agraphs = MVM_string_graphs(tc, a);
824
597k
825
597k
    if (index < 0 || index >= agraphs)
826
0
        MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64,
827
0
            agraphs - 1, index);
828
597k
829
597k
    return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index);
830
597k
}
831
832
/* Finds the location of a grapheme in a string.  Useful for small character class lookup */
833
1.70M
MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) {
834
1.70M
    size_t index = -1;
835
1.70M
    MVMGraphemeIter gi;
836
1.70M
837
1.70M
    MVM_string_check_arg(tc, a, "string_index_of_grapheme");
838
1.70M
839
1.70M
    MVM_string_gi_init(tc, &gi, a);
840
9.57M
    while (MVM_string_gi_has_more(tc, &gi)) {
841
8.09M
        index++;
842
8.09M
        if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme)
843
228k
            return index;
844
8.09M
    }
845
1.47M
    return -1;
846
1.70M
}
847
848
/* Case change functions. */
849
static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g);
850
10.8k
static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) {
851
10.8k
    MVMint64 sgraphs;
852
10.8k
    MVM_string_check_arg(tc, s, error);
853
10.8k
    sgraphs = MVM_string_graphs(tc, s);
854
10.8k
    if (sgraphs) {
855
9.36k
        MVMString *result;
856
9.36k
        MVMGraphemeIter gi;
857
9.36k
        MVMint64 result_graphs = sgraphs;
858
9.36k
        MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32));
859
9.36k
        MVMint32 changed = 0;
860
9.36k
        MVMint64 i = 0;
861
9.36k
        MVM_string_gi_init(tc, &gi, s);
862
54.7k
        while (MVM_string_gi_has_more(tc, &gi)) {
863
45.4k
            MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
864
45.4k
          peeked:
865
45.4k
            if (g == 0x03A3) {
866
0
                /* Greek sigma needs special handling when lowercased. */
867
0
                switch (type) {
868
0
                    case MVM_unicode_case_change_type_upper:
869
0
                    case MVM_unicode_case_change_type_title:
870
0
                        result_buf[i++] = g;
871
0
                        break;
872
0
                    case MVM_unicode_case_change_type_lower:
873
0
                        changed = 1;
874
0
                        if (i == 0) {
875
0
                            /* Start of string, so not final. */
876
0
                            result_buf[i++] = 0x03C3;
877
0
                        }
878
0
                        else if (!grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) {
879
0
                            /* Previous char is not a letter; not final (as has
880
0
                             * to be at end of a word and not only thing in a
881
0
                             * word). */
882
0
                            result_buf[i++] = 0x03C3;
883
0
                        }
884
0
                        else if (!MVM_string_gi_has_more(tc, &gi)) {
885
0
                            /* End of string. We only reach here if we have a
886
0
                             * letter before us, so it must be final. */
887
0
                            result_buf[i++] = 0x03C2;
888
0
                        }
889
0
                        else {
890
0
                            /* Letter before us, something ahead of us. Need to
891
0
                             * peek ahead to see if it's a letter, to decide if
892
0
                             * we have final sigma or not. */
893
0
                            g = MVM_string_gi_get_grapheme(tc, &gi);
894
0
                            if (grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g))
895
0
                                result_buf[i++] = 0x03C3;
896
0
                            else
897
0
                                result_buf[i++] = 0x03C2;
898
0
                            goto peeked;
899
0
                        }
900
0
                        break;
901
0
                    case MVM_unicode_case_change_type_fold:
902
0
                        result_buf[i++] = 0x03C3;
903
0
                        changed = 1;
904
0
                        break;
905
0
                }
906
0
            }
907
45.4k
            else if (g >= 0) {
908
45.4k
                const MVMCodepoint *result_cps;
909
45.4k
                MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
910
45.4k
                    g, type, &result_cps);
911
45.4k
                if (num_result_cps == 0) {
912
31.9k
                    result_buf[i++] = g;
913
31.9k
                }
914
13.4k
                else if (num_result_cps == 1) {
915
13.3k
                    result_buf[i++] = *result_cps;
916
13.3k
                    changed = 1;
917
13.3k
                }
918
98
                else {
919
98
                    /* To maintain NFG, we need to re-normalize when we get an
920
98
                     * expansion. */
921
98
                    MVMNormalizer norm;
922
98
                    MVMint32 num_result_graphs;
923
98
                    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
924
98
                    MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps);
925
98
                    MVM_unicode_normalizer_eof(tc, &norm);
926
98
                    num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
927
98
928
98
                    /* Make space for any extra graphemes. */
929
98
                    if (num_result_graphs > 1) {
930
98
                        result_graphs += num_result_graphs - 1;
931
98
                        result_buf = MVM_realloc(result_buf,
932
98
                            result_graphs * sizeof(MVMGrapheme32));
933
98
                    }
934
98
935
98
                    /* Copy resulting graphemes. */
936
294
                    while (num_result_graphs > 0) {
937
196
                        result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
938
196
                        num_result_graphs--;
939
196
                    }
940
98
                    changed = 1;
941
98
942
98
                    /* Clean up normalizer (we could init one per transform
943
98
                     * and keep it around in the future, if we find it's a
944
98
                     * worthwhile gain). */
945
98
                    MVM_unicode_normalizer_cleanup(tc, &norm);
946
98
                }
947
45.4k
            }
948
0
            else {
949
0
                MVMGrapheme32 *transformed;
950
0
                MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed);
951
0
                if (num_transformed == 0) {
952
0
                    result_buf[i++] = g;
953
0
                }
954
0
                else if (num_transformed == 1) {
955
0
                    result_buf[i++] = *transformed;
956
0
                    changed = 1;
957
0
                }
958
0
                else {
959
0
                    MVMuint32 j;
960
0
                    result_graphs += num_transformed - 1;
961
0
                    result_buf = MVM_realloc(result_buf,
962
0
                        result_graphs * sizeof(MVMGrapheme32));
963
0
                    for (j = 0; j < num_transformed; j++)
964
0
                        result_buf[i++] = transformed[j];
965
0
                    changed = 1;
966
0
                }
967
0
            }
968
45.4k
        }
969
9.36k
        if (changed) {
970
2.79k
            result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
971
2.79k
            result->body.num_graphs      = result_graphs;
972
2.79k
            result->body.storage_type    = MVM_STRING_GRAPHEME_32;
973
2.79k
            result->body.storage.blob_32 = result_buf;
974
2.79k
            return result;
975
2.79k
        }
976
6.56k
        else {
977
6.56k
            MVM_free(result_buf);
978
6.56k
        }
979
9.36k
    }
980
8.05k
    STRAND_CHECK(tc, s);
981
8.05k
    return s;
982
10.8k
}
983
101
MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) {
984
101
    return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc");
985
101
}
986
10.4k
MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) {
987
10.4k
    return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc");
988
10.4k
}
989
12
MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) {
990
12
    return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc");
991
12
}
992
322
MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) {
993
322
    return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc");
994
322
}
995
996
/* Decodes a C buffer to an MVMString, dependent on the encoding type flag. */
997
MVMString * MVM_string_decode(MVMThreadContext *tc,
998
5
        const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) {
999
5
    switch(encoding_flag) {
1000
4
        case MVM_encoding_type_utf8:
1001
4
            return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length);
1002
0
        case MVM_encoding_type_ascii:
1003
0
            return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length);
1004
0
        case MVM_encoding_type_latin1:
1005
0
            return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length);
1006
1
        case MVM_encoding_type_utf16:
1007
1
            return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length);
1008
0
        case MVM_encoding_type_windows1252:
1009
0
            return MVM_string_windows1252_decode(tc, type_object, Cbuf, byte_length);
1010
0
        case MVM_encoding_type_utf8_c8:
1011
0
            return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length);
1012
0
        default:
1013
0
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
1014
5
    }
1015
5
}
1016
1017
/* Encodes an MVMString to a C buffer, dependent on the encoding type flag */
1018
char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start,
1019
        MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
1020
11.4k
        MVMString *replacement, MVMint32 translate_newlines) {
1021
11.4k
    switch(encoding_flag) {
1022
11.4k
        case MVM_encoding_type_utf8:
1023
11.4k
            return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1024
2
        case MVM_encoding_type_ascii:
1025
2
            return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1026
0
        case MVM_encoding_type_latin1:
1027
0
            return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1028
1
        case MVM_encoding_type_utf16:
1029
1
            return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1030
0
        case MVM_encoding_type_windows1252:
1031
0
            return MVM_string_windows1252_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
1032
0
        case MVM_encoding_type_utf8_c8:
1033
0
            return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement);
1034
0
        default:
1035
0
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
1036
11.4k
    }
1037
11.4k
}
1038
1039
/* Encodes a string, and writes the encoding string into the supplied Buf
1040
 * instance, which should be an integer array with MVMArray REPR. */
1041
14
void MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name, MVMObject *buf, MVMString *replacement) {
1042
14
    MVMuint64 output_size;
1043
14
    MVMuint8 *encoded;
1044
14
    MVMArrayREPRData *buf_rd;
1045
14
    MVMuint8 elem_size = 0;
1046
14
1047
14
    /* Ensure the target is in the correct form. */
1048
14
    MVM_string_check_arg(tc, s, "encode");
1049
14
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
1050
0
        MVM_exception_throw_adhoc(tc, "encode requires a native array to write into");
1051
14
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
1052
14
    if (buf_rd) {
1053
14
        switch (buf_rd->slot_type) {
1054
0
            case MVM_ARRAY_I64: elem_size = 8; break;
1055
0
            case MVM_ARRAY_I32: elem_size = 4; break;
1056
0
            case MVM_ARRAY_I16: elem_size = 2; break;
1057
0
            case MVM_ARRAY_I8:  elem_size = 1; break;
1058
0
            case MVM_ARRAY_U64: elem_size = 8; break;
1059
0
            case MVM_ARRAY_U32: elem_size = 4; break;
1060
1
            case MVM_ARRAY_U16: elem_size = 2; break;
1061
13
            case MVM_ARRAY_U8:  elem_size = 1; break;
1062
14
        }
1063
14
    }
1064
14
    if (!elem_size)
1065
0
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");
1066
14
    if (((MVMArray *)buf)->body.slots.any)
1067
0
        MVM_exception_throw_adhoc(tc, "encode requires an empty array");
1068
14
1069
14
    /* At least find_encoding may allocate on first call, so root just
1070
14
     * in case. */
1071
14
    MVMROOT(tc, buf, {
1072
14
    MVMROOT(tc, s, {
1073
14
        const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name);
1074
14
        encoded = (MVMuint8 *)MVM_string_encode(tc, s, 0, MVM_string_graphs(tc, s), &output_size,
1075
14
            encoding_flag, replacement, 0);
1076
14
    });
1077
14
    });
1078
14
1079
14
    /* Stash the encoded data in the VMArray. */
1080
14
    ((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded;
1081
14
    ((MVMArray *)buf)->body.start    = 0;
1082
14
    ((MVMArray *)buf)->body.ssize    = output_size / elem_size;
1083
14
    ((MVMArray *)buf)->body.elems    = output_size / elem_size;
1084
14
}
1085
1086
/* Decodes a string using the data from the specified Buf. */
1087
5
MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) {
1088
5
    MVMArrayREPRData *buf_rd;
1089
5
    MVMuint8 encoding_flag;
1090
5
    MVMuint8 elem_size = 0;
1091
5
1092
5
    /* Ensure the source is in the correct form. */
1093
5
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
1094
0
        MVM_exception_throw_adhoc(tc, "decode requires a native array to read from");
1095
5
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
1096
5
    if (buf_rd) {
1097
5
        switch (buf_rd->slot_type) {
1098
0
            case MVM_ARRAY_I64: elem_size = 8; break;
1099
0
            case MVM_ARRAY_I32: elem_size = 4; break;
1100
0
            case MVM_ARRAY_I16: elem_size = 2; break;
1101
1
            case MVM_ARRAY_I8:  elem_size = 1; break;
1102
0
            case MVM_ARRAY_U64: elem_size = 8; break;
1103
0
            case MVM_ARRAY_U32: elem_size = 4; break;
1104
1
            case MVM_ARRAY_U16: elem_size = 2; break;
1105
3
            case MVM_ARRAY_U8:  elem_size = 1; break;
1106
5
        }
1107
5
    }
1108
5
    if (!elem_size)
1109
0
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");
1110
5
1111
5
    /* Decode. */
1112
5
    MVMROOT(tc, buf, {
1113
5
        encoding_flag = MVM_string_find_encoding(tc, enc_name);
1114
5
    });
1115
5
    return MVM_string_decode(tc, tc->instance->VMString,
1116
5
        (char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start),
1117
5
        ((MVMArray *)buf)->body.elems * elem_size,
1118
5
        encoding_flag);
1119
5
}
1120
1121
35.0k
MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) {
1122
35.0k
    MVMObject *result;
1123
35.0k
    MVMStringIndex start, end, sep_length;
1124
35.0k
    MVMHLLConfig *hll = MVM_hll_current(tc);
1125
35.0k
1126
35.0k
    MVM_string_check_arg(tc, separator, "split separator");
1127
35.0k
    MVM_string_check_arg(tc, input, "split input");
1128
35.0k
1129
35.0k
    MVMROOT(tc, input, {
1130
35.0k
    MVMROOT(tc, separator, {
1131
35.0k
        result = MVM_repr_alloc_init(tc, hll->slurpy_array_type);
1132
35.0k
        MVMROOT(tc, result, {
1133
35.0k
            start = 0;
1134
35.0k
            end = MVM_string_graphs_nocheck(tc, input);
1135
35.0k
            sep_length = MVM_string_graphs_nocheck(tc, separator);
1136
35.0k
1137
35.0k
            while (start < end) {
1138
35.0k
                MVMString *portion;
1139
35.0k
                MVMStringIndex index;
1140
35.0k
                MVMStringIndex length;
1141
35.0k
1142
35.0k
                /* XXX make this use the dual-traverse iterator, but such that it
1143
35.0k
                    can reset the index of what it's comparing... <!> */
1144
35.0k
                index = MVM_string_index(tc, input, separator, start);
1145
35.0k
                length = sep_length ? (index == -1 ? end : index) - start : 1;
1146
35.0k
                if (length > 0 || (sep_length && length == 0)) {
1147
35.0k
                    portion = MVM_string_substring(tc, input, start, length);
1148
35.0k
                    MVMROOT(tc, portion, {
1149
35.0k
                        MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
1150
35.0k
                        MVM_repr_set_str(tc, pobj, portion);
1151
35.0k
                        MVM_repr_push_o(tc, result, pobj);
1152
35.0k
                    });
1153
35.0k
                }
1154
35.0k
                start += length + sep_length;
1155
35.0k
                /* Gather an empty string if the delimiter is found at the end. */
1156
35.0k
                if (sep_length && start == end) {
1157
35.0k
                    MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
1158
35.0k
                    MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty);
1159
35.0k
                    MVM_repr_push_o(tc, result, pobj);
1160
35.0k
                }
1161
35.0k
            }
1162
35.0k
        });
1163
35.0k
    });
1164
35.0k
    });
1165
35.0k
1166
35.0k
    return result;
1167
35.0k
}
1168
1169
16.8k
MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) {
1170
16.8k
    MVMString  *result;
1171
16.8k
    MVMString **pieces;
1172
16.8k
    MVMint64    elems, num_pieces, sgraphs, i, is_str_array, total_graphs;
1173
16.8k
    MVMuint16   sstrands, total_strands;
1174
16.8k
    MVMint32    concats_stable = 1;
1175
16.8k
1176
16.8k
    MVM_string_check_arg(tc, separator, "join separator");
1177
16.8k
    if (!IS_CONCRETE(input))
1178
0
        MVM_exception_throw_adhoc(tc, "join needs a concrete array to join");
1179
16.8k
1180
16.8k
    /* See how many things we have to join; if the answer is "none" then we
1181
16.8k
     * can make a hasty escape. */
1182
16.8k
    elems = MVM_repr_elems(tc, input);
1183
16.8k
    if (elems == 0)
1184
163
        return tc->instance->str_consts.empty;
1185
16.7k
    is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc,
1186
16.7k
        STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR;
1187
16.7k
1188
16.7k
    /* Allocate result. */
1189
16.7k
    MVMROOT(tc, separator, {
1190
16.7k
    MVMROOT(tc, input, {
1191
16.7k
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1192
16.7k
    });
1193
16.7k
    });
1194
16.7k
1195
16.7k
    /* Take a first pass through the string, counting up length and the total
1196
16.7k
     * number of strands we encounter as well as building a flat array of the
1197
16.7k
     * strings (to we only have to do the indirect calls once). */
1198
16.7k
    sgraphs  = MVM_string_graphs_nocheck(tc, separator);
1199
16.7k
    if (sgraphs)
1200
2.61k
        sstrands = separator->body.storage_type == MVM_STRING_STRAND
1201
0
            ? separator->body.num_strands
1202
2.61k
            : 1;
1203
16.7k
    else
1204
14.1k
        sstrands = 1;
1205
16.7k
    pieces        = MVM_malloc(elems * sizeof(MVMString *));
1206
16.7k
    num_pieces    = 0;
1207
16.7k
    total_graphs  = 0;
1208
16.7k
    total_strands = 0;
1209
46.8k
    for (i = 0; i < elems; i++) {
1210
30.0k
        /* Get piece of the string. */
1211
30.0k
        MVMString *piece;
1212
30.0k
        MVMint64   piece_graphs;
1213
30.0k
        if (is_str_array) {
1214
23.1k
            piece = MVM_repr_at_pos_s(tc, input, i);
1215
23.1k
            if (!piece)
1216
0
                continue;
1217
23.1k
        }
1218
6.93k
        else {
1219
6.93k
            MVMObject *item = MVM_repr_at_pos_o(tc, input, i);
1220
6.93k
            if (!item || !IS_CONCRETE(item))
1221
0
                continue;
1222
6.93k
            piece = MVM_repr_get_str(tc, item);
1223
6.93k
        }
1224
30.0k
1225
30.0k
        /* If it wasn't the first piece, add separator here. */
1226
30.0k
        if (num_pieces) {
1227
13.3k
            total_strands += sstrands;
1228
13.3k
            total_graphs  += sgraphs;
1229
13.3k
        }
1230
30.0k
1231
30.0k
        /* Add on the piece's strands and graphs. */
1232
30.0k
        piece_graphs = MVM_string_graphs(tc, piece);
1233
30.0k
        if (piece_graphs) {
1234
29.9k
            total_strands += piece->body.storage_type == MVM_STRING_STRAND
1235
22.9k
                ? piece->body.num_strands
1236
6.96k
                : 1;
1237
29.9k
            total_graphs += piece_graphs;
1238
29.9k
        }
1239
30.0k
1240
30.0k
        /* Store piece. */
1241
30.0k
        pieces[num_pieces++] = piece;
1242
30.0k
    }
1243
16.7k
1244
16.7k
    /* We now know the total eventual number of graphemes. */
1245
16.7k
    if (total_graphs == 0) {
1246
18
        free(pieces);
1247
18
        return tc->instance->str_consts.empty;
1248
18
    }
1249
16.7k
    result->body.num_graphs = total_graphs;
1250
16.7k
1251
16.7k
    /* If we just collect all the things as strands, are we within bounds, and
1252
16.7k
     * will be come out ahead? */
1253
16.7k
    if (total_strands < MVM_STRING_MAX_STRANDS && total_graphs / total_strands >= 16) {
1254
727
        /* XXX TODO: Implement this, conditionalize branch thing below. */
1255
727
    }
1256
16.7k
    /*else {*/
1257
16.7k
    if (1) {
1258
16.7k
        /* We'll produce a single, flat string. */
1259
16.7k
        MVMint64        position = 0;
1260
16.7k
        MVMGraphemeIter gi;
1261
16.7k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
1262
16.7k
        result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32));
1263
46.7k
        for (i = 0; i < num_pieces; i++) {
1264
30.0k
            /* Get piece. */
1265
30.0k
            MVMString *piece = pieces[i];
1266
30.0k
1267
30.0k
            /* Add separator if needed. */
1268
30.0k
            if (i > 0) {
1269
13.3k
                /* If there's no separator and one piece is The Empty String we
1270
13.3k
                 * have to be extra careful about concat stability */
1271
13.3k
                if (sgraphs == 0 && MVM_string_graphs_nocheck(tc, piece) == 0 && concats_stable
1272
18
                        && i + 1 < num_pieces
1273
15
                        && !MVM_nfg_is_concat_stable(tc, pieces[i - 1], pieces[i + 1])) {
1274
0
                    concats_stable = 0;
1275
0
                }
1276
13.3k
1277
13.3k
                if (sgraphs) {
1278
115
                    if (!concats_stable)
1279
0
                        /* Already unstable; no more checks. */;
1280
115
                    else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], separator))
1281
0
                        concats_stable = 0;
1282
115
                    else if (!MVM_nfg_is_concat_stable(tc, separator, piece))
1283
0
                        concats_stable = 0;
1284
115
1285
115
                    switch (separator->body.storage_type) {
1286
0
                    case MVM_STRING_GRAPHEME_32:
1287
0
                        memcpy(
1288
0
                            result->body.storage.blob_32 + position,
1289
0
                            separator->body.storage.blob_32,
1290
0
                            sgraphs * sizeof(MVMGrapheme32));
1291
0
                        position += sgraphs;
1292
0
                        break;
1293
0
                    /* XXX Can special-case 8-bit NFG and ASCII here too. */
1294
115
                    default:
1295
115
                        MVM_string_gi_init(tc, &gi, separator);
1296
233
                        while (MVM_string_gi_has_more(tc, &gi))
1297
118
                            result->body.storage.blob_32[position++] =
1298
118
                                MVM_string_gi_get_grapheme(tc, &gi);
1299
115
                        break;
1300
115
                    }
1301
115
                }
1302
13.2k
                else {
1303
13.2k
                    /* Separator has no graphemes, so NFG stability check
1304
13.2k
                     * should consider pieces. */
1305
13.2k
                    if (!concats_stable)
1306
4
                        /* Already stable; no more checks. */;
1307
13.2k
                    else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], piece))
1308
4
                        concats_stable = 0;
1309
13.2k
                }
1310
13.3k
            }
1311
30.0k
1312
30.0k
            /* Add piece. */
1313
30.0k
            switch (piece->body.storage_type) {
1314
2.23k
            case MVM_STRING_GRAPHEME_32: {
1315
2.23k
                MVMint64 pgraphs = MVM_string_graphs(tc, piece);
1316
2.23k
                memcpy(
1317
2.23k
                    result->body.storage.blob_32 + position,
1318
2.23k
                    piece->body.storage.blob_32,
1319
2.23k
                    pgraphs * sizeof(MVMGrapheme32));
1320
2.23k
                position += pgraphs;
1321
2.23k
                break;
1322
2.23k
            }
1323
2.23k
            /* XXX Can special-case 8-bit NFG and ASCII here too. */
1324
27.8k
            default:
1325
27.8k
                MVM_string_gi_init(tc, &gi, piece);
1326
711k
                while (MVM_string_gi_has_more(tc, &gi))
1327
683k
                    result->body.storage.blob_32[position++] =
1328
683k
                        MVM_string_gi_get_grapheme(tc, &gi);
1329
27.8k
                break;
1330
30.0k
            }
1331
30.0k
        }
1332
16.7k
    }
1333
16.7k
1334
16.7k
    MVM_free(pieces);
1335
16.7k
    STRAND_CHECK(tc, result);
1336
16.7k
    return concats_stable ? result : re_nfg(tc, result);
1337
16.7k
}
1338
1339
/* Returning nonzero means it found the char at the position specified in 'a' in 'b'.
1340
 * For character enumerations in regexes. */
1341
250k
MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *b) {
1342
250k
    MVMuint32     bgraphs;
1343
250k
    MVMGrapheme32 search;
1344
250k
1345
250k
    MVM_string_check_arg(tc, a, "char_at_in_string");
1346
250k
    MVM_string_check_arg(tc, b, "char_at_in_string");
1347
250k
1348
250k
    /* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */
1349
250k
    if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, a))
1350
10.6k
        return -2;
1351
250k
1352
239k
    search  = MVM_string_get_grapheme_at_nocheck(tc, a, offset);
1353
239k
    bgraphs = MVM_string_graphs_nocheck(tc, b);
1354
239k
    switch (b->body.storage_type) {
1355
124k
    case MVM_STRING_GRAPHEME_32: {
1356
124k
        MVMStringIndex i;
1357
810k
        for (i = 0; i < bgraphs; i++)
1358
771k
            if (b->body.storage.blob_32[i] == search)
1359
85.0k
                return i;
1360
39.3k
        break;
1361
124k
    }
1362
0
    case MVM_STRING_GRAPHEME_ASCII:
1363
0
        if (search >= 0 && search <= 127) {
1364
0
            MVMStringIndex i;
1365
0
            for (i = 0; i < bgraphs; i++)
1366
0
                if (b->body.storage.blob_ascii[i] == search)
1367
0
                    return i;
1368
0
        }
1369
0
        break;
1370
115k
    case MVM_STRING_GRAPHEME_8:
1371
115k
        if (search >= -128 && search <= 127) {
1372
115k
            MVMStringIndex i;
1373
256k
            for (i = 0; i < bgraphs; i++)
1374
172k
                if (b->body.storage.blob_8[i] == search)
1375
31.7k
                    return i;
1376
115k
        }
1377
83.6k
        break;
1378
0
    case MVM_STRING_STRAND: {
1379
0
        MVMGraphemeIter gi;
1380
0
        MVMStringIndex  i;
1381
0
        MVM_string_gi_init(tc, &gi, b);
1382
0
        for (i = 0; i < bgraphs; i++)
1383
0
            if (MVM_string_gi_get_grapheme(tc, &gi) == search)
1384
0
                return i;
1385
0
    }
1386
239k
    }
1387
123k
    return -1;
1388
239k
}
1389
1390
0
MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) {
1391
0
    MVMGrapheme32 g;
1392
0
    MVMCodepoint  cp;
1393
0
1394
0
    MVM_string_check_arg(tc, s, "uniprop");
1395
0
1396
0
    if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
1397
0
        return 0;
1398
0
1399
0
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
1400
0
    if (g >= 0)
1401
0
        cp = (MVMCodepoint)g;
1402
0
    else
1403
0
        cp = MVM_nfg_get_synthetic_info(tc, g)->base;
1404
0
    return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code);
1405
0
}
1406
1407
/* If the string is made up of strands, then produces a flattend string
1408
 * representing the exact same graphemes but without strands. Otherwise,
1409
 * returns the input string. Intended for strings that will be indexed
1410
 * into heavily (when evaluating regexes, for example). */
1411
20.5k
MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) {
1412
20.5k
    MVM_string_check_arg(tc, s, "indexingoptimized");
1413
20.5k
    if (s->body.storage_type == MVM_STRING_STRAND) {
1414
17.8k
        MVMuint32        chars = MVM_string_graphs_nocheck(tc, s);
1415
17.8k
        MVMGrapheme32   *flat  = MVM_malloc(chars * sizeof(MVMGrapheme32));
1416
17.8k
        MVMuint32        i     = 0;
1417
17.8k
        MVMString       *res;
1418
17.8k
        MVMint8          can_fit_into_8bit = 1;
1419
17.8k
1420
17.8k
        MVMGraphemeIter  gi;
1421
17.8k
        MVM_string_gi_init(tc, &gi, s);
1422
380k
        while (MVM_string_gi_has_more(tc, &gi)) {
1423
362k
            MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
1424
362k
            if (g < -128 || g >= 128)
1425
106
                can_fit_into_8bit = 0;
1426
362k
            flat[i++] = g;
1427
362k
        }
1428
17.8k
1429
17.8k
        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1430
17.8k
        res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1431
17.8k
        res->body.storage.blob_32 = flat;
1432
17.8k
        res->body.num_graphs      = chars;
1433
17.8k
1434
17.8k
        if (can_fit_into_8bit)
1435
17.8k
            turn_32bit_into_8bit_unchecked(tc, res);
1436
17.8k
1437
17.8k
        return res;
1438
17.8k
    }
1439
2.69k
    else {
1440
2.69k
        return s;
1441
2.69k
    }
1442
20.5k
}
1443
1444
/* Escapes a string, replacing various chars like \n with \\n. Can no doubt be
1445
 * further optimized. */
1446
561
MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) {
1447
561
    MVMString      *res     = NULL;
1448
561
    MVMStringIndex  spos    = 0;
1449
561
    MVMStringIndex  bpos    = 0;
1450
561
    MVMStringIndex  sgraphs, balloc;
1451
561
    MVMGrapheme32  *buffer;
1452
561
    MVMGrapheme32   crlf;
1453
561
    MVMint8         can_fit_into_8bit = 1;
1454
561
1455
561
    MVM_string_check_arg(tc, s, "escape");
1456
561
1457
561
    sgraphs = MVM_string_graphs_nocheck(tc, s);
1458
561
    balloc  = sgraphs;
1459
561
    buffer  = MVM_malloc(sizeof(MVMGrapheme32) * balloc);
1460
561
1461
561
    crlf = MVM_nfg_crlf_grapheme(tc);
1462
561
1463
6.75k
    for (; spos < sgraphs; spos++) {
1464
6.19k
        MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
1465
6.19k
        MVMGrapheme32 esc   = 0;
1466
6.19k
        switch (graph) {
1467
11
            case '\\': esc = '\\'; break;
1468
0
            case 7:    esc = 'a';  break;
1469
1
            case '\b': esc = 'b';  break;
1470
3
            case '\n': esc = 'n';  break;
1471
3
            case '\r': esc = 'r';  break;
1472
13
            case '\t': esc = 't';  break;
1473
1
            case '\f': esc = 'f';  break;
1474
2
            case '"':  esc = '"';  break;
1475
1
            case 27:   esc = 'e';  break;
1476
6.19k
        }
1477
6.19k
        if (esc) {
1478
35
            if (bpos + 2 > balloc) {
1479
12
                balloc += 32;
1480
12
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
1481
12
            }
1482
35
            buffer[bpos++] = '\\';
1483
35
            buffer[bpos++] = esc;
1484
35
        }
1485
6.15k
        else if (graph == crlf) {
1486
1
            if (bpos + 4 > balloc) {
1487
1
                balloc += 32;
1488
1
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
1489
1
            }
1490
1
            buffer[bpos++] = '\\';
1491
1
            buffer[bpos++] = 'r';
1492
1
            buffer[bpos++] = '\\';
1493
1
            buffer[bpos++] = 'n';
1494
1
        }
1495
6.15k
        else {
1496
6.15k
            if (bpos + 1 > balloc) {
1497
11
                balloc += 32;
1498
11
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
1499
11
            }
1500
6.15k
            if (graph < -128 || graph >= 128)
1501
0
                can_fit_into_8bit = 0;
1502
6.15k
            buffer[bpos++] = graph;
1503
6.15k
        }
1504
6.19k
    }
1505
561
1506
561
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1507
561
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1508
561
    res->body.storage.blob_32 = buffer;
1509
561
    res->body.num_graphs      = bpos;
1510
561
1511
561
    if (can_fit_into_8bit)
1512
561
        turn_32bit_into_8bit_unchecked(tc, res);
1513
561
1514
561
    STRAND_CHECK(tc, res);
1515
561
    return res;
1516
561
}
1517
1518
/* Takes a string and reverses its characters. */
1519
1
MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) {
1520
1
    MVMString      *res     = NULL;
1521
1
    MVMStringIndex  spos    = 0;
1522
1
    MVMStringIndex  sgraphs;
1523
1
    MVMStringIndex  rpos;
1524
1
1525
1
    MVM_string_check_arg(tc, s, "flip");
1526
1
    sgraphs = MVM_string_graphs_nocheck(tc, s);
1527
1
    rpos    = sgraphs;
1528
1
1529
1
    if (s->body.storage_type == MVM_STRING_GRAPHEME_8) {
1530
1
        MVMGrapheme8   *rbuffer;
1531
1
        rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs);
1532
1
1533
4
        for (; spos < sgraphs; spos++)
1534
3
            rbuffer[--rpos] = s->body.storage.blob_8[spos];
1535
1
1536
1
        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1537
1
        res->body.storage_type    = MVM_STRING_GRAPHEME_8;
1538
1
        res->body.storage.blob_8  = rbuffer;
1539
0
    } else {
1540
0
        MVMGrapheme32  *rbuffer;
1541
0
        rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
1542
0
1543
0
        if (s->body.storage_type == MVM_STRING_GRAPHEME_32)
1544
0
            for (; spos < sgraphs; spos++)
1545
0
                rbuffer[--rpos] = s->body.storage.blob_32[spos];
1546
0
        else
1547
0
            for (; spos < sgraphs; spos++)
1548
0
                rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
1549
0
1550
0
        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1551
0
        res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1552
0
        res->body.storage.blob_32 = rbuffer;
1553
0
    }
1554
1
1555
1
    res->body.num_graphs      = sgraphs;
1556
1
1557
1
    STRAND_CHECK(tc, res);
1558
1
    return res;
1559
1
}
1560
1561
/* Compares two strings, returning -1, 0 or 1 to indicate less than,
1562
 * equal or greater than. */
1563
25.1k
MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
1564
25.1k
    MVMStringIndex alen, blen, i, scanlen;
1565
25.1k
1566
25.1k
    MVM_string_check_arg(tc, a, "compare");
1567
25.1k
    MVM_string_check_arg(tc, b, "compare");
1568
25.1k
1569
25.1k
    /* Simple cases when one or both are zero length. */
1570
25.1k
    alen = MVM_string_graphs_nocheck(tc, a);
1571
25.1k
    blen = MVM_string_graphs_nocheck(tc, b);
1572
25.1k
    if (alen == 0)
1573
286
        return blen == 0 ? 0 : -1;
1574
24.8k
    if (blen == 0)
1575
8.61k
        return 1;
1576
24.8k
1577
24.8k
    /* Otherwise, need to scan them. */
1578
16.1k
    scanlen = alen > blen ? blen : alen;
1579
22.3k
    for (i = 0; i < scanlen; i++) {
1580
19.3k
        MVMGrapheme32 ai = MVM_string_get_grapheme_at_nocheck(tc, a, i);
1581
19.3k
        MVMGrapheme32 bi = MVM_string_get_grapheme_at_nocheck(tc, b, i);
1582
19.3k
        if (ai != bi)
1583
13.1k
            return ai < bi ? -1 : 1;
1584
19.3k
    }
1585
16.1k
1586
16.1k
    /* All shared chars equal, so go on length. */
1587
3.07k
    return alen < blen ? -1 :
1588
3.07k
           alen > blen ?  1 :
1589
3.07k
                          0 ;
1590
16.1k
}
1591
1592
/* Takes two strings and AND's their characters. */
1593
1
MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) {
1594
1
    MVMString      *res;
1595
1
    MVMGrapheme32  *buffer;
1596
1
    MVMStringIndex  i, alen, blen, sgraphs;
1597
1
1598
1
    MVM_string_check_arg(tc, a, "bitwise and");
1599
1
    MVM_string_check_arg(tc, b, "bitwise and");
1600
1
1601
1
    alen = MVM_string_graphs_nocheck(tc, a);
1602
1
    blen = MVM_string_graphs_nocheck(tc, b);
1603
1
    sgraphs = alen < blen ? alen : blen;
1604
1
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
1605
1
1606
1
    /* Binary-and up to the length of the shortest string. */
1607
11
    for (i = 0; i < sgraphs; i++)
1608
10
        buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
1609
10
                   & MVM_string_get_grapheme_at_nocheck(tc, b, i));
1610
1
1611
1
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1612
1
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1613
1
    res->body.storage.blob_32 = buffer;
1614
1
    res->body.num_graphs      = sgraphs;
1615
1
1616
1
    STRAND_CHECK(tc, res);
1617
1
    return res;
1618
1
}
1619
1620
/* Takes two strings and OR's their characters. */
1621
1
MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
1622
1
    MVMString      *res;
1623
1
    MVMGrapheme32  *buffer;
1624
1
    MVMStringIndex  alen, blen, sgraphs, i, scanlen;
1625
1
1626
1
    MVM_string_check_arg(tc, a, "bitwise or");
1627
1
    MVM_string_check_arg(tc, b, "bitwise or");
1628
1
1629
1
    alen = MVM_string_graphs_nocheck(tc, a);
1630
1
    blen = MVM_string_graphs_nocheck(tc, b);
1631
1
    sgraphs = (alen > blen ? alen : blen);
1632
1
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
1633
1
1634
1
    /* First, binary-or up to the length of the shortest string. */
1635
1
    scanlen = alen > blen ? blen : alen;
1636
11
    for (i = 0; i < scanlen; i++)
1637
10
        buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
1638
10
                   | MVM_string_get_grapheme_at_nocheck(tc, b, i));
1639
1
1640
1
    /* Second pass, fill with characters of the longest string. */
1641
1
    if (alen > blen)
1642
2
        for (; i < sgraphs; i++)
1643
1
            buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i);
1644
1
    else
1645
0
        for (; i < sgraphs; i++)
1646
0
            buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i);
1647
1
1648
1
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1649
1
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1650
1
    res->body.storage.blob_32 = buffer;
1651
1
    res->body.num_graphs      = sgraphs;
1652
1
1653
1
    STRAND_CHECK(tc, res);
1654
1
    return res;
1655
1
}
1656
1657
/* Takes two strings and XOR's their characters. */
1658
1
MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
1659
1
    MVMString      *res;
1660
1
    MVMGrapheme32  *buffer;
1661
1
    MVMStringIndex  alen, blen, sgraphs, i, scanlen;
1662
1
1663
1
    MVM_string_check_arg(tc, a, "bitwise xor");
1664
1
    MVM_string_check_arg(tc, b, "bitwise xor");
1665
1
1666
1
    alen = MVM_string_graphs_nocheck(tc, a);
1667
1
    blen = MVM_string_graphs_nocheck(tc, b);
1668
1
    sgraphs = (alen > blen ? alen : blen);
1669
1
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
1670
1
1671
1
    /* First, binary-xor up to the length of the shorter string. */
1672
1
    scanlen = alen > blen ? blen : alen;
1673
2
    for (i = 0; i < scanlen; i++)
1674
1
        buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
1675
1
                   ^ MVM_string_get_grapheme_at_nocheck(tc, b, i));
1676
1
1677
1
    /* Second pass, fill with characters of the longest string. */
1678
1
    if (alen > blen)
1679
2
        for (; i < sgraphs; i++)
1680
1
            buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i);
1681
1
    else
1682
0
        for (; i < sgraphs; i++)
1683
0
            buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i);
1684
1
1685
1
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
1686
1
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
1687
1
    res->body.storage.blob_32 = buffer;
1688
1
    res->body.num_graphs      = sgraphs;
1689
1
1690
1
    STRAND_CHECK(tc, res);
1691
1
    return res;
1692
1
}
1693
1694
/* The following statics hold on to various unicode property values we will
1695
 * resolve once so we don't have to do it repeatedly. */
1696
static MVMint64 UPV_Nd = 0;
1697
static MVMint64 UPV_Lu = 0;
1698
static MVMint64 UPV_Ll = 0;
1699
static MVMint64 UPV_Lt = 0;
1700
static MVMint64 UPV_Lm = 0;
1701
static MVMint64 UPV_Lo = 0;
1702
static MVMint64 UPV_Zs = 0;
1703
static MVMint64 UPV_Zl = 0;
1704
static MVMint64 UPV_Pc = 0;
1705
static MVMint64 UPV_Pd = 0;
1706
static MVMint64 UPV_Ps = 0;
1707
static MVMint64 UPV_Pe = 0;
1708
static MVMint64 UPV_Pi = 0;
1709
static MVMint64 UPV_Pf = 0;
1710
static MVMint64 UPV_Po = 0;
1711
1712
/* concatenating with "" ensures that only literal strings are accepted as argument. */
1713
1.95k
#define STR_WITH_LEN(str)  ("" str ""), (sizeof(str) - 1)
1714
1715
/* Resolves various unicode property values that we'll need. */
1716
130
void MVM_string_cclass_init(MVMThreadContext *tc) {
1717
130
    UPV_Nd = MVM_unicode_cname_to_property_value_code(tc,
1718
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Nd"));
1719
130
    UPV_Lu = MVM_unicode_cname_to_property_value_code(tc,
1720
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lu"));
1721
130
    UPV_Ll = MVM_unicode_cname_to_property_value_code(tc,
1722
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ll"));
1723
130
    UPV_Lt = MVM_unicode_cname_to_property_value_code(tc,
1724
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lt"));
1725
130
    UPV_Lm = MVM_unicode_cname_to_property_value_code(tc,
1726
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lm"));
1727
130
    UPV_Lo = MVM_unicode_cname_to_property_value_code(tc,
1728
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lo"));
1729
130
    UPV_Zs = MVM_unicode_cname_to_property_value_code(tc,
1730
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zs"));
1731
130
    UPV_Zl = MVM_unicode_cname_to_property_value_code(tc,
1732
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zl"));
1733
130
    UPV_Pc = MVM_unicode_cname_to_property_value_code(tc,
1734
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pc"));
1735
130
    UPV_Pd = MVM_unicode_cname_to_property_value_code(tc,
1736
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pd"));
1737
130
    UPV_Ps = MVM_unicode_cname_to_property_value_code(tc,
1738
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ps"));
1739
130
    UPV_Pe = MVM_unicode_cname_to_property_value_code(tc,
1740
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pe"));
1741
130
    UPV_Pi = MVM_unicode_cname_to_property_value_code(tc,
1742
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pi"));
1743
130
    UPV_Pf = MVM_unicode_cname_to_property_value_code(tc,
1744
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pf"));
1745
130
    UPV_Po = MVM_unicode_cname_to_property_value_code(tc,
1746
130
        MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Po"));
1747
130
}
1748
1749
/* Checks if the specified grapheme is in the given character class. */
1750
1.68M
static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) {
1751
1.68M
    /* If it's a synthetic, then grab the base codepoint. */
1752
1.68M
    MVMCodepoint cp;
1753
1.68M
    if (g >= 0)
1754
1.68M
        cp = (MVMCodepoint)g;
1755
1.68M
    else
1756
4
        cp = MVM_nfg_get_synthetic_info(tc, g)->base;
1757
1.68M
1758
1.68M
    switch (cclass) {
1759
5.33k
        case MVM_CCLASS_ANY:
1760
5.33k
            return 1;
1761
5.33k
1762
230
        case MVM_CCLASS_UPPERCASE:
1763
230
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1764
230
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu);
1765
5.33k
1766
243
        case MVM_CCLASS_LOWERCASE:
1767
243
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1768
243
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll);
1769
5.33k
1770
519k
        case MVM_CCLASS_WORD:
1771
519k
            if (cp <= 'z') {  /* short circuit common case */
1772
518k
                if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9'))
1773
351k
                    return 1;
1774
518k
                else
1775
167k
                    return 0;
1776
518k
            }
1777
519k
            /* Deliberate fall-through; word is _ or digit or alphabetic. */
1778
519k
1779
870
        case MVM_CCLASS_ALPHANUMERIC:
1780
870
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
1781
5
                return 1;
1782
865
            if (MVM_unicode_codepoint_has_property_value(tc, cp,
1783
865
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd))
1784
0
                return 1;
1785
865
            /* Deliberate fall-through; alphanumeric is digit or alphabetic. */
1786
865
1787
165k
        case MVM_CCLASS_ALPHABETIC:
1788
165k
            if (cp <= 'z') {  /* short circuit common case */
1789
163k
                if (cp >= 'a' || (cp >= 'A' && cp <= 'Z'))
1790
89.6k
                    return 1;
1791
163k
                else
1792
73.3k
                    return 0;
1793
163k
            }
1794
2.07k
            return
1795
2.07k
                MVM_unicode_codepoint_has_property_value(tc, cp,
1796
2.07k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lo) /* lots of CJK chars */
1797
2.07k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1798
2.07k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll) /* (ascii handled above) */
1799
2.04k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1800
2.04k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu)
1801
2.04k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1802
2.04k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lt)
1803
2.04k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1804
2.04k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lm);
1805
165k
1806
281k
        case MVM_CCLASS_NUMERIC:
1807
281k
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
1808
106k
                return 1;
1809
174k
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1810
174k
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd);
1811
281k
1812
185
        case MVM_CCLASS_HEXADECIMAL:
1813
185
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1814
185
                MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1);
1815
281k
1816
147k
        case MVM_CCLASS_WHITESPACE:
1817
147k
            if (cp <= '~') {
1818
147k
                if (cp == ' ' || (cp <= 13 && cp >= 9))
1819
33.4k
                    return 1;
1820
147k
                else
1821
114k
                    return 0;
1822
147k
            }
1823
102
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1824
102
                MVM_UNICODE_PROPERTY_WHITE_SPACE, 1);
1825
147k
1826
63
        case MVM_CCLASS_BLANK:
1827
63
            if (cp == '\t')
1828
7
                return 1;
1829
56
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1830
56
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs);
1831
63
1832
60
        case MVM_CCLASS_CONTROL: {
1833
60
            return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160);
1834
63
        }
1835
63
1836
55
        case MVM_CCLASS_PRINTING: {
1837
55
            return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160));
1838
63
        }
1839
63
1840
7.45k
        case MVM_CCLASS_PUNCTUATION:
1841
7.45k
            return
1842
7.45k
                MVM_unicode_codepoint_has_property_value(tc, cp,
1843
7.45k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pc)
1844
7.45k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1845
7.45k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pd)
1846
7.45k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1847
7.45k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ps)
1848
7.44k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1849
7.44k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pe)
1850
7.44k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1851
7.44k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pi)
1852
7.44k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1853
7.44k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pf)
1854
7.44k
             || MVM_unicode_codepoint_has_property_value(tc, cp,
1855
7.44k
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Po);
1856
63
1857
559k
        case MVM_CCLASS_NEWLINE: {
1858
559k
            if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
1859
541k
                cp == 0x85 || cp == 0x2028 || cp == 0x2029)
1860
18.0k
                return 1;
1861
541k
            return MVM_unicode_codepoint_has_property_value(tc, cp,
1862
541k
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zl);
1863
559k
        }
1864
559k
1865
0
        default:
1866
0
            return 0;
1867
1.68M
    }
1868
1.68M
}
1869
1870
/* Checks if the character at the specified offset is a member of the
1871
 * indicated character class. */
1872
1.14M
MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) {
1873
1.14M
    MVMGrapheme32 g;
1874
1.14M
    MVM_string_check_arg(tc, s, "is_cclass");
1875
1.14M
    if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
1876
16
        return 0;
1877
1.14M
    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
1878
1.14M
    return grapheme_is_cclass(tc, cclass, g);
1879
1.14M
}
1880
1881
/* Searches for the next char that is in the specified character class. */
1882
14.2k
MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
1883
14.2k
    MVMGraphemeIter gi;
1884
14.2k
    MVMint64        length, end, pos;
1885
14.2k
1886
14.2k
    MVM_string_check_arg(tc, s, "find_cclass");
1887
14.2k
1888
14.2k
    length = MVM_string_graphs_nocheck(tc, s);
1889
14.2k
    end    = offset + count;
1890
13.4k
    end = length < end ? length : end;
1891
14.2k
    if (offset < 0 || offset >= length)
1892
135
        return end;
1893
14.2k
1894
14.0k
    MVM_string_gi_init(tc, &gi, s);
1895
14.0k
    MVM_string_gi_move_to(tc, &gi, offset);
1896
445k
    for (pos = offset; pos < end; pos++) {
1897
444k
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
1898
444k
        if (grapheme_is_cclass(tc, cclass, g) > 0)
1899
13.4k
            return pos;
1900
444k
    }
1901
14.0k
1902
609
    return end;
1903
14.0k
}
1904
1905
/* Searches for the next char that is not in the specified character class. */
1906
19.7k
MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
1907
19.7k
    MVMGraphemeIter gi;
1908
19.7k
    MVMint64        length, end, pos;
1909
19.7k
1910
19.7k
    MVM_string_check_arg(tc, s, "find_not_cclass");
1911
19.7k
1912
19.7k
    length = MVM_string_graphs_nocheck(tc, s);
1913
19.7k
    end    = offset + count;
1914
19.1k
    end = length < end ? length : end;
1915
19.7k
    if (offset < 0 || offset >= length)
1916
534
        return end;
1917
19.7k
1918
19.2k
    MVM_string_gi_init(tc, &gi, s);
1919
19.2k
    MVM_string_gi_move_to(tc, &gi, offset);
1920
97.5k
    for (pos = offset; pos < end; pos++) {
1921
97.5k
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
1922
97.5k
        if (grapheme_is_cclass(tc, cclass, g) == 0)
1923
19.2k
            return pos;
1924
97.5k
    }
1925
19.2k
1926
26
    return end;
1927
19.2k
}
1928
1929
static MVMint16   encoding_name_init         = 0;
1930
static MVMString *encoding_utf8_name         = NULL;
1931
static MVMString *encoding_ascii_name        = NULL;
1932
static MVMString *encoding_latin1_name       = NULL;
1933
static MVMString *encoding_utf16_name        = NULL;
1934
static MVMString *encoding_windows1252_name  = NULL;
1935
static MVMString *encoding_utf8_c8_name      = NULL;
1936
189
MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) {
1937
189
    MVM_string_check_arg(tc, name, "find encoding");
1938
189
    if (!encoding_name_init) {
1939
130
        MVM_gc_allocate_gen2_default_set(tc);
1940
130
        encoding_utf8_name        = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8");
1941
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name");
1942
130
        encoding_ascii_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii");
1943
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name");
1944
130
        encoding_latin1_name      = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1");
1945
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name");
1946
130
        encoding_utf16_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16");
1947
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name");
1948
130
        encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252");
1949
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name");
1950
130
        encoding_utf8_c8_name     = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8");
1951
130
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name");
1952
130
        encoding_name_init   = 1;
1953
130
        MVM_gc_allocate_gen2_default_clear(tc);
1954
130
    }
1955
189
1956
189
    if (MVM_string_equal(tc, name, encoding_utf8_name)) {
1957
182
        return MVM_encoding_type_utf8;
1958
182
    }
1959
7
    else if (MVM_string_equal(tc, name, encoding_ascii_name)) {
1960
3
        return MVM_encoding_type_ascii;
1961
3
    }
1962
4
    else if (MVM_string_equal(tc, name, encoding_latin1_name)) {
1963
1
        return MVM_encoding_type_latin1;
1964
1
    }
1965
3
    else if (MVM_string_equal(tc, name, encoding_windows1252_name)) {
1966
0
        return MVM_encoding_type_windows1252;
1967
0
    }
1968
3
    else if (MVM_string_equal(tc, name, encoding_utf16_name)) {
1969
2
        return MVM_encoding_type_utf16;
1970
2
    }
1971
1
    else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) {
1972
0
        return MVM_encoding_type_utf8_c8;
1973
0
    }
1974
1
    else {
1975
1
        char *c_name = MVM_string_utf8_encode_C_string(tc, name);
1976
1
        char *waste[] = { c_name, NULL };
1977
1
        MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'",
1978
1
            c_name);
1979
1
    }
1980
189
}
1981
1982
/* Turns a codepoint into a string. If required uses the normalizer to ensure
1983
 * that we get a valid NFG string (NFG is a superset of NFC, and singleton
1984
 * decompositions exist). */
1985
8.43k
MVMString * MVM_string_chr(MVMThreadContext *tc, MVMCodepoint cp) {
1986
8.43k
    MVMString *s;
1987
8.43k
    MVMGrapheme32 g;
1988
8.43k
1989
8.43k
    if (cp < 0)
1990
0
        MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative");
1991
8.43k
    /* If the codepoint decomposes we may need to normalize it.
1992
8.43k
     * The first cp that decomposes is U+0340, but to be on the safe side
1993
8.43k
     * for now we go with the first significant character which at the time
1994
8.43k
     * of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */
1995
8.43k
    if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC
1996
189
        && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE)
1997
189
        != MVM_UNICODE_PVALUE_DT_NONE) {
1998
72
1999
72
        MVMNormalizer norm;
2000
72
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
2001
72
        if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) {
2002
72
            MVM_unicode_normalizer_eof(tc, &norm);
2003
72
            g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
2004
72
        }
2005
72
        MVM_unicode_normalizer_cleanup(tc, &norm);
2006
72
    }
2007
8.36k
    else {
2008
8.36k
        g = (MVMGrapheme32) cp;
2009
8.36k
    }
2010
8.43k
2011
8.43k
    s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString));
2012
8.43k
    if (g >= -128 && g < 128) {
2013
8.23k
        s->body.storage_type       = MVM_STRING_GRAPHEME_8;
2014
8.23k
        s->body.storage.blob_8     = MVM_malloc(sizeof(MVMGrapheme8));
2015
8.23k
        s->body.storage.blob_8[0]  = g;
2016
202
    } else {
2017
202
        s->body.storage_type       = MVM_STRING_GRAPHEME_32;
2018
202
        s->body.storage.blob_32    = MVM_malloc(sizeof(MVMGrapheme32));
2019
202
        s->body.storage.blob_32[0] = g;
2020
202
    }
2021
8.43k
    s->body.num_graphs         = 1;
2022
8.43k
    return s;
2023
8.43k
}
2024
2025
/* Takes a string and computes a hash code for it, storing it in the hash code
2026
 * cache field of the string. Hashing code is derived from the Jenkins hash
2027
 * implementation in uthash.h. */
2028
typedef union {
2029
    MVMint32 graphs[3];
2030
    unsigned char bytes[12];
2031
} MVMJenHashGraphemeView;
2032
784k
void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) {
2033
784k
    /* The hash algorithm works in bytes. Since we can represent strings in a
2034
784k
     * number of ways, and we want consistent hashing, then we'll read the
2035
784k
     * strings using the grapheme iterator in groups of 3, using 32-bit ints
2036
784k
     * for the graphemes no matter what the string really holds them as. Then
2037
784k
     * we'll use the bytes view of that in the hashing function. */
2038
784k
    MVMJenHashGraphemeView hash_block;
2039
784k
    MVMGraphemeIter gi;
2040
784k
    MVMuint32 graphs_remaining = MVM_string_graphs(tc, s);
2041
784k
2042
784k
    /* Initialize hash state. */
2043
784k
    MVMuint32 hashv = 0xfeedbeef;
2044
784k
    MVMuint32 _hj_i, _hj_j;
2045
784k
    _hj_i = _hj_j = 0x9e3779b9;
2046
784k
2047
784k
    /* Work through the string 3 graphemes at a time. */
2048
784k
    MVM_string_gi_init(tc, &gi, s);
2049
2.99M
    while (graphs_remaining >= 3) {
2050
2.21M
        hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
2051
2.21M
        hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
2052
2.21M
        hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi);
2053
2.21M
        _hj_i += (hash_block.bytes[0] + ( (unsigned)hash_block.bytes[1] << 8 )
2054
2.21M
            + ( (unsigned)hash_block.bytes[2] << 16 )
2055
2.21M
            + ( (unsigned)hash_block.bytes[3] << 24 ) );
2056
2.21M
        _hj_j +=    (hash_block.bytes[4] + ( (unsigned)hash_block.bytes[5] << 8 )
2057
2.21M
            + ( (unsigned)hash_block.bytes[6] << 16 )
2058
2.21M
            + ( (unsigned)hash_block.bytes[7] << 24 ) );
2059
2.21M
        hashv += (hash_block.bytes[8] + ( (unsigned)hash_block.bytes[9] << 8 )
2060
2.21M
            + ( (unsigned)hash_block.bytes[10] << 16 )
2061
2.21M
            + ( (unsigned)hash_block.bytes[11] << 24 ) );
2062
2.21M
2063
2.21M
        HASH_JEN_MIX(_hj_i, _hj_j, hashv);
2064
2.21M
2065
2.21M
        graphs_remaining -= 3;
2066
2.21M
    }
2067
784k
2068
784k
    /* Mix in key length (in bytes, not graphemes). */
2069
784k
    hashv += MVM_string_graphs(tc, s) * sizeof(MVMGrapheme32);
2070
784k
2071
784k
    /* Now handle trailing graphemes (must be 2, 1, or 0). */
2072
784k
    switch (graphs_remaining) {
2073
260k
        case 2:
2074
260k
            hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
2075
260k
            _hj_j += ( (unsigned)hash_block.bytes[7] << 24 ) +
2076
260k
                     ( (unsigned)hash_block.bytes[6] << 16 ) +
2077
260k
                     ( (unsigned)hash_block.bytes[5] << 8 ) +
2078
260k
                     hash_block.bytes[4];
2079
260k
        /* Fallthrough */
2080
516k
        case 1:
2081
516k
            hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
2082
516k
            _hj_i += ( (unsigned)hash_block.bytes[3] << 24 ) +
2083
516k
                     ( (unsigned)hash_block.bytes[2] << 16 ) +
2084
516k
                     ( (unsigned)hash_block.bytes[1] << 8 ) +
2085
516k
                     hash_block.bytes[0];
2086
784k
    }
2087
784k
    HASH_JEN_MIX(_hj_i, _hj_j, hashv);
2088
784k
2089
784k
    /* Store computed hash value. */
2090
784k
    s->body.cached_hash_code = hashv;
2091
784k
}