Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/utf8.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* The below section has an MIT-style license, included here.
4
5
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7
 *
8
 * Permission is hereby granted, free of charge, to any person obtaining
9
 * a copy of this software and associated documentation files (the
10
 * "Software"), to deal in the Software without restriction, including
11
 * without limitation the rights to use, copy, modify, merge, publish,
12
 * distribute, sublicense, and/or sell copies of the Software, and to
13
 * permit persons to whom the Software is furnished to do so, subject
14
 * to the following conditions:
15
 *
16
 * The above copyright notice and this permission notice shall be
17
 * included in all copies or substantial portions of the Software.
18
 *
19
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
 * SOFTWARE.
27
 */
28
1.82M
/* Decoder state meaning a complete code point has just been produced. It is
 * also the value the decoding functions in this file start their state at. */
#define UTF8_ACCEPT 0
29
0
/* Decoder state meaning the bytes seen so far are not valid UTF-8. */
#define UTF8_REJECT 12
30
31
/* Lookup table driving decode_utf8_byte. Entries 0..255 are indexed by the
 * input byte and give its character class; the entries from index 256 on are
 * indexed by (state + character class) and give the next automaton state. */
static const MVMuint8 utf8d[] = {
32
  // The first part of the table maps bytes to character classes
33
  // to reduce the size of the transition table and create bitmasks.
34
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
36
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
37
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
39
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
40
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
41
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
42
43
  // The second part is a transition table that maps a combination
44
  // of a state of the automaton and a character class to a state.
45
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
46
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
47
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
48
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
49
  12,36,12,12,12,12,12,12,12,12,12,12,
50
};
51
52
MVM_STATIC_INLINE MVMint32
53
956k
decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) {
54
956k
  const MVMint32 type = utf8d[byte];
55
956k
56
956k
  *codep = (*state != UTF8_ACCEPT) ?
57
118k
    (byte & 0x3fu) | (*codep << 6) :
58
837k
    (0xff >> type) & (byte);
59
956k
60
956k
  *state = utf8d[256 + *state + type];
61
956k
  return *state;
62
956k
}
63
/* end Bjoern Hoehrmann section (some things were changed from the original) */
64
65
/* begin not_gerd section (modified from original)
66
// Copyright 2012 not_gerd
67
// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122
68
69
Permission is granted to use, modify, and / or redistribute at will.
70
71
This includes removing authorship notices, re-use of code parts in
72
other software (with or without giving credit), and / or creating a
73
commercial product based on it.
74
75
This permission is not revocable by the author.
76
77
This software is provided as-is. Use it at your own risk. There is
78
no warranty whatsoever, neither expressed nor implied, and by using
79
this software you accept that the author(s) shall not be held liable
80
for any loss of data, loss of service, or other damages, be they
81
incidental or consequential. Your only option other than accepting
82
this is not to use the software at all.
83
*/
84
85
/* Bit flags describing a code point. The CP_* flags say what kind of code
 * point it is; the U8_* flags say how many bytes its UTF-8 form takes. */
enum {
    /* Kind of code point. */
    CP_CHAR            = 0x001,
    CP_LOW_SURROGATE   = 0x002,
    CP_HIGH_SURROGATE  = 0x004,
    CP_NONCHAR         = 0x008,
    CP_OVERFLOW        = 0x010,

    /* Length of the UTF-8 encoding. */
    U8_SINGLE          = 0x020,
    U8_DOUBLE          = 0x040,
    U8_TRIPLE          = 0x080,
    U8_QUAD            = 0x100
};
97
98
1.35M
/* Classifies a code point as one CP_* kind flag OR-ed with one U8_* width
 * flag. Returns 0 for values above 0x1FFFFF. Any value <= 0x7F, negative
 * values included, is reported as CP_CHAR | U8_SINGLE. */
static unsigned classify(MVMCodepoint cp) {
    unsigned width;
    unsigned kind;

    if (cp > 0x1FFFFF)
        return 0;

    /* How many bytes the UTF-8 form needs. */
    if (cp <= 0x7F)
        width = U8_SINGLE;
    else if (cp <= 0x07FF)
        width = U8_DOUBLE;
    else if (cp <= 0xFFFF)
        width = U8_TRIPLE;
    else
        width = U8_QUAD;

    /* What kind of code point it is. */
    if (cp > 0x10FFFF)
        kind = CP_OVERFLOW;
    else if (cp >= 0xD800 && cp <= 0xDBFF)
        kind = CP_HIGH_SURROGATE;
    else if (cp >= 0xDC00 && cp <= 0xDFFF)
        kind = CP_LOW_SURROGATE;
    else if (cp >= 0xFDD0 && cp <= 0xFDEF)
        kind = CP_NONCHAR;
    else if (cp >= 0xFFFE && (cp & 0xFFFF) >= 0xFFFE)
        /* The last two code points of every plane. */
        kind = CP_NONCHAR;
    else
        kind = CP_CHAR;

    return kind | width;
}
131
132
1.35M
/* Writes the UTF-8 encoding of cp to bp and returns the number of bytes
 * written (1 to 4). Returns 0, writing nothing, when classify() reports the
 * code point as neither CP_CHAR nor CP_NONCHAR. bp must have room for up to
 * four bytes. */
static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
    const unsigned cc = classify(cp);

    if ((cc & (CP_CHAR | CP_NONCHAR)) == 0)
        return 0;

    switch (cc & (U8_SINGLE | U8_DOUBLE | U8_TRIPLE | U8_QUAD)) {
    case U8_SINGLE:
        bp[0] = (MVMuint8)cp;
        return 1;

    case U8_DOUBLE:
        /* 110xxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xC0 | (cp >> 6));
        bp[1] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 2;

    case U8_TRIPLE:
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xE0 | (cp >> 12));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 3;

    case U8_QUAD:
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xF0 | (cp >> 18));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 12) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[3] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 4;

    default:
        return 0;
    }
}
166
167
 /* end not_gerd section */
168
169
0
/* Growth cap, in graphemes, for the buffer in MVM_string_utf8_decode: once
 * the buffer holds at least this many entries it grows by this fixed amount
 * rather than doubling. */
#define UTF8_MAXINC (32 * 1024 * 1024)
170
171
/* Decodes the specified number of bytes of utf8 into an NFG string, creating
172
 * a result of the specified type. The type must have the MVMString REPR. */
173
27.7k
MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
174
27.7k
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
175
27.7k
    MVMint32 count = 0;
176
27.7k
    MVMCodepoint codepoint;
177
27.7k
    MVMint32 line_ending = 0;
178
27.7k
    MVMint32 state = 0;
179
27.7k
    MVMint32 bufsize = bytes;
180
27.7k
    MVMGrapheme32 *buffer = MVM_malloc(sizeof(MVMGrapheme32) * bufsize);
181
27.7k
    size_t orig_bytes;
182
27.7k
    const char *orig_utf8;
183
27.7k
    MVMint32 line;
184
27.7k
    MVMint32 col;
185
27.7k
    MVMint32 ready;
186
27.7k
187
27.7k
    /* Need to normalize to NFG as we decode. */
188
27.7k
    MVMNormalizer norm;
189
27.7k
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
190
27.7k
191
27.7k
    orig_bytes = bytes;
192
27.7k
    orig_utf8 = utf8;
193
27.7k
194
377k
    for (; bytes; ++utf8, --bytes) {
195
350k
        switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8), UTF8_ACCEPT)) {
196
231k
        case UTF8_ACCEPT: { /* got a codepoint */
197
231k
            MVMGrapheme32 g;
198
231k
            ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g);
199
231k
            if (ready) {
200
203k
                while (count + ready > bufsize) { /* if the buffer's full make a bigger one */
201
0
                    buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (
202
0
                        bufsize >= UTF8_MAXINC ? (bufsize += UTF8_MAXINC) : (bufsize *= 2)
203
0
                    ));
204
0
                }
205
203k
                buffer[count++] = g;
206
203k
                while (--ready > 0) {
207
340
                    buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
208
340
                }
209
203k
            }
210
231k
            break;
211
231k
        }
212
0
        case UTF8_REJECT:
213
0
            /* found a malformed sequence; parse it again this time tracking
214
0
             * line and col numbers. */
215
0
            MVM_unicode_normalizer_cleanup(tc, &norm); /* Since we'll throw. */
216
0
            bytes = orig_bytes; utf8 = orig_utf8; state = 0; line = 1; col = 1;
217
0
            for (; bytes; ++utf8, --bytes) {
218
0
                switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) {
219
0
                case UTF8_ACCEPT:
220
0
                    /* this could be reorganized into several nested ugly if/else :/ */
221
0
                    if (!line_ending && (codepoint == 10 || codepoint == 13)) {
222
0
                        /* Detect the style of line endings.
223
0
                         * Select whichever comes first.
224
0
                         * First or only part of first line ending. */
225
0
                        line_ending = codepoint;
226
0
                        col = 1; line++;
227
0
                    }
228
0
                    else if (line_ending && codepoint == line_ending) {
229
0
                        /* first or only part of next line ending */
230
0
                        col = 1; line++;
231
0
                    }
232
0
                    else if (codepoint == 10 || codepoint == 13) {
233
0
                        /* second part of line ending; ignore */
234
0
                    }
235
0
                    else /* non-line ending codepoint */
236
0
                        col++;
237
0
                    break;
238
0
                case UTF8_REJECT:
239
0
                    MVM_free(buffer);
240
0
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col);
241
0
                }
242
0
            }
243
0
            MVM_free(buffer);
244
0
            MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!");
245
0
            break;
246
350k
        }
247
350k
    }
248
27.7k
    if (state != UTF8_ACCEPT) {
249
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
250
0
        MVM_free(buffer);
251
0
        MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string");
252
0
    }
253
27.7k
254
27.7k
    /* Get any final graphemes from the normalizer, and clean it up. */
255
27.7k
    MVM_unicode_normalizer_eof(tc, &norm);
256
27.7k
    ready = MVM_unicode_normalizer_available(tc, &norm);
257
27.7k
    if (ready) {
258
27.4k
        if (count + ready > bufsize) {
259
0
            buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready));
260
0
        }
261
54.9k
        while (ready--) {
262
27.4k
            buffer[count++] =  MVM_unicode_normalizer_get_grapheme(tc, &norm);
263
27.4k
        }
264
27.4k
    }
265
27.7k
    MVM_unicode_normalizer_cleanup(tc, &norm);
266
27.7k
267
27.7k
    /* If we're lucky, we can fit our string in 8 bits per grapheme. */
268
27.7k
    if (MVM_string_buf32_can_fit_into_8bit(buffer, count)) {
269
21.2k
        MVMGrapheme8 *new_buffer = MVM_malloc(sizeof(MVMGrapheme8) * count);
270
21.2k
        MVM_VECTORIZE_LOOP
271
191k
        for (ready = 0; ready < count; ready++) {
272
169k
            new_buffer[ready] = buffer[ready];
273
169k
        }
274
21.2k
        MVM_free(buffer);
275
21.2k
        result->body.storage.blob_8  = new_buffer;
276
21.2k
        result->body.storage_type    = MVM_STRING_GRAPHEME_8;
277
6.51k
    } else {
278
6.51k
        /* just keep the same buffer as the MVMString's buffer.  Later
279
6.51k
         * we can add heuristics to resize it if we have enough free
280
6.51k
         * memory */
281
6.51k
        if (bufsize - count > 4) {
282
5.94k
            buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32));
283
5.94k
        }
284
6.51k
        result->body.storage.blob_32 = buffer;
285
6.51k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
286
6.51k
    }
287
27.7k
    result->body.num_graphs      = count;
288
27.7k
289
27.7k
    return result;
290
27.7k
}
291
292
198
static MVMint32 its_the_bom(const char *utf8) {
293
198
    const MVMuint8 *uns_utf8 = (const MVMuint8 *)utf8;
294
0
    return uns_utf8[0] == 0xEF && uns_utf8[1] == 0xBB && uns_utf8[2] == 0xBF;
295
198
}
296
297
/* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */
298
6
MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
299
6
    if (bytes >= 3 && its_the_bom(utf8)) {
300
0
        utf8 += 3;
301
0
        bytes -= 3;
302
0
    }
303
6
    return MVM_string_utf8_decode(tc, result_type, utf8, bytes);
304
6
}
305
306
/* Decodes using a decodestream. Decodes as far as it can with the input
307
 * buffers, or until a stopper is reached. */
308
MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
309
                                  const MVMint32 *stopper_chars,
310
271
                                  MVMDecodeStreamSeparators *seps) {
311
271
    MVMint32 count = 0, total = 0;
312
271
    MVMint32 state = 0;
313
271
    MVMCodepoint codepoint = 0;
314
271
    MVMCodepoint lag_codepoint = -1;
315
271
    MVMint32 bufsize;
316
271
    MVMGrapheme32 *buffer           = NULL;
317
271
    MVMDecodeStreamBytes *cur_bytes = NULL;
318
271
    MVMDecodeStreamBytes *last_accept_bytes     = ds->bytes_head;
319
271
    MVMDecodeStreamBytes *lag_last_accept_bytes = NULL;
320
271
    MVMint32 last_accept_pos, lag_last_accept_pos, ready, at_start;
321
271
    MVMuint32 reached_stopper;
322
271
    MVMuint32 can_fast_path;
323
271
324
271
    /* If there's no buffers, we're done. */
325
271
    if (!ds->bytes_head)
326
28
        return 0;
327
243
    last_accept_pos = ds->bytes_head_pos;
328
243
329
243
    /* If we're asked for zero chars, also done. */
330
243
    if (stopper_chars && *stopper_chars == 0)
331
0
        return 1;
332
243
333
243
    /* If there's nothing hanging around in the normalization buffer, we can
334
243
     * use the fast path. */
335
243
    can_fast_path = MVM_unicode_normalizer_empty(tc, &(ds->norm));
336
243
337
243
    bufsize = ds->result_size_guess;
338
243
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
339
243
340
243
    /* Decode each of the buffers. */
341
243
    cur_bytes = ds->bytes_head;
342
243
    at_start = ds->abs_byte_pos == 0;
343
243
    reached_stopper = 0;
344
449
    while (cur_bytes) {
345
248
        /* Process this buffer. */
346
243
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
347
248
        char     *bytes = cur_bytes->bytes;
348
248
        if (at_start) {
349
200
            /* We're right at the start of the stream of things to decode. See
350
200
             * if we have a BOM, and skip over it if so. */
351
200
            if (pos + 3 <= cur_bytes->length) {
352
195
                if (its_the_bom(bytes + pos)) {
353
0
                    pos += 3;
354
0
                    last_accept_bytes = cur_bytes;
355
0
                    last_accept_pos = pos;
356
0
                }
357
195
            }
358
200
            at_start = 0;
359
200
        }
360
248
361
248
        /* We have both a fast path and a slow path for UTF-8 decoding. The
362
248
         * fast path covers the common case where we have no chars that are
363
248
         * significant to normalization, and so we can skip the normalizer.
364
248
         * This is true of the ASCII and Latin-1 ranges of UTF-8, with the
365
248
         * exception of \r. Note that since the following codepoint may be
366
248
         * the one that causes us to need to compose, we need a lag of 1
367
248
         * codepoint. */
368
248
        if (can_fast_path) {
369
240
            /* Lift the no lag codepoint case out of the hot loop below,
370
240
             * to save on a couple of branches. */
371
240
            MVMCodepoint first_significant = ds->norm.first_significant;
372
480
            while (lag_codepoint == -1 && pos < cur_bytes->length) {
373
242
                switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) {
374
236
                case UTF8_ACCEPT: {
375
236
                    if (codepoint == '\r' || codepoint >= first_significant) {
376
2
                        can_fast_path = 0;
377
2
                        last_accept_bytes = cur_bytes;
378
2
                        last_accept_pos = pos;
379
2
                        goto slow_path;
380
2
                    }
381
234
                    lag_codepoint = codepoint;
382
234
                    lag_last_accept_bytes = cur_bytes;
383
234
                    lag_last_accept_pos = pos;
384
234
                    break;
385
236
                }
386
0
                case UTF8_REJECT:
387
0
                    MVM_free(buffer);
388
0
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
389
0
                    break;
390
242
                }
391
242
            }
392
240
393
566k
            while (pos < cur_bytes->length) {
394
566k
                switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) {
395
566k
                case UTF8_ACCEPT: {
396
566k
                    /* If we hit something that needs the normalizer, we put
397
566k
                     * any lagging codepoint into its buffer and jump to it. */
398
566k
                    if (codepoint == '\r' || codepoint >= first_significant) {
399
20
                        MVM_unicode_normalizer_push_codepoints(tc, &(ds->norm),
400
20
                            &lag_codepoint, 1);
401
20
                        lag_codepoint = -1; /* Invalidate, we used it. */
402
20
                        can_fast_path = 0;
403
20
                        last_accept_bytes = cur_bytes;
404
20
                        last_accept_pos = pos;
405
20
                        goto slow_path;
406
20
                    }
407
566k
408
566k
                    /* As we have a lagging codepoint, and this one does not
409
566k
                     * need normalization, then we know we can spit out the
410
566k
                     * lagging one. */
411
566k
                    if (count == bufsize) {
412
8.76k
                        /* Valid character, but we filled the buffer. Attach this
413
8.76k
                        * one to the buffers linked list, and continue with a new
414
8.76k
                        * one. */
415
8.76k
                        MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
416
8.76k
                        buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
417
8.76k
                        count = 0;
418
8.76k
                    }
419
566k
                    buffer[count++] = lag_codepoint;
420
566k
                    total++;
421
566k
                    if (MVM_string_decode_stream_maybe_sep(tc, seps, lag_codepoint) ||
422
566k
                            stopper_chars && *stopper_chars == total) {
423
32
                        reached_stopper = 1;
424
32
                        last_accept_bytes = lag_last_accept_bytes;
425
32
                        last_accept_pos = lag_last_accept_pos;
426
32
                        goto done;
427
32
                    }
428
566k
429
566k
                    /* The current state becomes the lagged state. */
430
566k
                    lag_codepoint = codepoint;
431
566k
                    lag_last_accept_bytes = cur_bytes;
432
566k
                    lag_last_accept_pos = pos;
433
566k
                    break;
434
566k
                }
435
0
                case UTF8_REJECT:
436
0
                    MVM_free(buffer);
437
0
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
438
0
                    break;
439
566k
                }
440
566k
            }
441
238
442
238
            /* If we fall out of the loop and have a lagged codepoint, but
443
238
             * no next buffer, then we fall into the slow path to process it
444
238
             * correctly. */
445
186
            if (lag_codepoint != -1 && !cur_bytes->next) {
446
182
                codepoint = lag_codepoint;
447
182
                lag_codepoint = -1;
448
182
                can_fast_path = 0;
449
182
                last_accept_bytes = lag_last_accept_bytes;
450
182
                last_accept_pos = lag_last_accept_pos;
451
182
                goto slow_path;
452
182
            }
453
186
        }
454
8
        else {
455
39.3k
            while (pos < cur_bytes->length) {
456
39.1k
                switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) {
457
38.7k
                case UTF8_ACCEPT: {
458
38.7k
                    MVMGrapheme32 g;
459
38.7k
                    MVMint32 first;
460
38.7k
                    last_accept_bytes = cur_bytes;
461
38.7k
                    last_accept_pos = pos;
462
39.0k
                  slow_path:
463
39.0k
                    first = 1;
464
39.0k
                    ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc,
465
39.0k
                        &(ds->norm), codepoint, &g);
466
77.9k
                    while (ready--) {
467
38.9k
                        if (first)
468
37.8k
                            first = 0;
469
38.9k
                        else
470
1.14k
                            g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
471
38.9k
                        if (count == bufsize) {
472
612
                            /* Valid character, but we filled the buffer. Attach this
473
612
                            * one to the buffers linked list, and continue with a new
474
612
                            * one. */
475
612
                            MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
476
612
                            buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
477
612
                            count = 0;
478
612
                        }
479
38.9k
                        buffer[count++] = g;
480
38.9k
                        total++;
481
38.9k
                        if (MVM_string_decode_stream_maybe_sep(tc, seps, g) ||
482
38.9k
                                stopper_chars && *stopper_chars == total) {
483
10
                            reached_stopper = 1;
484
10
                            goto done;
485
10
                        }
486
38.9k
                    }
487
38.9k
                    break;
488
39.0k
                }
489
0
                case UTF8_REJECT:
490
0
                    MVM_free(buffer);
491
0
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
492
0
                    break;
493
39.1k
                }
494
39.1k
            }
495
8
        }
496
206
        cur_bytes = cur_bytes->next;
497
206
    }
498
243
  done:
499
243
500
243
    /* Attach what we successfully parsed as a result buffer, and trim away
501
243
     * what we chewed through. */
502
243
    if (count) {
503
239
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
504
239
    }
505
4
    else {
506
4
        MVM_free(buffer);
507
4
    }
508
243
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
509
243
510
243
    return reached_stopper;
511
243
}
512
513
/* Encodes the specified string to UTF-8. */
514
char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
515
        MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
516
16.0k
        MVMString *replacement, MVMint32 translate_newlines) {
517
16.0k
    MVMuint8        *result = NULL;
518
16.0k
    size_t           result_pos, result_limit;
519
16.0k
    MVMCodepointIter ci;
520
16.0k
    MVMStringIndex   strgraphs  = MVM_string_graphs(tc, str);
521
16.0k
    MVMuint8        *repl_bytes = NULL;
522
16.0k
    MVMuint64        repl_length;
523
16.0k
524
16.0k
    if (start < 0 || start > strgraphs)
525
0
        MVM_exception_throw_adhoc(tc, "start out of range");
526
16.0k
    if (length == -1)
527
3.02k
        length = strgraphs;
528
16.0k
    if (length < 0 || start + length > strgraphs)
529
0
        MVM_exception_throw_adhoc(tc, "length out of range");
530
16.0k
531
16.0k
    if (replacement)
532
0
        repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc,
533
0
            replacement, &repl_length, 0, -1, NULL, translate_newlines);
534
16.0k
535
16.0k
    /* Guesstimate that we'll be within 2 bytes for most chars most of the
536
16.0k
     * time, and give ourselves 4 bytes breathing space. */
537
16.0k
    result_limit = 2 * length;
538
16.0k
    result       = MVM_malloc(result_limit + 4);
539
16.0k
    result_pos   = 0;
540
16.0k
541
16.0k
    /* Iterate the codepoints and encode them. */
542
16.0k
    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
543
1.37M
    while (MVM_string_ci_has_more(tc, &ci)) {
544
1.35M
        MVMint32 bytes;
545
1.35M
        MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);
546
1.35M
        if (result_pos >= result_limit) {
547
9
            result_limit *= 2;
548
9
            result = MVM_realloc(result, result_limit + 4);
549
9
        }
550
1.35M
        bytes = utf8_encode(result + result_pos, cp);
551
1.35M
        if (bytes)
552
1.35M
            result_pos += bytes;
553
0
        else if (replacement) {
554
0
            if (repl_length >= result_limit || result_pos >= result_limit - repl_length) {
555
0
                result_limit += repl_length;
556
0
                result = MVM_realloc(result, result_limit + 4);
557
0
            }
558
0
            memcpy(result + result_pos, repl_bytes, repl_length);
559
0
            result_pos += repl_length;
560
0
        }
561
0
        else {
562
0
            MVM_free(result);
563
0
            MVM_free(repl_bytes);
564
0
            MVM_string_utf8_throw_encoding_exception(tc, cp);
565
0
        }
566
1.35M
    }
567
16.0k
568
16.0k
    if (output_size)
569
16.0k
        *output_size = (MVMuint64)result_pos;
570
16.0k
    MVM_free(repl_bytes);
571
16.0k
    return (char *)result;
572
16.0k
}
573
574
/* Encodes the specified string to UTF-8. */
575
char * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
576
3.02k
        MVMint32 translate_newlines) {
577
3.02k
    return MVM_string_utf8_encode_substr(tc, str, output_size, 0, -1, NULL,
578
3.02k
        translate_newlines);
579
3.02k
}
580
581
/* Encodes the specified string to a UTF-8 C string. */
582
2.46k
char * MVM_string_utf8_encode_C_string(MVMThreadContext *tc, MVMString *str) {
583
2.46k
    MVMuint64 output_size;
584
2.46k
    char * result = NULL;
585
2.46k
    char * utf8_string = MVM_string_utf8_encode(tc, str, &output_size, 0);
586
2.46k
    /* this is almost always called from error-handling code. Don't care if it
587
2.46k
     * contains embedded NULs. XXX TODO: Make sure all uses of this free what it returns */
588
2.46k
    result = MVM_malloc(output_size + 1);
589
2.46k
    memcpy(result, utf8_string, output_size);
590
2.46k
    MVM_free(utf8_string);
591
2.46k
    result[output_size] = (char)0;
592
2.46k
    return result;
593
2.46k
}
594
595
/* Encodes the specified string to a UTF-8 C string if it is not NULL. */
596
0
char * MVM_string_utf8_maybe_encode_C_string(MVMThreadContext *tc, MVMString *str) {
597
0
    return str ? MVM_string_utf8_encode_C_string(tc, str) : NULL;
598
0
}
599
600
0
/* Throws an exception saying that cp cannot be encoded as UTF-8. The message
 * distinguishes code points above 0x10FFFF, surrogates (general category
 * "Cs"), and everything else. */
void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) {
    if (cp > 0x10FFFF) {
        MVM_exception_throw_adhoc(tc,
            "Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)",
            cp, cp, 0x10FFFF, 0x10FFFF);
    }
    else {
        /* Ask for the general category only when cp is in 0..0x10FFFF, so
         * the property lookup is never called for an out-of-range or
         * negative value, and tolerate a NULL result before strcmp. */
        const char *gencat = cp >= 0
            ? MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY)
            : NULL;

        if (gencat && strcmp("Cs", gencat) == 0) {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)",
                cp, cp);
        }
        else {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode codepoint %d (0x%X)",
                cp, cp);
        }
    }
}