Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/utf8_c8.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* UTF-8 Clean-8 is an encoder/decoder that primarily works as the UTF-8 one.
4
 * However, upon encountering a byte sequence that will either not decode as
5
 * valid UTF-8, or that would not round-trip due to normalization, it will use
6
 * NFG synthetics to keep track of the original bytes involved. This means that
7
 * encoding back to UTF-8 Clean-8 will be able to recreate the bytes as they
8
 * originally existed. The synthetics contain 4 codepoints:
9
 *
10
 *   * The codepoint 0x10FFFD (which is a private use codepoint)
11
 *   * The codepoint 'x'
12
 *   * The upper 4 bits of the non-decodable byte as a hex char (0..9A..F)
13
 *   * The lower 4 bits of the non-decodable byte as a hex char (0..9A..F)
14
 *
15
 * Under normal UTF-8 encoding, this means the unrepresentable characters will
16
 * come out as something like `?xFF`.
17
 *
18
 * UTF-8 Clean-8 is used in places where MoarVM receives strings from the
19
 * environment, command line arguments, and file system queries.
20
 */
21
22
/* begin not_gerd section (modified from original)
23
// Copyright 2012 not_gerd
24
// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122
25
26
Permission is granted to use, modify, and / or redistribute at will.
27
28
This includes removing authorship notices, re-use of code parts in
29
other software (with or without giving credit), and / or creating a
30
commercial product based on it.
31
32
This permission is not revocable by the author.
33
34
This software is provided as-is. Use it at your own risk. There is
35
no warranty whatsoever, neither expressed nor implied, and by using
36
this software you accept that the author(s) shall not be held liable
37
for any loss of data, loss of service, or other damages, be they
38
incidental or consequential. Your only option other than accepting
39
this is not to use the software at all.
40
*/
41
42
/* Flags returned by classify(). A result combines one CP_* flag (the kind of
 * codepoint) with one U8_* flag (the length of its UTF-8 encoding). */
enum {
    /* Codepoint kinds. */
    CP_CHAR            = 0x001,
    CP_LOW_SURROGATE   = 0x002,
    CP_HIGH_SURROGATE  = 0x004,
    CP_NONCHAR         = 0x008,
    CP_OVERFLOW        = 0x010,

    /* UTF-8 encoded lengths (1 to 4 bytes). */
    U8_SINGLE          = 0x020,
    U8_DOUBLE          = 0x040,
    U8_TRIPLE          = 0x080,
    U8_QUAD            = 0x100
};
54
55
90.7k
/* Classifies a codepoint. The result is one CP_* flag (what kind of codepoint
 * it is) OR-ed with one U8_* flag (how many bytes its UTF-8 form occupies).
 * Returns 0 for values that cannot be encoded at all: negative values and
 * anything above 0x1FFFFF. */
static unsigned classify(MVMCodepoint cp) {
    /* Without this guard a negative value would satisfy `cp <= 0x7F` below
     * and be reported as a one-byte ASCII character. (MVMCodepoint is
     * compared against 0 elsewhere in this file, so it is presumably a
     * signed type.) */
    if (cp < 0)
        return 0;

    if (cp <= 0x7F)
        return CP_CHAR | U8_SINGLE;

    if (cp <= 0x07FF)
        return CP_CHAR | U8_DOUBLE;

    if (0xD800 <= cp && cp <= 0xDBFF)
        return CP_HIGH_SURROGATE | U8_TRIPLE;

    if (0xDC00 <= cp && cp <= 0xDFFF)
        return CP_LOW_SURROGATE | U8_TRIPLE;

    /* Non-character block inside the BMP. */
    if (0xFDD0 <= cp && cp <= 0xFDEF)
        return CP_NONCHAR | U8_TRIPLE;

    if (cp <= 0xFFFD)
        return CP_CHAR | U8_TRIPLE;

    if (cp == 0xFFFE || cp == 0xFFFF)
        return CP_NONCHAR | U8_TRIPLE;

    /* The last two codepoints of every plane are non-characters. */
    if (cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF))
        return CP_NONCHAR | U8_QUAD;

    if (cp <= 0x10FFFF)
        return CP_CHAR | U8_QUAD;

    /* Fits the four-byte bit pattern, but lies beyond the Unicode range. */
    if (cp <= 0x1FFFFF)
        return CP_OVERFLOW | U8_QUAD;

    return 0;
}
88
89
90.7k
/* Writes the UTF-8 encoding of cp to bp and returns the number of bytes
 * written (1 to 4). Returns 0 without writing anything when classify() does
 * not report the codepoint as CP_CHAR or CP_NONCHAR. The caller must provide
 * room for up to 4 bytes at bp. */
static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
    const unsigned kind = classify(cp);

    /* Surrogates, overflow values and unclassifiable input are refused. */
    if ((kind & (CP_CHAR | CP_NONCHAR)) == 0)
        return 0;

    if (kind & U8_SINGLE) {
        /* 0xxxxxxx */
        bp[0] = (MVMuint8)cp;
        return 1;
    }

    if (kind & U8_DOUBLE) {
        /* 110xxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xC0 | (cp >> 6));
        bp[1] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 2;
    }

    if (kind & U8_TRIPLE) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xE0 | (cp >> 12));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 3;
    }

    if (kind & U8_QUAD) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xF0 | (cp >> 18));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 12) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[3] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 4;
    }

    return 0;
}
123
124
 /* end not_gerd section */
125
126
#define UTF8_MAXINC (32 * 1024 * 1024)
127
128
0
/* Grows *buffer until it holds strictly more than `needed` graphemes, and
 * updates *bufsize to the new capacity. Capacity doubles until it reaches
 * UTF8_MAXINC and then grows by UTF8_MAXINC per step. Does nothing when the
 * buffer is already large enough. */
static void ensure_buffer(MVMGrapheme32 **buffer, MVMint32 *bufsize, MVMint32 needed) {
    MVMint32 new_size = *bufsize;

    if (needed < new_size)
        return;

    /* Doubling a zero (or negative) capacity never makes progress, so start
     * from one element in that case. */
    if (new_size <= 0)
        new_size = 1;

    /* Work out the final capacity first so that we reallocate only once. */
    while (needed >= new_size)
        new_size = new_size >= UTF8_MAXINC ? new_size + UTF8_MAXINC : new_size * 2;

    *buffer  = MVM_realloc(*buffer, sizeof(MVMGrapheme32) * new_size);
    *bufsize = new_size;
}
134
135
/* Upper-case hexadecimal digits, indexed by nibble value (0..15). */
static const MVMuint8 hex_chars[16] = {
    '0', '1', '2', '3',
    '4', '5', '6', '7',
    '8', '9', 'A', 'B',
    'C', 'D', 'E', 'F'
};
137
10
/* Returns the grapheme that stands in for a byte the decoder could not
 * accept. Bytes above 0x7F become a UTF8-C8 synthetic built from the four
 * codepoints 0x10FFFD, 'x' and the two upper-case hex digits of the byte.
 * Bytes in the ASCII range are handed back unchanged. */
static MVMGrapheme32 synthetic_for(MVMThreadContext *tc, MVMuint8 invalid) {
    MVMCodepoint cps[4];

    /* Rejected by the decoder, but it has an ASCII interpretation, so it
     * can represent itself. */
    if (invalid <= 0x7F)
        return invalid;

    /* A genuinely undecodable byte. */
    cps[0] = 0x10FFFD;
    cps[1] = 'x';
    cps[2] = hex_chars[invalid >> 4];
    cps[3] = hex_chars[invalid & 0x0F];
    return MVM_nfg_codes_to_grapheme_utf8_c8(tc, cps, 4);
}
151
152
/* The kind of byte the UTF8-C8 decoder expects to see next. */
typedef enum {
    /* The first byte of a sequence (possibly a complete one-byte sequence). */
    EXPECT_START = 0,
    /* A continuation byte of a multi-byte sequence. */
    EXPECT_CONTINUATION
} Expecting;
157
158
/* Decode state for the UTF8-C8 decoder. */
159
typedef struct {
160
    /* The UTF-8 we're decoding. */
161
    const MVMuint8 *utf8;
162
163
    /* The index of the current byte we're decoding. */
164
    size_t cur_byte;
165
166
    /* The index of the first unaccepted byte. */
167
    size_t unaccepted_start;
168
169
    /* What kind of byte we're expecting next. */
170
    Expecting expecting;
171
172
    /* The current codepoint we're decoding. */
173
    MVMCodepoint cur_codepoint;
174
175
    /* The result buffer we're decoding into. */
176
    MVMGrapheme32 *result;
177
178
    /* The current position in the result buffer. */
179
    size_t result_pos;
180
181
    /* Buffer of original codepoints, to ensure we will not spit out any
182
     * synthetics into the result that will re-order on round-trip. */
183
    MVMCodepoint *orig_codes;
184
185
    /* Position we're at in inserting into orig_codes. */
186
    size_t orig_codes_pos;
187
188
    /* First orig_codes index that did not yet go through the normalizer. */
189
    size_t orig_codes_unnormalized;
190
191
    /* The normalizer we're using to make synthetics that will not cause an
192
     * order change on output. */
193
    MVMNormalizer norm;
194
195
    /* Bad bytes from an earlier buffer, for the sake of streaming decode. */
196
    MVMuint8 prev_bad_bytes[4];
197
    MVMint32 num_prev_bad_bytes;
198
} DecodeState;
199
200
/* Appends a single grapheme to the buffer if it will not cause a mismatch
201
 * with the original codepoints upon encoding back to UTF-8. Returns non-zero
202
 * in this case. Otherwise, appends synthetics for the bytes the original code
203
 * points were encoded as. Since we can end up with index mis-matches, we just
204
 * spit out codepoints to catch the normalizer up to everything in the orig
205
 * codes buffer. */
206
766k
static int append_grapheme(MVMThreadContext *tc, DecodeState *state, MVMGrapheme32 g) {
207
766k
    if (g == state->orig_codes[state->orig_codes_unnormalized]) {
208
766k
        /* Easy case: exact match. */
209
766k
        state->result[state->result_pos++] = g;
210
766k
        state->orig_codes_unnormalized++;
211
766k
        return 1;
212
766k
    }
213
2
    else if (g < 0) {
214
0
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
215
0
        int mismatch = 0;
216
0
        if (synth->codes[0] == state->orig_codes[state->orig_codes_unnormalized]) {
217
0
            MVMint32 i;
218
0
            for (i = 1; i < synth->num_codes; i++) {
219
0
                size_t orig_idx = state->orig_codes_unnormalized + i;
220
0
                if (state->orig_codes_pos <= orig_idx ||
221
0
                        state->orig_codes[orig_idx] != synth->codes[i]) {
222
0
                    mismatch = 1;
223
0
                    break;
224
0
                }
225
0
            }
226
0
        }
227
0
        else {
228
0
            mismatch = 1;
229
0
        }
230
0
        if (!mismatch) {
231
0
            state->result[state->result_pos++] = g;
232
0
            state->orig_codes_unnormalized += synth->num_codes;
233
0
            return 1;
234
0
        }
235
0
    }
236
766k
237
766k
    /* If we get here, then normalization would trash the original bytes. */
238
2
    {
239
2
        /* Spit out synthetics to keep the bytes as is. */
240
2
        size_t i, j;
241
6
        for (i = state->orig_codes_unnormalized; i < state->orig_codes_pos; i++) {
242
4
            MVMCodepoint to_encode = state->orig_codes[i];
243
4
            MVMuint8 encoded[4];
244
4
            MVMint32 bytes = utf8_encode(encoded, to_encode);
245
10
            for (j = 0; j < bytes; j++)
246
6
                state->result[state->result_pos++] = synthetic_for(tc, encoded[j]);
247
4
        }
248
2
249
2
        /* Consider all codes pushed now normalized. */
250
2
        state->orig_codes_unnormalized = state->orig_codes_pos;
251
2
252
2
        /* Put a clean normalizer in place. */
253
2
        MVM_unicode_normalizer_cleanup(tc, &(state->norm));
254
2
        MVM_unicode_normalizer_init(tc, &(state->norm), MVM_NORMALIZE_NFG);
255
2
        return 0;
256
766k
    }
257
766k
}
258
259
/* Called when decoding has reached an acceptable codepoint. */
260
766k
static void process_ok_codepoint(MVMThreadContext *tc, DecodeState *state) {
261
766k
    MVMint32 ready;
262
766k
    MVMGrapheme32 g;
263
766k
264
766k
    /* Consider the byte range accepted. */
265
766k
    state->unaccepted_start = state->cur_byte + 1;
266
766k
267
766k
    /* Insert into original codepoints list and hand it to the normalizer. */
268
766k
    state->orig_codes[state->orig_codes_pos++] = state->cur_codepoint;
269
766k
    ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc,
270
766k
            &(state->norm), state->cur_codepoint, &g);
271
766k
272
766k
    /* If the normalizer produced some output... */
273
766k
    if (ready) {
274
747k
        if (append_grapheme(tc, state, g)) {
275
748k
            while (--ready > 0) {
276
288
                g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm));
277
288
                if (!append_grapheme(tc, state, g))
278
0
                    break;
279
288
            }
280
747k
        }
281
747k
    }
282
766k
283
766k
    /* We've no longer any bad bytes to care about from earlier buffers;
284
766k
     * they ended up making an acceptable codepoint. */
285
766k
    state->num_prev_bad_bytes = 0;
286
766k
}
287
288
/* Called when a bad byte has been encountered, or at the end of output. */
289
18.5k
static void process_bad_bytes(MVMThreadContext *tc, DecodeState *state) {
290
18.5k
    size_t i;
291
18.5k
    MVMint32 ready;
292
18.5k
293
18.5k
    /* Flush normalization buffer and take from that. */
294
18.5k
    MVM_unicode_normalizer_eof(tc, &(state->norm));
295
18.5k
    ready = MVM_unicode_normalizer_available(tc, &(state->norm));
296
37.1k
    while (ready-- > 0) {
297
18.5k
        MVMGrapheme32 g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm));
298
18.5k
        if (!append_grapheme(tc, state, g))
299
2
            break;
300
18.5k
    }
301
18.5k
302
18.5k
    /* Now add in synthetics for bad bytes. */
303
18.5k
    for (i = 0; i < state->num_prev_bad_bytes; i++)
304
0
        state->result[state->result_pos++] = synthetic_for(tc, state->prev_bad_bytes[i]);
305
18.5k
    state->num_prev_bad_bytes = 0;
306
18.6k
    for (i = state->unaccepted_start; i <= state->cur_byte; i++)
307
4
        state->result[state->result_pos++] = synthetic_for(tc, state->utf8[i]);
308
18.5k
    state->unaccepted_start = state->cur_byte + 1;
309
18.5k
}
310
/* Check for if the codepoint is in range. Make sure it's not over 0x10FFFF
311
 * and make sure it isn't a Surrogate */
312
5
MVM_STATIC_INLINE int in_range (MVMCodepoint cp) {
313
5
    return ( 0 <= cp && cp <= 0x10FFFF)
314
5
        && (cp < 0xD800 || 0xDFFF < cp); /* Surrogates */
315
5
}
316
/* Decodes the specified number of bytes of utf8 into an NFG string, creating
317
 * a result of the specified type. The type must have the MVMString REPR. */
318
MVMString * MVM_string_utf8_c8_decode(MVMThreadContext *tc, const MVMObject *result_type,
319
18.5k
                                      const char *utf8, size_t bytes) {
320
18.5k
    DecodeState state;
321
18.5k
322
18.5k
    /* Local state for decode loop. */
323
18.5k
    int expected_continuations = 0;
324
18.5k
    int min_expected_codepoint;
325
18.5k
326
18.5k
    /* Don't do anything if empty. */
327
18.5k
    if (bytes == 0)
328
0
        return tc->instance->str_consts.empty;
329
18.5k
330
18.5k
    /* Decoding state, in a struct to easily pass to utility routines.
331
18.5k
     * Result buffer is a maximum estimate to avoid realloc; we can shrink
332
18.5k
     * it at the end. */
333
18.5k
    state.utf8 = (MVMuint8 *)utf8;
334
18.5k
    state.cur_byte = 0;
335
18.5k
    state.unaccepted_start = 0;
336
18.5k
    state.expecting = EXPECT_START;
337
18.5k
    state.cur_codepoint = 0;
338
18.5k
    state.result = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
339
18.5k
    state.result_pos = 0;
340
18.5k
    state.orig_codes = MVM_malloc(sizeof(MVMCodepoint) * bytes);
341
18.5k
    state.orig_codes_pos = 0;
342
18.5k
    state.orig_codes_unnormalized = 0;
343
18.5k
    state.num_prev_bad_bytes = 0;
344
18.5k
    MVM_unicode_normalizer_init(tc, &(state.norm), MVM_NORMALIZE_NFG);
345
18.5k
346
785k
    while (state.cur_byte < bytes) {
347
766k
        MVMuint8 decode_byte = utf8[state.cur_byte];
348
766k
        switch (state.expecting) {
349
766k
            case EXPECT_START:
350
766k
                if ((decode_byte & 0x80) == 0) {
351
766k
                    /* Single byte sequence. */
352
766k
                    state.cur_codepoint = decode_byte;
353
766k
                    process_ok_codepoint(tc, &state);
354
766k
                }
355
9
                else if ((decode_byte & 0xE0) == 0xC0) {
356
7
                    state.cur_codepoint = decode_byte & 0x1F;
357
7
                    state.expecting = EXPECT_CONTINUATION;
358
7
                    expected_continuations = 1;
359
7
                    min_expected_codepoint = 0x80;
360
7
                }
361
2
                else if ((decode_byte & 0xF0) == 0xE0) {
362
0
                    state.cur_codepoint = decode_byte & 0x0F;
363
0
                    state.expecting = EXPECT_CONTINUATION;
364
0
                    expected_continuations = 2;
365
0
                    min_expected_codepoint = 0x800;
366
0
                }
367
2
                else if ((decode_byte & 0xF8) == 0xF0) {
368
0
                    state.cur_codepoint = decode_byte & 0x07;
369
0
                    state.expecting = EXPECT_CONTINUATION;
370
0
                    expected_continuations = 3;
371
0
                    min_expected_codepoint = 0x10000;
372
0
                }
373
2
                else {
374
2
                    /* Invalid byte sequence. */
375
2
                    process_bad_bytes(tc, &state);
376
2
                }
377
766k
                break;
378
5
            case EXPECT_CONTINUATION:
379
5
                if ((decode_byte & 0xC0) == 0x80) {
380
5
                    state.cur_codepoint = (state.cur_codepoint << 6)
381
5
                                          | (decode_byte & 0x3F);
382
5
                    expected_continuations--;
383
5
                    if (expected_continuations == 0) {
384
5
                        if (min_expected_codepoint <= state.cur_codepoint && in_range(state.cur_codepoint))
385
5
                            process_ok_codepoint(tc, &state);
386
5
                        else
387
0
                            process_bad_bytes(tc, &state);
388
5
                        state.expecting = EXPECT_START;
389
5
                    }
390
5
                }
391
0
                else {
392
0
                    /* Invalid byte sequence. */
393
0
                    process_bad_bytes(tc, &state);
394
0
                    state.expecting = EXPECT_START;
395
0
                }
396
5
                break;
397
766k
        }
398
766k
        state.cur_byte++;
399
766k
    }
400
18.5k
401
18.5k
    /* Handle anything dangling off the end. */
402
18.5k
    state.cur_byte--; /* So we don't read 1 past the end. */
403
18.5k
    process_bad_bytes(tc, &state);
404
18.5k
405
18.5k
    MVM_free(state.orig_codes);
406
18.5k
    MVM_unicode_normalizer_cleanup(tc, &(state.norm));
407
18.5k
408
18.5k
    {
409
18.5k
        MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
410
18.5k
        result->body.storage.blob_32 = state.result;
411
18.5k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
412
18.5k
        result->body.num_graphs      = state.result_pos;
413
18.5k
        return result;
414
18.5k
    }
415
18.5k
}
416
417
/* Decodes using a decodestream. Decodes as far as it can with the input
418
 * buffers, or until a stopper is reached. */
419
MVMuint32 MVM_string_utf8_c8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
420
                                     const MVMint32 *stopper_chars,
421
                                     MVMDecodeStreamSeparators *seps,
422
0
                                     MVMint32 eof) {
423
0
    /* Local state for decode loop. */
424
0
    MVMDecodeStreamBytes *cur_bytes;
425
0
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
426
0
    MVMint32 last_accept_pos = ds->bytes_head_pos;
427
0
    DecodeState state;
428
0
    int expected_continuations = 0;
429
0
    int min_expected_codepoint;
430
0
    MVMuint32 reached_stopper = 0;
431
0
    MVMint32 result_graphs = 0;
432
0
433
0
    /* If there's no buffers, we're done. */
434
0
    if (!ds->bytes_head)
435
0
        return 0;
436
0
    last_accept_pos = ds->bytes_head_pos;
437
0
438
0
    /* If we're asked for zero chars, also done. */
439
0
    if (stopper_chars && *stopper_chars == 0)
440
0
        return 1;
441
0
442
0
    /* Otherwise set up decode state, stealing normalizer of the decode
443
0
     * stream and re-instating any past orig_codes. */
444
0
    state.expecting = EXPECT_START;
445
0
    state.cur_codepoint = 0;
446
0
    state.num_prev_bad_bytes = 0;
447
0
    memcpy(&(state.norm), &(ds->norm), sizeof(MVMNormalizer));
448
0
    if (ds->decoder_state) {
449
0
        MVMCodepoint *saved = (MVMCodepoint *)ds->decoder_state;
450
0
        state.orig_codes = MVM_malloc(
451
0
            sizeof(MVMCodepoint) * (saved[0] + ds->bytes_head->length)
452
0
        );
453
0
        state.orig_codes_pos = saved[0];
454
0
        state.orig_codes_unnormalized = 0;
455
0
        memcpy(state.orig_codes, saved + 1, saved[0] * sizeof(MVMCodepoint));
456
0
        MVM_free(ds->decoder_state);
457
0
        ds->decoder_state = NULL;
458
0
    }
459
0
    else {
460
0
        state.orig_codes = NULL;
461
0
        state.orig_codes_pos = 0;
462
0
        state.orig_codes_unnormalized = 0;
463
0
    }
464
0
465
0
    /* Decode each of the buffers. */
466
0
    cur_bytes = ds->bytes_head;
467
0
    reached_stopper = 0;
468
0
    while (cur_bytes && !reached_stopper) {
469
0
        /* Set up decode state for this buffer. */
470
0
        MVMint32 bytes = ds->bytes_head->length;
471
0
        state.result = MVM_malloc(bytes * sizeof(MVMGrapheme32));
472
0
        state.orig_codes = MVM_realloc(state.orig_codes,
473
0
            sizeof(MVMCodepoint) * (state.orig_codes_pos + bytes));
474
0
        state.result_pos = 0;
475
0
        state.utf8 = (const MVMuint8*)cur_bytes->bytes;
476
0
        state.cur_byte = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
477
0
        state.unaccepted_start = state.cur_byte;
478
0
479
0
        /* Process this buffer. */
480
0
        while (state.cur_byte < bytes) {
481
0
            /* Process a byte. */
482
0
            MVMuint8 decode_byte = state.utf8[state.cur_byte];
483
0
            MVMint32 maybe_new_graph = 0;
484
0
            switch (state.expecting) {
485
0
                case EXPECT_START:
486
0
                    if ((decode_byte & 0x80) == 0) {
487
0
                        /* Single byte sequence. */
488
0
                        state.cur_codepoint = decode_byte;
489
0
                        process_ok_codepoint(tc, &state);
490
0
                        maybe_new_graph = 1;
491
0
                    }
492
0
                    else if ((decode_byte & 0xE0) == 0xC0) {
493
0
                        state.cur_codepoint = decode_byte & 0x1F;
494
0
                        state.expecting = EXPECT_CONTINUATION;
495
0
                        expected_continuations = 1;
496
0
                        min_expected_codepoint = 0x80;
497
0
                    }
498
0
                    else if ((decode_byte & 0xF0) == 0xE0) {
499
0
                        state.cur_codepoint = decode_byte & 0x0F;
500
0
                        state.expecting = EXPECT_CONTINUATION;
501
0
                        expected_continuations = 2;
502
0
                        min_expected_codepoint = 0x800;
503
0
                    }
504
0
                    else if ((decode_byte & 0xF8) == 0xF0) {
505
0
                        state.cur_codepoint = decode_byte & 0x07;
506
0
                        state.expecting = EXPECT_CONTINUATION;
507
0
                        expected_continuations = 3;
508
0
                        min_expected_codepoint = 0x10000;
509
0
                    }
510
0
                    else {
511
0
                        /* Invalid byte sequence. */
512
0
                        process_bad_bytes(tc, &state);
513
0
                        maybe_new_graph = 1;
514
0
                    }
515
0
                    break;
516
0
                case EXPECT_CONTINUATION:
517
0
                    if ((decode_byte & 0xC0) == 0x80) {
518
0
                        state.cur_codepoint = (state.cur_codepoint << 6)
519
0
                                              | (decode_byte & 0x3F);
520
0
                        expected_continuations--;
521
0
                        if (expected_continuations == 0) {
522
0
                            if (state.cur_codepoint >= min_expected_codepoint)
523
0
                                process_ok_codepoint(tc, &state);
524
0
                            else
525
0
                                process_bad_bytes(tc, &state);
526
0
                            maybe_new_graph = 1;
527
0
                            state.expecting = EXPECT_START;
528
0
                        }
529
0
                    }
530
0
                    else {
531
0
                        /* Invalid byte sequence. */
532
0
                        process_bad_bytes(tc, &state);
533
0
                        maybe_new_graph = 1;
534
0
                        state.expecting = EXPECT_START;
535
0
                    }
536
0
                    break;
537
0
            }
538
0
            state.cur_byte++;
539
0
540
0
            /* See if we've reached a stopper. */
541
0
            if (maybe_new_graph && state.result_pos > 0) {
542
0
                if (stopper_chars) {
543
0
                    if (result_graphs + state.result_pos >= *stopper_chars) {
544
0
                        reached_stopper = 1;
545
0
                        break;
546
0
                    }
547
0
                }
548
0
                if (MVM_string_decode_stream_maybe_sep(tc, seps,
549
0
                            state.result[state.result_pos - 1])) {
550
0
                    reached_stopper = 1;
551
0
                    break;
552
0
                }
553
0
            }
554
0
        }
555
0
556
0
        /* If we're at EOF and this is the last buffer, force out last bytes. */
557
0
        if (eof && !reached_stopper && !cur_bytes->next) {
558
0
            state.cur_byte--; /* So we don't read 1 past the end. */
559
0
            process_bad_bytes(tc, &state);
560
0
        }
561
0
562
0
        /* Attach what we successfully parsed as a result buffer, and trim away
563
0
         * what we chewed through. */
564
0
        if (state.result_pos)
565
0
            MVM_string_decodestream_add_chars(tc, ds, state.result, state.result_pos);
566
0
        else
567
0
            MVM_free(state.result);
568
0
        result_graphs += state.result_pos;
569
0
570
0
        /* Update our accepted position. */
571
0
        if (state.unaccepted_start > 0) {
572
0
            last_accept_bytes = cur_bytes;
573
0
            last_accept_pos = state.unaccepted_start;
574
0
        }
575
0
576
0
        /* If there were bytes we didn't accept, hold on to them in case we
577
0
         * need to emit them as bad bytes. */
578
0
        if (state.unaccepted_start != state.cur_byte && cur_bytes->next) {
579
0
            int i;
580
0
            for (i = state.unaccepted_start; i < state.cur_byte; i++)
581
0
                state.prev_bad_bytes[state.num_prev_bad_bytes++] = state.utf8[i];
582
0
        }
583
0
584
0
        cur_bytes = cur_bytes->next;
585
0
    }
586
0
587
0
    /* Eat the bytes we decoded. */
588
0
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
589
0
590
0
    /* Persist current normalizer. */
591
0
    memcpy(&(ds->norm), &(state.norm), sizeof(MVMNormalizer));
592
0
593
0
    /* Stash away any leftover codepoints we'll need to examine. */
594
0
    if (state.orig_codes_pos && state.orig_codes_pos != state.orig_codes_unnormalized) {
595
0
        size_t diff = state.orig_codes_pos - state.orig_codes_unnormalized;
596
0
        MVMCodepoint *saved = MVM_malloc(sizeof(MVMCodepoint) * (1 + diff));
597
0
        saved[0] = diff;
598
0
        memcpy(saved + 1, state.orig_codes + state.orig_codes_unnormalized,
599
0
            diff * sizeof(MVMCodepoint));
600
0
        ds->decoder_state = saved;
601
0
    }
602
0
    MVM_free(state.orig_codes);
603
0
604
0
    return reached_stopper;
605
0
}
606
607
/* Encodes the specified string to UTF-8. */
608
static void emit_cp(MVMThreadContext *tc, MVMCodepoint cp, MVMuint8 **result,
609
                    size_t *result_pos, size_t *result_limit,
610
90.7k
                    MVMuint8 *repl_bytes, MVMuint64 repl_length) {
611
90.7k
    MVMint32 bytes;
612
90.7k
    if (*result_pos >= *result_limit) {
613
0
        *result_limit *= 2;
614
0
        *result = MVM_realloc(*result, *result_limit + 4);
615
0
    }
616
90.7k
    bytes = utf8_encode(*result + *result_pos, cp);
617
90.7k
    if (bytes)
618
90.7k
        *result_pos += bytes;
619
0
    else if (repl_bytes) {
620
0
        if (repl_length >= *result_limit || *result_pos >= *result_limit - repl_length) {
621
0
            *result_limit += repl_length;
622
0
            *result = MVM_realloc(*result, *result_limit + 4);
623
0
        }
624
0
        memcpy(*result + *result_pos, repl_bytes, repl_length);
625
0
        *result_pos += repl_length;
626
0
    }
627
0
    else {
628
0
        MVM_free(*result);
629
0
        MVM_free(repl_bytes);
630
0
        MVM_string_utf8_throw_encoding_exception(tc, cp);
631
0
    }
632
90.7k
}
633
16
/* Converts one hex digit codepoint ('0'..'9' or upper-case 'A'..'F') to its
 * value 0..15. Any other codepoint means the synthetic it came from is
 * corrupt, and an exception is thrown. */
static int hex2int(MVMThreadContext *tc, MVMCodepoint cp) {
    if ('0' <= cp && cp <= '9')
        return cp - '0';
    if ('A' <= cp && cp <= 'F')
        return (cp - 'A') + 10;
    MVM_exception_throw_adhoc(tc, "UTF-8 C-8 encoding encountered corrupt synthetic");
}
641
/* Encodes `length` graphemes of str, starting at grapheme index `start`, to
 * UTF-8 C-8 and returns a newly allocated buffer holding the bytes. The
 * buffer is not NUL-terminated; its length is stored in *output_size when
 * output_size is non-NULL. A length of -1 means "the number of graphemes in
 * str".
 *
 * UTF-8 C-8 synthetics are turned back into the single raw byte they stand
 * for; any other synthetic is encoded codepoint by codepoint. Codepoints
 * that cannot be encoded are substituted by the encoded `replacement` string
 * if one is given; otherwise an exception is thrown.
 *
 * Throws if start or length is out of range for str. */
char * MVM_string_utf8_c8_encode_substr(MVMThreadContext *tc,
        MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement) {
    MVMuint8        *result;
    size_t           result_pos, result_limit;
    MVMGraphemeIter  gi;
    MVMStringIndex   strgraphs = MVM_string_graphs(tc, str);
    MVMuint8        *repl_bytes = NULL;
    /* Passed to emit_cp by value even when there is no replacement, so it
     * must not be left indeterminate. */
    MVMuint64        repl_length = 0;
    MVMint64         remaining;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length == -1)
        length = strgraphs;
    if (length < 0 || start + length > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_utf8_c8_encode_substr(tc, replacement, &repl_length, 0, -1, NULL);

    /* Guesstimate that we'll be within 2 bytes for most chars most of the
     * time, and give ourselves 4 bytes breathing space. */
    result_limit = 2 * length;
    result       = MVM_malloc(result_limit + 4);
    result_pos   = 0;

    /* Position the iterator on the first requested grapheme. */
    MVM_string_gi_init(tc, &gi, str);
    for (remaining = start; remaining > 0 && MVM_string_gi_has_more(tc, &gi); remaining--)
        (void)MVM_string_gi_get_grapheme(tc, &gi);

    /* We iterate the requested graphemes, looking out for any synthetics. If
     * we find a UTF-8 C-8 synthetic, then we spit out the raw byte. If we
     * find any other synthetic, we iterate its codepoints. */
    for (remaining = length; remaining > 0 && MVM_string_gi_has_more(tc, &gi); remaining--) {
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
        if (g >= 0) {
            emit_cp(tc, g, &result, &result_pos, &result_limit, repl_bytes, repl_length);
        }
        else {
            MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
            if (synth->is_utf8_c8) {
                /* UTF-8 C-8 synthetic; emit the byte. Grow the same way
                 * emit_cp does, keeping the 4 bytes of slack it relies on. */
                if (result_pos >= result_limit) {
                    do {
                        result_limit = result_limit ? result_limit * 2 : 4;
                    } while (result_pos >= result_limit);
                    result = MVM_realloc(result, result_limit + 4);
                }
                /* codes[2] and codes[3] are the high and low hex digits. */
                result[result_pos++] = (hex2int(tc, synth->codes[2]) << 4) +
                    hex2int(tc, synth->codes[3]);
            }
            else {
                MVMint32 i;
                for (i = 0; i < synth->num_codes; i++)
                    emit_cp(tc, synth->codes[i], &result, &result_pos, &result_limit, repl_bytes, repl_length);
            }
        }
    }

    if (output_size)
        *output_size = (MVMuint64)result_pos;
    MVM_free(repl_bytes);
    return (char *)result;
}
699
700
/* Encodes the specified string to UTF-8 C-8. */
701
4.26k
char * MVM_string_utf8_c8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size) {
702
4.26k
    return MVM_string_utf8_c8_encode_substr(tc, str, output_size, 0,
703
4.26k
        MVM_string_graphs(tc, str), NULL);
704
4.26k
}
705
706
/* Encodes the specified string to a UTF-8 C-8 C string. */
707
4.26k
char * MVM_string_utf8_c8_encode_C_string(MVMThreadContext *tc, MVMString *str) {
708
4.26k
    MVMuint64 output_size;
709
4.26k
    char *result;
710
4.26k
    char *utf8_string = MVM_string_utf8_c8_encode(tc, str, &output_size);
711
4.26k
    result = MVM_malloc(output_size + 1);
712
4.26k
    memcpy(result, utf8_string, output_size);
713
4.26k
    MVM_free(utf8_string);
714
4.26k
    result[output_size] = (char)0;
715
4.26k
    return result;
716
4.26k
}