Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/utf8_c8.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* UTF-8 Clean-8 is an encoder/decoder that primarily works as the UTF-8 one.
4
 * However, upon encountering a byte sequence that will either not decode as
5
 * valid UTF-8, or that would not round-trip due to normalization, it will use
6
 * NFG synthetics to keep track of the original bytes involved. This means that
7
 * encoding back to UTF-8 Clean-8 will be able to recreate the bytes as they
8
 * originally existed. The synthetics contain 4 codepoints:
9
 *
10
 *   * The codepoint 0x10FFFD (which is a private use codepoint)
11
 *   * The codepoint 'x'
12
 *   * The upper 4 bits of the non-decodable byte as a hex char (0..9A..F)
13
 *   * The lower 4 bits of the non-decodable byte as a hex char (0..9A..F)
14
 *
15
 * Under normal UTF-8 encoding, this means the unrepresentable characters will
16
 * come out as something like `?xFF`.
17
 *
18
 * UTF-8 Clean-8 is used in places where MoarVM receives strings from the
19
 * environment, command line arguments, and file system queries.
20
 */
21
22
/* begin not_gerd section (modified from original)
23
// Copyright 2012 not_gerd
24
// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122
25
26
Permission is granted to use, modify, and / or redistribute at will.
27
28
This includes removing authorship notices, re-use of code parts in
29
other software (with or without giving credit), and / or creating a
30
commercial product based on it.
31
32
This permission is not revocable by the author.
33
34
This software is provided as-is. Use it at your own risk. There is
35
no warranty whatsoever, neither expressed nor implied, and by using
36
this software you accept that the author(s) shall not be held liable
37
for any loss of data, loss of service, or other damages, be they
38
incidental or consequential. Your only option other than accepting
39
this is not to use the software at all.
40
*/
41
42
/* Bit flags returned by classify(). One CP_* flag says what kind of value a
 * codepoint is; one U8_* flag says how many bytes its UTF-8 form occupies. */
enum {
    /* Kind of codepoint. */
    CP_CHAR            = 0x001,
    CP_LOW_SURROGATE   = 0x002,
    CP_HIGH_SURROGATE  = 0x004,
    CP_NONCHAR         = 0x008,
    CP_OVERFLOW        = 0x010,

    /* Length of the UTF-8 encoding. */
    U8_SINGLE          = 0x020,
    U8_DOUBLE          = 0x040,
    U8_TRIPLE          = 0x080,
    U8_QUAD            = 0x100
};
54
55
216k
/* Classifies a codepoint. The result is a bitwise OR of one CP_* flag (the
 * kind of codepoint) and one U8_* flag (the number of bytes its UTF-8
 * encoding takes), or 0 if the value cannot be encoded at all: anything
 * above 0x1FFFFF, or any negative value. */
static unsigned classify(MVMCodepoint cp) {
    /* Reject negative values up front. Without this guard they would pass
     * the `<= 0x7F` test below and be reported as single-byte characters. */
    if (cp < 0)
        return 0;

    if (cp <= 0x7F)
        return CP_CHAR | U8_SINGLE;

    if (cp <= 0x07FF)
        return CP_CHAR | U8_DOUBLE;

    /* The surrogate and non-character ranges have to be tested before the
     * general three-byte range that contains them. */
    if (0xD800 <= cp && cp <= 0xDBFF)
        return CP_HIGH_SURROGATE | U8_TRIPLE;

    if (0xDC00 <= cp && cp <= 0xDFFF)
        return CP_LOW_SURROGATE | U8_TRIPLE;

    if (0xFDD0 <= cp && cp <= 0xFDEF)
        return CP_NONCHAR | U8_TRIPLE;

    if (cp <= 0xFFFD)
        return CP_CHAR | U8_TRIPLE;

    if (cp == 0xFFFE || cp == 0xFFFF)
        return CP_NONCHAR | U8_TRIPLE;

    /* The last two values of every plane are non-characters too. */
    if (cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF))
        return CP_NONCHAR | U8_QUAD;

    if (cp <= 0x10FFFF)
        return CP_CHAR | U8_QUAD;

    /* Fits in the four-byte bit pattern but lies beyond 0x10FFFF. */
    if (cp <= 0x1FFFFF)
        return CP_OVERFLOW | U8_QUAD;

    return 0;
}
88
89
216k
/* Writes the UTF-8 encoding of cp to bp, which must have room for up to four
 * bytes, and returns the number of bytes written. Returns 0, writing nothing,
 * when classify() does not report cp as a character or a non-character
 * (that is, for surrogates, overflowing values and unclassifiable values). */
static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
    const unsigned kind = classify(cp);

    if ((kind & (CP_CHAR | CP_NONCHAR)) == 0)
        return 0;

    /* classify() sets exactly one length flag, so dispatch on it. */
    switch (kind & (U8_SINGLE | U8_DOUBLE | U8_TRIPLE | U8_QUAD)) {
        case U8_SINGLE:
            bp[0] = (MVMuint8)cp;
            return 1;

        case U8_DOUBLE:
            bp[0] = (MVMuint8)(0xC0 | (cp >> 6));
            bp[1] = (MVMuint8)(0x80 | (cp & 0x3F));
            return 2;

        case U8_TRIPLE:
            bp[0] = (MVMuint8)(0xE0 | (cp >> 12));
            bp[1] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
            bp[2] = (MVMuint8)(0x80 | (cp & 0x3F));
            return 3;

        case U8_QUAD:
            bp[0] = (MVMuint8)(0xF0 | (cp >> 18));
            bp[1] = (MVMuint8)(0x80 | ((cp >> 12) & 0x3F));
            bp[2] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
            bp[3] = (MVMuint8)(0x80 | (cp & 0x3F));
            return 4;

        default:
            return 0;
    }
}
123
124
 /* end not_gerd section */
125
126
#define UTF8_MAXINC (32 * 1024 * 1024)
127
128
0
static void ensure_buffer(MVMGrapheme32 **buffer, MVMint32 *bufsize, MVMint32 needed) {
129
0
    while (needed >= *bufsize)
130
0
        *buffer = MVM_realloc(*buffer, sizeof(MVMGrapheme32) * (
131
0
            *bufsize >= UTF8_MAXINC ? (*bufsize += UTF8_MAXINC) : (*bufsize *= 2)
132
0
        ));
133
0
}
134
135
static const MVMuint8 hex_chars[] = { '0', '1', '2', '3', '4', '5', '6', '7',
136
                                      '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
137
0
static MVMGrapheme32 synthetic_for(MVMThreadContext *tc, MVMuint8 invalid) {
138
0
    if (invalid > 0x7F) {
139
0
        /* A real invalid. */
140
0
        MVMuint8 high = invalid >> 4;
141
0
        MVMuint8 low = invalid & 0x0F;
142
0
        MVMCodepoint cps[] = { 0x10FFFD, 'x', hex_chars[high], hex_chars[low] };
143
0
        return MVM_nfg_codes_to_grapheme_utf8_c8(tc, cps, 4);
144
0
    }
145
0
    else {
146
0
        /* Was in things thrown out as invalid by the decoder, but has an
147
0
         * ASCII interpretation, so hand it back as is. */
148
0
        return invalid;
149
0
    }
150
0
}
151
152
/* The kind of byte the UTF8-C8 decoder wants to see next. */
typedef enum {
    EXPECT_START,        /* 0: the first byte of a sequence */
    EXPECT_CONTINUATION  /* 1: a continuation byte of the current sequence */
} Expecting;
157
158
/* Decode state for the UTF8-C8 decoder. */
159
typedef struct {
160
    /* The UTF-8 we're decoding. */
161
    const MVMuint8 *utf8;
162
163
    /* The index of the current byte we're decoding. */
164
    size_t cur_byte;
165
166
    /* The index of the first unaccepted byte. */
167
    size_t unaccepted_start;
168
169
    /* What kind of byte we're expecting next. */
170
    Expecting expecting;
171
172
    /* The current codepoint we're decoding. */
173
    MVMCodepoint cur_codepoint;
174
175
    /* The result buffer we're decoding into. */
176
    MVMGrapheme32 *result;
177
178
    /* The current position in the result buffer. */
179
    size_t result_pos;
180
181
    /* Buffer of original codepoints, to ensure we will not spit out any
182
     * synthetics into the result that will re-order on round-trip. */
183
    MVMCodepoint *orig_codes;
184
185
    /* Position we're at in inserting into orig_codes. */
186
    size_t orig_codes_pos;
187
188
    /* First orig_codes index that did not yet go through the normalizer. */
189
    size_t orig_codes_unnormalized;
190
191
    /* The normalizer we're using to make synthetics that will not cause an
192
     * order change on output. */
193
    MVMNormalizer norm;
194
195
    /* Bad bytes from an earlier buffer, for the sake of streaming decode. */
196
    MVMuint8 prev_bad_bytes[4];
197
    MVMint32 num_prev_bad_bytes;
198
} DecodeState;
199
200
/* Appends a single grapheme to the buffer if it will not cause a mismatch
201
 * with the original codepoints upon encoding back to UTF-8. Returns non-zero
202
 * in this case. Otherwise, appends synthetics for the bytes the original code
203
 * points were encoded as. Since we can end up with index mis-matches, we just
204
 * spit out codepoints to catch the normalizer up to everything in the orig
205
 * codes buffer. */
206
624k
static int append_grapheme(MVMThreadContext *tc, DecodeState *state, MVMGrapheme32 g) {
207
624k
    if (g == state->orig_codes[state->orig_codes_unnormalized]) {
208
624k
        /* Easy case: exact match. */
209
624k
        state->result[state->result_pos++] = g;
210
624k
        state->orig_codes_unnormalized++;
211
624k
        return 1;
212
624k
    }
213
0
    else if (g < 0) {
214
0
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
215
0
        int mismatch = 0;
216
0
        if (synth->base == state->orig_codes[state->orig_codes_unnormalized]) {
217
0
            MVMint32 i;
218
0
            for (i = 0; i < synth->num_combs; i++) {
219
0
                size_t orig_idx = state->orig_codes_unnormalized + i + 1;
220
0
                if (orig_idx >= state->orig_codes_pos ||
221
0
                        state->orig_codes[orig_idx] != synth->combs[i]) {
222
0
                    mismatch = 1;
223
0
                    break;
224
0
                }
225
0
            }
226
0
        }
227
0
        else {
228
0
            mismatch = 1;
229
0
        }
230
0
        if (!mismatch) {
231
0
            state->result[state->result_pos++] = g;
232
0
            state->orig_codes_unnormalized += 1 + synth->num_combs;
233
0
            return 1;
234
0
        }
235
0
    }
236
624k
237
624k
    /* If we get here, then normalization would trash the original bytes. */
238
0
    {
239
0
        /* Spit out synthetics to keep the bytes as is. */
240
0
        size_t i, j;
241
0
        for (i = state->orig_codes_unnormalized; i < state->orig_codes_pos; i++) {
242
0
            MVMCodepoint to_encode = state->orig_codes[i];
243
0
            MVMuint8 encoded[4];
244
0
            MVMint32 bytes = utf8_encode(encoded, to_encode);
245
0
            for (j = 0; j < bytes; j++)
246
0
                state->result[state->result_pos++] = synthetic_for(tc, encoded[j]);
247
0
        }
248
0
249
0
        /* Consider all codes pushed now normalized. */
250
0
        state->orig_codes_unnormalized = state->orig_codes_pos;
251
0
252
0
        /* Put a clean normalizer in place. */
253
0
        MVM_unicode_normalizer_cleanup(tc, &(state->norm));
254
0
        MVM_unicode_normalizer_init(tc, &(state->norm), MVM_NORMALIZE_NFG);
255
0
        return 0;
256
624k
    }
257
624k
}
258
259
/* Called when decoding has reached an acceptable codepoint. */
260
624k
static void process_ok_codepoint(MVMThreadContext *tc, DecodeState *state) {
261
624k
    MVMint32 ready;
262
624k
    MVMGrapheme32 g;
263
624k
264
624k
    /* Consider the byte range accepted. */
265
624k
    state->unaccepted_start = state->cur_byte + 1;
266
624k
267
624k
    /* Insert into original codepoints list and hand it to the normalizer. */
268
624k
    state->orig_codes[state->orig_codes_pos++] = state->cur_codepoint;
269
624k
    ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc,
270
624k
            &(state->norm), state->cur_codepoint, &g);
271
624k
272
624k
    /* If the normalizer produced some output... */
273
624k
    if (ready) {
274
608k
        if (append_grapheme(tc, state, g)) {
275
608k
            while (--ready > 0) {
276
0
                g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm));
277
0
                if (!append_grapheme(tc, state, g))
278
0
                    break;
279
0
            }
280
608k
        }
281
608k
    }
282
624k
283
624k
    /* We've no longer any bad bytes to care about from earlier buffers;
284
624k
     * they ended up making an acceptable codepoint. */
285
624k
    state->num_prev_bad_bytes = 0;
286
624k
}
287
288
/* Called when a bad byte has been encountered, or at the end of output. */
289
15.6k
static void process_bad_bytes(MVMThreadContext *tc, DecodeState *state) {
290
15.6k
    size_t i;
291
15.6k
    MVMint32 ready;
292
15.6k
293
15.6k
    /* Flush normalization buffer and take from that. */
294
15.6k
    MVM_unicode_normalizer_eof(tc, &(state->norm));
295
15.6k
    ready = MVM_unicode_normalizer_available(tc, &(state->norm));
296
31.2k
    while (ready-- > 0) {
297
15.6k
        MVMGrapheme32 g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm));
298
15.6k
        if (!append_grapheme(tc, state, g))
299
0
            break;
300
15.6k
    }
301
15.6k
302
15.6k
    /* Now add in synthetics for bad bytes. */
303
15.6k
    for (i = 0; i < state->num_prev_bad_bytes; i++)
304
0
        state->result[state->result_pos++] = synthetic_for(tc, state->prev_bad_bytes[i]);
305
15.6k
    state->num_prev_bad_bytes = 0;
306
15.6k
    for (i = state->unaccepted_start; i <= state->cur_byte; i++)
307
0
        state->result[state->result_pos++] = synthetic_for(tc, state->utf8[i]);
308
15.6k
    state->unaccepted_start = state->cur_byte + 1;
309
15.6k
}
310
311
/* Decodes the specified number of bytes of utf8 into an NFG string, creating
312
 * a result of the specified type. The type must have the MVMString REPR. */
313
MVMString * MVM_string_utf8_c8_decode(MVMThreadContext *tc, const MVMObject *result_type,
314
15.6k
                                      const char *utf8, size_t bytes) {
315
15.6k
    DecodeState state;
316
15.6k
317
15.6k
    /* Local state for decode loop. */
318
15.6k
    int expected_continuations = 0;
319
15.6k
    int min_expected_codepoint;
320
15.6k
321
15.6k
    /* Don't do anything if empty. */
322
15.6k
    if (bytes == 0)
323
0
        return tc->instance->str_consts.empty;
324
15.6k
325
15.6k
    /* Decoding state, in a struct to easily pass to utility routines.
326
15.6k
     * Result buffer is a maximum estimate to avoid realloc; we can shrink
327
15.6k
     * it at the end. */
328
15.6k
    state.utf8 = (MVMuint8 *)utf8;
329
15.6k
    state.cur_byte = 0;
330
15.6k
    state.unaccepted_start = 0;
331
15.6k
    state.expecting = EXPECT_START;
332
15.6k
    state.cur_codepoint = 0;
333
15.6k
    state.result = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
334
15.6k
    state.result_pos = 0;
335
15.6k
    state.orig_codes = MVM_malloc(sizeof(MVMCodepoint) * bytes);
336
15.6k
    state.orig_codes_pos = 0;
337
15.6k
    state.orig_codes_unnormalized = 0;
338
15.6k
    state.num_prev_bad_bytes = 0;
339
15.6k
    MVM_unicode_normalizer_init(tc, &(state.norm), MVM_NORMALIZE_NFG);
340
15.6k
341
639k
    while (state.cur_byte < bytes) {
342
624k
        MVMuint8 decode_byte = utf8[state.cur_byte];
343
624k
        switch (state.expecting) {
344
624k
            case EXPECT_START:
345
624k
                if ((decode_byte & 0x80) == 0) {
346
624k
                    /* Single byte sequence. */
347
624k
                    state.cur_codepoint = decode_byte;
348
624k
                    process_ok_codepoint(tc, &state);
349
624k
                }
350
0
                else if ((decode_byte & 0xE0) == 0xC0) {
351
0
                    state.cur_codepoint = decode_byte & 0x1F;
352
0
                    state.expecting = EXPECT_CONTINUATION;
353
0
                    expected_continuations = 1;
354
0
                    min_expected_codepoint = 0x80;
355
0
                }
356
0
                else if ((decode_byte & 0xF0) == 0xE0) {
357
0
                    state.cur_codepoint = decode_byte & 0x0F;
358
0
                    state.expecting = EXPECT_CONTINUATION;
359
0
                    expected_continuations = 2;
360
0
                    min_expected_codepoint = 0x800;
361
0
                }
362
0
                else if ((decode_byte & 0xF8) == 0xF0) {
363
0
                    state.cur_codepoint = decode_byte & 0x07;
364
0
                    state.expecting = EXPECT_CONTINUATION;
365
0
                    expected_continuations = 3;
366
0
                    min_expected_codepoint = 0x10000;
367
0
                }
368
0
                else {
369
0
                    /* Invalid byte sequence. */
370
0
                    process_bad_bytes(tc, &state);
371
0
                }
372
624k
                break;
373
0
            case EXPECT_CONTINUATION:
374
0
                if ((decode_byte & 0xC0) == 0x80) {
375
0
                    state.cur_codepoint = (state.cur_codepoint << 6)
376
0
                                          | (decode_byte & 0x3F);
377
0
                    expected_continuations--;
378
0
                    if (expected_continuations == 0) {
379
0
                        if (state.cur_codepoint >= min_expected_codepoint)
380
0
                            process_ok_codepoint(tc, &state);
381
0
                        else
382
0
                            process_bad_bytes(tc, &state);
383
0
                        state.expecting = EXPECT_START;
384
0
                    }
385
0
                }
386
0
                else {
387
0
                    /* Invalid byte sequence. */
388
0
                    process_bad_bytes(tc, &state);
389
0
                    state.expecting = EXPECT_START;
390
0
                }
391
0
                break;
392
624k
        }
393
624k
        state.cur_byte++;
394
624k
    }
395
15.6k
396
15.6k
    /* Handle anything dangling off the end. */
397
15.6k
    state.cur_byte--; /* So we don't read 1 past the end. */
398
15.6k
    process_bad_bytes(tc, &state);
399
15.6k
400
15.6k
    MVM_free(state.orig_codes);
401
15.6k
    MVM_unicode_normalizer_cleanup(tc, &(state.norm));
402
15.6k
403
15.6k
    {
404
15.6k
        MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
405
15.6k
        result->body.storage.blob_32 = state.result;
406
15.6k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
407
15.6k
        result->body.num_graphs      = state.result_pos;
408
15.6k
        return result;
409
15.6k
    }
410
15.6k
}
411
412
/* Decodes using a decodestream. Decodes as far as it can with the input
413
 * buffers, or until a stopper is reached. */
414
MVMuint32 MVM_string_utf8_c8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
415
                                     const MVMint32 *stopper_chars,
416
                                     MVMDecodeStreamSeparators *seps,
417
0
                                     MVMint32 eof) {
418
0
    /* Local state for decode loop. */
419
0
    MVMDecodeStreamBytes *cur_bytes;
420
0
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
421
0
    MVMint32 last_accept_pos = ds->bytes_head_pos;
422
0
    DecodeState state;
423
0
    int expected_continuations = 0;
424
0
    int min_expected_codepoint;
425
0
    MVMuint32 reached_stopper = 0;
426
0
    MVMint32 result_graphs = 0;
427
0
428
0
    /* If there's no buffers, we're done. */
429
0
    if (!ds->bytes_head)
430
0
        return 0;
431
0
    last_accept_pos = ds->bytes_head_pos;
432
0
433
0
    /* If we're asked for zero chars, also done. */
434
0
    if (stopper_chars && *stopper_chars == 0)
435
0
        return 1;
436
0
437
0
    /* Otherwise set up decode state, stealing normalizer of the decode
438
0
     * stream and re-instating any past orig_codes. */
439
0
    state.expecting = EXPECT_START;
440
0
    state.cur_codepoint = 0;
441
0
    state.num_prev_bad_bytes = 0;
442
0
    memcpy(&(state.norm), &(ds->norm), sizeof(MVMNormalizer));
443
0
    if (ds->decoder_state) {
444
0
        MVMCodepoint *saved = (MVMCodepoint *)ds->decoder_state;
445
0
        state.orig_codes = MVM_malloc(
446
0
            sizeof(MVMCodepoint) * (saved[0] + ds->bytes_head->length)
447
0
        );
448
0
        state.orig_codes_pos = saved[0];
449
0
        state.orig_codes_unnormalized = 0;
450
0
        memcpy(state.orig_codes, saved + 1, saved[0] * sizeof(MVMCodepoint));
451
0
        MVM_free(ds->decoder_state);
452
0
        ds->decoder_state = NULL;
453
0
    }
454
0
    else {
455
0
        state.orig_codes = NULL;
456
0
        state.orig_codes_pos = 0;
457
0
        state.orig_codes_unnormalized = 0;
458
0
    }
459
0
460
0
    /* Decode each of the buffers. */
461
0
    cur_bytes = ds->bytes_head;
462
0
    reached_stopper = 0;
463
0
    while (cur_bytes && !reached_stopper) {
464
0
        /* Set up decode state for this buffer. */
465
0
        MVMint32 bytes = ds->bytes_head->length;
466
0
        state.result = MVM_malloc(bytes * sizeof(MVMGrapheme32));
467
0
        state.orig_codes = MVM_realloc(state.orig_codes,
468
0
            sizeof(MVMCodepoint) * (state.orig_codes_pos + bytes));
469
0
        state.result_pos = 0;
470
0
        state.utf8 = (const MVMuint8*)cur_bytes->bytes;
471
0
        state.cur_byte = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
472
0
        state.unaccepted_start = state.cur_byte;
473
0
474
0
        /* Process this buffer. */
475
0
        while (state.cur_byte < bytes) {
476
0
            /* Process a byte. */
477
0
            MVMuint8 decode_byte = state.utf8[state.cur_byte];
478
0
            MVMint32 maybe_new_graph = 0;
479
0
            switch (state.expecting) {
480
0
                case EXPECT_START:
481
0
                    if ((decode_byte & 0x80) == 0) {
482
0
                        /* Single byte sequence. */
483
0
                        state.cur_codepoint = decode_byte;
484
0
                        process_ok_codepoint(tc, &state);
485
0
                        maybe_new_graph = 1;
486
0
                    }
487
0
                    else if ((decode_byte & 0xE0) == 0xC0) {
488
0
                        state.cur_codepoint = decode_byte & 0x1F;
489
0
                        state.expecting = EXPECT_CONTINUATION;
490
0
                        expected_continuations = 1;
491
0
                        min_expected_codepoint = 0x80;
492
0
                    }
493
0
                    else if ((decode_byte & 0xF0) == 0xE0) {
494
0
                        state.cur_codepoint = decode_byte & 0x0F;
495
0
                        state.expecting = EXPECT_CONTINUATION;
496
0
                        expected_continuations = 2;
497
0
                        min_expected_codepoint = 0x800;
498
0
                    }
499
0
                    else if ((decode_byte & 0xF8) == 0xF0) {
500
0
                        state.cur_codepoint = decode_byte & 0x07;
501
0
                        state.expecting = EXPECT_CONTINUATION;
502
0
                        expected_continuations = 3;
503
0
                        min_expected_codepoint = 0x10000;
504
0
                    }
505
0
                    else {
506
0
                        /* Invalid byte sequence. */
507
0
                        process_bad_bytes(tc, &state);
508
0
                        maybe_new_graph = 1;
509
0
                    }
510
0
                    break;
511
0
                case EXPECT_CONTINUATION:
512
0
                    if ((decode_byte & 0xC0) == 0x80) {
513
0
                        state.cur_codepoint = (state.cur_codepoint << 6)
514
0
                                              | (decode_byte & 0x3F);
515
0
                        expected_continuations--;
516
0
                        if (expected_continuations == 0) {
517
0
                            if (state.cur_codepoint >= min_expected_codepoint)
518
0
                                process_ok_codepoint(tc, &state);
519
0
                            else
520
0
                                process_bad_bytes(tc, &state);
521
0
                            maybe_new_graph = 1;
522
0
                            state.expecting = EXPECT_START;
523
0
                        }
524
0
                    }
525
0
                    else {
526
0
                        /* Invalid byte sequence. */
527
0
                        process_bad_bytes(tc, &state);
528
0
                        maybe_new_graph = 1;
529
0
                        state.expecting = EXPECT_START;
530
0
                    }
531
0
                    break;
532
0
            }
533
0
            state.cur_byte++;
534
0
535
0
            /* See if we've reached a stopper. */
536
0
            if (maybe_new_graph && state.result_pos > 0) {
537
0
                if (stopper_chars) {
538
0
                    if (result_graphs + state.result_pos >= *stopper_chars) {
539
0
                        reached_stopper = 1;
540
0
                        break;
541
0
                    }
542
0
                }
543
0
                if (MVM_string_decode_stream_maybe_sep(tc, seps,
544
0
                            state.result[state.result_pos - 1])) {
545
0
                    reached_stopper = 1;
546
0
                    break;
547
0
                }
548
0
            }
549
0
        }
550
0
551
0
        /* If we're at EOF and this is the last buffer, force out last bytes. */
552
0
        if (eof && !reached_stopper && !cur_bytes->next) {
553
0
            state.cur_byte--; /* So we don't read 1 past the end. */
554
0
            process_bad_bytes(tc, &state);
555
0
        }
556
0
557
0
        /* Attach what we successfully parsed as a result buffer, and trim away
558
0
         * what we chewed through. */
559
0
        if (state.result_pos)
560
0
            MVM_string_decodestream_add_chars(tc, ds, state.result, state.result_pos);
561
0
        else
562
0
            MVM_free(state.result);
563
0
        result_graphs += state.result_pos;
564
0
565
0
        /* Update our accepted position. */
566
0
        if (state.unaccepted_start > 0) {
567
0
            last_accept_bytes = cur_bytes;
568
0
            last_accept_pos = state.unaccepted_start;
569
0
        }
570
0
571
0
        /* If there were bytes we didn't accept, hold on to them in case we
572
0
         * need to emit them as bad bytes. */
573
0
        if (state.unaccepted_start != state.cur_byte && cur_bytes->next) {
574
0
            int i;
575
0
            for (i = state.unaccepted_start; i < state.cur_byte; i++)
576
0
                state.prev_bad_bytes[state.num_prev_bad_bytes++] = state.utf8[i];
577
0
        }
578
0
579
0
        cur_bytes = cur_bytes->next;
580
0
    }
581
0
582
0
    /* Eat the bytes we decoded. */
583
0
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
584
0
585
0
    /* Persist current normalizer. */
586
0
    memcpy(&(ds->norm), &(state.norm), sizeof(MVMNormalizer));
587
0
588
0
    /* Stash away any leftover codepoints we'll need to examine. */
589
0
    if (state.orig_codes_pos && state.orig_codes_pos != state.orig_codes_unnormalized) {
590
0
        size_t diff = state.orig_codes_pos - state.orig_codes_unnormalized;
591
0
        MVMCodepoint *saved = MVM_malloc(sizeof(MVMCodepoint) * (1 + diff));
592
0
        saved[0] = diff;
593
0
        memcpy(saved + 1, state.orig_codes + state.orig_codes_unnormalized,
594
0
            diff * sizeof(MVMCodepoint));
595
0
        ds->decoder_state = saved;
596
0
    }
597
0
    MVM_free(state.orig_codes);
598
0
599
0
    return reached_stopper;
600
0
}
601
602
/* Encodes the specified string to UTF-8. */
603
static void emit_cp(MVMThreadContext *tc, MVMCodepoint cp, MVMuint8 **result,
604
                    size_t *result_pos, size_t *result_limit,
605
216k
                    MVMuint8 *repl_bytes, MVMuint64 repl_length) {
606
216k
    MVMint32 bytes;
607
216k
    if (*result_pos >= *result_limit) {
608
0
        *result_limit *= 2;
609
0
        *result = MVM_realloc(*result, *result_limit + 4);
610
0
    }
611
216k
    bytes = utf8_encode(*result + *result_pos, cp);
612
216k
    if (bytes)
613
216k
        *result_pos += bytes;
614
0
    else if (repl_bytes) {
615
0
        if (repl_length >= *result_limit || *result_pos >= *result_limit - repl_length) {
616
0
            *result_limit += repl_length;
617
0
            *result = MVM_realloc(*result, *result_limit + 4);
618
0
        }
619
0
        memcpy(*result + *result_pos, repl_bytes, repl_length);
620
0
        *result_pos += repl_length;
621
0
    }
622
0
    else {
623
0
        MVM_free(*result);
624
0
        MVM_free(repl_bytes);
625
0
        MVM_string_utf8_throw_encoding_exception(tc, cp);
626
0
    }
627
216k
}
628
0
/* Converts one hex digit codepoint ('0'..'9' or upper-case 'A'..'F') to its
 * value 0..15. Any other codepoint means the synthetic being read back is
 * corrupt, and an exception is thrown. */
static int hex2int(MVMThreadContext *tc, MVMCodepoint cp) {
    if ('0' <= cp && cp <= '9')
        return cp - '0';
    if ('A' <= cp && cp <= 'F')
        return cp - 'A' + 10;
    MVM_exception_throw_adhoc(tc, "UTF-8 C-8 encoding encountered corrupt synthetic");
}
636
/* Encodes str to UTF-8 Clean-8 and returns a newly allocated buffer, which
 * is not NUL-terminated. If output_size is non-NULL, the number of bytes
 * produced is stored there. UTF-8 C-8 synthetics are written back as the raw
 * byte they stand for; other synthetics are written codepoint by codepoint.
 * Codepoints that cannot be encoded are replaced by the encoding of
 * `replacement` when one is given; otherwise an exception is thrown.
 *
 * start and length are range-checked (a length of -1 means the whole
 * string) and length is used for the initial size estimate.
 * NOTE(review): the loop below iterates from MVM_string_gi_init until the
 * iterator is exhausted and never positions it using start or length, so it
 * looks as if the whole of str is always encoded -- confirm against callers
 * and the grapheme iterator API. */
char * MVM_string_utf8_c8_encode_substr(MVMThreadContext *tc,
        MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement) {
    MVMuint8        *result;
    size_t           result_pos, result_limit;
    MVMGraphemeIter  gi;
    MVMStringIndex   strgraphs = MVM_string_graphs(tc, str);
    MVMuint8        *repl_bytes = NULL;
    /* Initialized so that no indeterminate value is passed to emit_cp when
     * there is no replacement. */
    MVMuint64        repl_length = 0;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length == -1)
        length = strgraphs;
    if (length < 0 || start + length > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_utf8_c8_encode_substr(tc, replacement, &repl_length, 0, -1, NULL);

    /* Guesstimate that we'll be within 2 bytes for most chars most of the
     * time, and give ourselves 4 bytes breathing space. The limit is kept
     * at 4 or more so that doubling it always makes real progress. */
    result_limit = 2 * length;
    if (result_limit < 4)
        result_limit = 4;
    result       = MVM_malloc(result_limit + 4);
    result_pos   = 0;

    /* We iterate graphemes, looking out for any synthetics. If we find a
     * UTF-8 C-8 synthetic, then we spit out the raw byte. If we find any
     * other synthetic, we iterate its codepoints. */
    MVM_string_gi_init(tc, &gi, str);
    while (MVM_string_gi_has_more(tc, &gi)) {
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
        if (g >= 0) {
            emit_cp(tc, g, &result, &result_pos, &result_limit, repl_bytes, repl_length);
        }
        else {
            MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
            if (synth->is_utf8_c8) {
                /* UTF-8 C-8 synthetic; emit the byte. Grow with the same
                 * 4 spare bytes that emit_cp relies on: a smaller
                 * allocation here would let a later emit_cp write past the
                 * end of the buffer. */
                if (result_pos >= result_limit) {
                    do {
                        result_limit *= 2;
                    } while (result_pos >= result_limit);
                    result = MVM_realloc(result, result_limit + 4);
                }
                /* combs[1] and combs[2] hold the high and low hex digit. */
                result[result_pos++] = (hex2int(tc, synth->combs[1]) << 4) +
                    hex2int(tc, synth->combs[2]);
            }
            else {
                MVMint32 i;
                emit_cp(tc, synth->base, &result, &result_pos, &result_limit, repl_bytes, repl_length);
                for (i = 0; i < synth->num_combs; i++)
                    emit_cp(tc, synth->combs[i], &result, &result_pos, &result_limit, repl_bytes, repl_length);
            }
        }
    }

    if (output_size)
        *output_size = (MVMuint64)result_pos;
    MVM_free(repl_bytes);
    return (char *)result;
}
695
696
/* Encodes the specified string to UTF-8 C-8. */
697
10.5k
char * MVM_string_utf8_c8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size) {
698
10.5k
    return MVM_string_utf8_c8_encode_substr(tc, str, output_size, 0,
699
10.5k
        MVM_string_graphs(tc, str), NULL);
700
10.5k
}
701
702
/* Encodes the specified string to a UTF-8 C-8 C string. */
703
10.5k
char * MVM_string_utf8_c8_encode_C_string(MVMThreadContext *tc, MVMString *str) {
704
10.5k
    MVMuint64 output_size;
705
10.5k
    char *result;
706
10.5k
    char *utf8_string = MVM_string_utf8_c8_encode(tc, str, &output_size);
707
10.5k
    result = MVM_malloc(output_size + 1);
708
10.5k
    memcpy(result, utf8_string, output_size);
709
10.5k
    MVM_free(utf8_string);
710
10.5k
    result[output_size] = (char)0;
711
10.5k
    return result;
712
10.5k
}