Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/utf8.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* The below section has an MIT-style license, included here.
4
5
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7
 *
8
 * Permission is hereby granted, free of charge, to any person obtaining
9
 * a copy of this software and associated documentation files (the
10
 * "Software"), to deal in the Software without restriction, including
11
 * without limitation the rights to use, copy, modify, merge, publish,
12
 * distribute, sublicense, and/or sell copies of the Software, and to
13
 * permit persons to whom the Software is furnished to do so, subject
14
 * to the following conditions:
15
 *
16
 * The above copyright notice and this permission notice shall be
17
 * included in all copies or substantial portions of the Software.
18
 *
19
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
 * SOFTWARE.
27
 */
28
1.52M
/* Distinguished states of the UTF-8 decoding automaton. The decoders in this
 * file start in UTF8_ACCEPT; decode_utf8_byte adds the state value to a
 * character class to index the transition part of utf8d. */
enum {
    UTF8_ACCEPT = 0,   /* a complete codepoint has just been decoded */
    UTF8_REJECT = 12   /* the byte sequence is not valid UTF-8 */
};
30
31
static const MVMuint8 utf8d[] = {
  /* Part one (256 entries): maps each byte value to a character class.
   * Working with classes rather than raw bytes keeps the transition table
   * below small, and the class number doubles as the shift used to mask
   * the payload bits out of a lead byte. One row per 32 byte values. */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x1F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x3F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x5F */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x7F */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 0x80-0x9F */
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* 0xA0-0xBF */
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xDF */
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /* 0xE0-0xFF */

  /* Part two (starts at index 256): the transition table. The entry at
   * 256 + state + class is the next state. States are multiples of 12, so
   * each line below holds the rows for two states of 12 classes each. */
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, /* states  0, 12 */
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, /* states 24, 36 */
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, /* states 48, 60 */
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, /* states 72, 84 */
  12,36,12,12,12,12,12,12,12,12,12,12,                                      /* state  96     */
};
51
52
static MVMint32
53
801k
decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) {
54
801k
  MVMint32 type = utf8d[byte];
55
801k
56
801k
  *codep = (*state != UTF8_ACCEPT) ?
57
107k
    (byte & 0x3fu) | (*codep << 6) :
58
694k
    (0xff >> type) & (byte);
59
801k
60
801k
  *state = utf8d[256 + *state + type];
61
801k
  return *state;
62
801k
}
63
/* end Bjoern Hoehrmann section (some things were changed from the original) */
64
65
/* begin not_gerd section (modified from original)
66
// Copyright 2012 not_gerd
67
// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122
68
69
Permission is granted to use, modify, and / or redistribute at will.
70
71
This includes removing authorship notices, re-use of code parts in
72
other software (with or without giving credit), and / or creating a
73
commercial product based on it.
74
75
This permission is not revocable by the author.
76
77
This software is provided as-is. Use it at your own risk. There is
78
no warranty whatsoever, neither expressed nor implied, and by using
79
this software you accept that the author(s) shall not be held liable
80
for any loss of data, loss of service, or other damages, be they
81
incidental or consequential. Your only option other than accepting
82
this is not to use the software at all.
83
*/
84
85
/* Bit flags returned by classify(): one codepoint-kind flag combined with
 * one encoded-width flag. */
enum {
    /* Kind of codepoint. */
    CP_CHAR            = 0x001,
    CP_LOW_SURROGATE   = 0x002,
    CP_HIGH_SURROGATE  = 0x004,
    CP_NONCHAR         = 0x008,
    CP_OVERFLOW        = 0x010,

    /* Number of bytes in the UTF-8 form. */
    U8_SINGLE          = 0x020,
    U8_DOUBLE          = 0x040,
    U8_TRIPLE          = 0x080,
    U8_QUAD            = 0x100
};
97
98
1.10M
/* Classifies a codepoint for UTF-8 encoding.
 *
 * Returns one CP_* kind flag OR-ed with one U8_* width flag, or 0 when the
 * value is above 0x1FFFFF.
 *
 * NOTE(review): cp is compared as-is, so any negative value satisfies the
 * first test and is reported as a single-byte character - confirm that
 * callers never pass negative values. */
static unsigned classify(MVMCodepoint cp) {
    if (cp <= 0x7F)
        return CP_CHAR | U8_SINGLE;

    if (cp <= 0x07FF)
        return CP_CHAR | U8_DOUBLE;

    if (cp <= 0xFFFF) {
        /* Three-byte range: pick out the surrogates and the noncharacters. */
        if (cp >= 0xD800 && cp <= 0xDBFF)
            return CP_HIGH_SURROGATE | U8_TRIPLE;
        if (cp >= 0xDC00 && cp <= 0xDFFF)
            return CP_LOW_SURROGATE | U8_TRIPLE;
        if (cp >= 0xFDD0 && cp <= 0xFDEF)
            return CP_NONCHAR | U8_TRIPLE;
        if (cp >= 0xFFFE)
            return CP_NONCHAR | U8_TRIPLE;
        return CP_CHAR | U8_TRIPLE;
    }

    if (cp <= 0x10FFFF) {
        /* Four-byte range: a value whose low 16 bits are FFFE or FFFF is a
         * noncharacter. */
        const MVMCodepoint low16 = cp & 0xFFFF;
        return (low16 >= 0xFFFE ? CP_NONCHAR : CP_CHAR) | U8_QUAD;
    }

    /* Fits the four-byte bit layout but is above the 0x10FFFF limit. */
    if (cp <= 0x1FFFFF)
        return CP_OVERFLOW | U8_QUAD;

    return 0;
}
131
132
1.10M
/* Writes the UTF-8 encoding of cp to bp, which must have room for up to
 * four bytes.
 *
 * Returns the number of bytes written (1 to 4), or 0 without writing
 * anything when classify() reports cp as neither a character nor a
 * noncharacter (surrogates, overflow, unclassified values). */
static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
    const unsigned cls = classify(cp);

    if ((cls & (CP_CHAR | CP_NONCHAR)) == 0)
        return 0;

    if (cls & U8_SINGLE) {
        bp[0] = (MVMuint8)cp;
        return 1;
    }

    if (cls & U8_DOUBLE) {
        /* 110xxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xC0 |  (cp >> 6));
        bp[1] = (MVMuint8)(0x80 |  (cp &  0x3F));
        return 2;
    }

    if (cls & U8_TRIPLE) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xE0 |  (cp >> 12));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | ( cp       & 0x3F));
        return 3;
    }

    if (cls & U8_QUAD) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xF0 |  (cp >> 18));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 12) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | ((cp >>  6) & 0x3F));
        bp[3] = (MVMuint8)(0x80 | ( cp        & 0x3F));
        return 4;
    }

    return 0;
}
166
167
 /* end not_gerd section */
168
169
0
/* Growth cap, in graphemes (32 * 1024 * 1024): once the decode buffer holds
 * at least this many slots it grows by this amount instead of doubling. */
#define UTF8_MAXINC (1 << 25)
170
171
/* Decodes the specified number of bytes of utf8 into an NFG string, creating
172
 * a result of the specified type. The type must have the MVMString REPR. */
173
25.9k
MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
174
25.9k
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
175
25.9k
    MVMint32 count = 0;
176
25.9k
    MVMCodepoint codepoint;
177
25.9k
    MVMint32 line_ending = 0;
178
25.9k
    MVMint32 state = 0;
179
25.9k
    MVMint32 bufsize = bytes;
180
25.9k
    MVMGrapheme32 lowest_graph  =  0x7fffffff;
181
25.9k
    MVMGrapheme32 highest_graph = -0x7fffffff;
182
25.9k
    MVMGrapheme32 *buffer = MVM_malloc(sizeof(MVMGrapheme32) * bufsize);
183
25.9k
    size_t orig_bytes;
184
25.9k
    const char *orig_utf8;
185
25.9k
    MVMint32 line;
186
25.9k
    MVMint32 col;
187
25.9k
    MVMint32 ready;
188
25.9k
189
25.9k
    /* Need to normalize to NFG as we decode. */
190
25.9k
    MVMNormalizer norm;
191
25.9k
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
192
25.9k
193
25.9k
    orig_bytes = bytes;
194
25.9k
    orig_utf8 = utf8;
195
25.9k
196
343k
    for (; bytes; ++utf8, --bytes) {
197
317k
        switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) {
198
210k
        case UTF8_ACCEPT: { /* got a codepoint */
199
210k
            MVMGrapheme32 g;
200
210k
            ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g);
201
210k
            if (ready) {
202
184k
                while (count + ready >= bufsize) { /* if the buffer's full make a bigger one */
203
0
                    buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (
204
0
                        bufsize >= UTF8_MAXINC ? (bufsize += UTF8_MAXINC) : (bufsize *= 2)
205
0
                    ));
206
0
                }
207
184k
                buffer[count++] = g;
208
149k
                lowest_graph = g < lowest_graph ? g : lowest_graph;
209
102k
                highest_graph = g > highest_graph ? g : highest_graph;
210
184k
                while (--ready > 0) {
211
299
                    g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
212
288
                    lowest_graph = g < lowest_graph ? g : lowest_graph;
213
283
                    highest_graph = g > highest_graph ? g : highest_graph;
214
299
                    buffer[count++] = g;
215
299
                }
216
184k
            }
217
210k
            break;
218
210k
        }
219
0
        case UTF8_REJECT:
220
0
            /* found a malformed sequence; parse it again this time tracking
221
0
             * line and col numbers. */
222
0
            MVM_unicode_normalizer_cleanup(tc, &norm); /* Since we'll throw. */
223
0
            bytes = orig_bytes; utf8 = orig_utf8; state = 0; line = 1; col = 1;
224
0
            for (; bytes; ++utf8, --bytes) {
225
0
                switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) {
226
0
                case UTF8_ACCEPT:
227
0
                    /* this could be reorganized into several nested ugly if/else :/ */
228
0
                    if (!line_ending && (codepoint == 10 || codepoint == 13)) {
229
0
                        /* Detect the style of line endings.
230
0
                         * Select whichever comes first.
231
0
                         * First or only part of first line ending. */
232
0
                        line_ending = codepoint;
233
0
                        col = 1; line++;
234
0
                    }
235
0
                    else if (line_ending && codepoint == line_ending) {
236
0
                        /* first or only part of next line ending */
237
0
                        col = 1; line++;
238
0
                    }
239
0
                    else if (codepoint == 10 || codepoint == 13) {
240
0
                        /* second part of line ending; ignore */
241
0
                    }
242
0
                    else /* non-line ending codepoint */
243
0
                        col++;
244
0
                    break;
245
0
                case UTF8_REJECT:
246
0
                    MVM_free(buffer);
247
0
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col);
248
0
                }
249
0
            }
250
0
            MVM_free(buffer);
251
0
            MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!");
252
0
            break;
253
317k
        }
254
317k
    }
255
25.9k
    if (state != UTF8_ACCEPT) {
256
0
        MVM_unicode_normalizer_cleanup(tc, &norm);
257
0
        MVM_free(buffer);
258
0
        MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string");
259
0
    }
260
25.9k
261
25.9k
    /* Get any final graphemes from the normalizer, and clean it up. */
262
25.9k
    MVM_unicode_normalizer_eof(tc, &norm);
263
25.9k
    ready = MVM_unicode_normalizer_available(tc, &norm);
264
25.9k
    if (ready) {
265
25.8k
        if (count + ready >= bufsize) {
266
19.9k
            buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready));
267
19.9k
        }
268
51.6k
        while (ready--) {
269
25.8k
            MVMGrapheme32 g;
270
25.8k
            g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
271
25.4k
            lowest_graph = g < lowest_graph ? g : lowest_graph;
272
12.9k
            highest_graph = g > highest_graph ? g : highest_graph;
273
25.8k
            buffer[count++] = g;
274
25.8k
        }
275
25.8k
    }
276
25.9k
    MVM_unicode_normalizer_cleanup(tc, &norm);
277
25.9k
278
25.9k
    /* If we're lucky, we can fit our string in 8 bits per grapheme.
279
25.9k
     * That happens when our lowest value is bigger than -129 and our
280
25.9k
     * highest value is lower than 128. */
281
25.9k
    if (lowest_graph >= -128 && highest_graph < 128) {
282
20.0k
        MVMGrapheme8 *new_buffer = MVM_malloc(sizeof(MVMGrapheme8) * count);
283
174k
        for (ready = 0; ready < count; ready++) {
284
154k
            new_buffer[ready] = buffer[ready];
285
154k
        }
286
20.0k
        MVM_free(buffer);
287
20.0k
        result->body.storage.blob_8  = new_buffer;
288
20.0k
        result->body.storage_type    = MVM_STRING_GRAPHEME_8;
289
5.97k
    } else {
290
5.97k
        /* just keep the same buffer as the MVMString's buffer.  Later
291
5.97k
         * we can add heuristics to resize it if we have enough free
292
5.97k
         * memory */
293
5.97k
        if (bufsize - count > 4) {
294
5.35k
            buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32));
295
5.35k
        }
296
5.97k
        result->body.storage.blob_32 = buffer;
297
5.97k
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
298
5.97k
    }
299
25.9k
    result->body.num_graphs      = count;
300
25.9k
301
25.9k
    return result;
302
25.9k
}
303
304
177
static MVMint32 its_the_bom(const char *utf8) {
305
177
    const MVMuint8 *uns_utf8 = (const MVMuint8 *)utf8;
306
0
    return uns_utf8[0] == 0xEF && uns_utf8[1] == 0xBB && uns_utf8[2] == 0xBF;
307
177
}
308
309
/* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */
310
4
MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
311
4
    if (bytes >= 3 && its_the_bom(utf8)) {
312
0
        utf8 += 3;
313
0
        bytes -= 3;
314
0
    }
315
4
    return MVM_string_utf8_decode(tc, result_type, utf8, bytes);
316
4
}
317
318
/* Decodes using a decodestream. Decodes as far as it can with the input
319
 * buffers, or until a stopper is reached. */
320
MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
321
                                  const MVMint32 *stopper_chars,
322
242
                                  MVMDecodeStreamSeparators *seps) {
323
242
    MVMint32 count = 0, total = 0;
324
242
    MVMint32 state = 0;
325
242
    MVMCodepoint codepoint = 0;
326
242
    MVMint32 bufsize;
327
242
    MVMGrapheme32 *buffer;
328
242
    MVMDecodeStreamBytes *cur_bytes;
329
242
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
330
242
    MVMint32 last_accept_pos, ready, at_start;
331
242
    MVMuint32 reached_stopper;
332
242
333
242
    /* If there's no buffers, we're done. */
334
242
    if (!ds->bytes_head)
335
24
        return 0;
336
218
    last_accept_pos = ds->bytes_head_pos;
337
218
338
218
    /* If we're asked for zero chars, also done. */
339
218
    if (stopper_chars && *stopper_chars == 0)
340
0
        return 1;
341
218
342
218
    /* Rough starting-size estimate is number of bytes in the head buffer. */
343
218
    bufsize = ds->bytes_head->length;
344
218
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
345
218
346
218
    /* Decode each of the buffers. */
347
218
    cur_bytes = ds->bytes_head;
348
218
    at_start = ds->abs_byte_pos == 0;
349
218
    reached_stopper = 0;
350
402
    while (cur_bytes) {
351
219
        /* Process this buffer. */
352
218
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
353
219
        char     *bytes = cur_bytes->bytes;
354
219
        if (at_start) {
355
180
            /* We're right at the start of the stream of things to decode. See
356
180
             * if we have a BOM, and skip over it if so. */
357
180
            if (pos + 3 <= cur_bytes->length) {
358
176
                if (its_the_bom(bytes + pos)) {
359
0
                    pos += 3;
360
0
                    last_accept_bytes = cur_bytes;
361
0
                    last_accept_pos = pos;
362
0
                }
363
176
            }
364
180
            at_start = 0;
365
180
        }
366
484k
        while (pos < cur_bytes->length) {
367
483k
            switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
368
483k
            case UTF8_ACCEPT: {
369
483k
                MVMint32 first = 1;
370
483k
                MVMGrapheme32 g;
371
483k
                last_accept_bytes = cur_bytes;
372
483k
                last_accept_pos = pos;
373
483k
                ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &(ds->norm), codepoint, &g);
374
967k
                while (ready--) {
375
483k
                    if (first)
376
468k
                        first = 0;
377
483k
                    else
378
14.9k
                        g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
379
483k
                    if (count == bufsize) {
380
0
                        /* Valid character, but we filled the buffer. Attach this
381
0
                        * one to the buffers linked list, and continue with a new
382
0
                        * one. */
383
0
                        MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
384
0
                        buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
385
0
                        count = 0;
386
0
                    }
387
483k
                    buffer[count++] = g;
388
483k
                    total++;
389
483k
                    if (stopper_chars && *stopper_chars == total) {
390
5
                        reached_stopper = 1;
391
5
                        goto done;
392
5
                    }
393
483k
                    if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) {
394
30
                        reached_stopper = 1;
395
30
                        goto done;
396
30
                    }
397
483k
                }
398
483k
                break;
399
483k
            }
400
0
            case UTF8_REJECT:
401
0
                MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
402
0
                break;
403
483k
            }
404
483k
        }
405
184
        cur_bytes = cur_bytes->next;
406
184
    }
407
218
  done:
408
218
409
218
    /* Attach what we successfully parsed as a result buffer, and trim away
410
218
     * what we chewed through. */
411
218
    if (count) {
412
214
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
413
214
    }
414
4
    else {
415
4
        MVM_free(buffer);
416
4
    }
417
218
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
418
218
419
218
    return reached_stopper;
420
218
}
421
422
/* Encodes the specified string to UTF-8. */
423
char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
424
        MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
425
13.6k
        MVMString *replacement, MVMint32 translate_newlines) {
426
13.6k
    MVMuint8        *result;
427
13.6k
    size_t           result_pos, result_limit;
428
13.6k
    MVMCodepointIter ci;
429
13.6k
    MVMStringIndex   strgraphs = MVM_string_graphs(tc, str);
430
13.6k
    MVMuint8        *repl_bytes = NULL;
431
13.6k
    MVMuint64        repl_length;
432
13.6k
433
13.6k
    if (start < 0 || start > strgraphs)
434
0
        MVM_exception_throw_adhoc(tc, "start out of range");
435
13.6k
    if (length == -1)
436
13.5k
        length = strgraphs;
437
13.6k
    if (length < 0 || start + length > strgraphs)
438
0
        MVM_exception_throw_adhoc(tc, "length out of range");
439
13.6k
440
13.6k
    if (replacement)
441
0
        repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc,
442
0
            replacement, &repl_length, 0, -1, NULL, translate_newlines);
443
13.6k
444
13.6k
    /* Guesstimate that we'll be within 2 bytes for most chars most of the
445
13.6k
     * time, and give ourselves 4 bytes breathing space. */
446
13.6k
    result_limit = 2 * length;
447
13.6k
    result       = MVM_malloc(result_limit + 4);
448
13.6k
    result_pos   = 0;
449
13.6k
450
13.6k
    /* Iterate the codepoints and encode them. */
451
13.6k
    MVM_string_ci_init(tc, &ci, str, translate_newlines);
452
1.12M
    while (MVM_string_ci_has_more(tc, &ci)) {
453
1.10M
        MVMint32 bytes;
454
1.10M
        MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);
455
1.10M
        if (result_pos >= result_limit) {
456
4
            result_limit *= 2;
457
4
            result = MVM_realloc(result, result_limit + 4);
458
4
        }
459
1.10M
        bytes = utf8_encode(result + result_pos, cp);
460
1.10M
        if (bytes)
461
1.10M
            result_pos += bytes;
462
0
        else if (replacement) {
463
0
            if (repl_length >= result_limit || result_pos >= result_limit - repl_length) {
464
0
                result_limit += repl_length;
465
0
                result = MVM_realloc(result, result_limit + 4);
466
0
            }
467
0
            memcpy(result + result_pos, repl_bytes, repl_length);
468
0
            result_pos += repl_length;
469
0
        }
470
0
        else {
471
0
            MVM_free(result);
472
0
            MVM_free(repl_bytes);
473
0
            MVM_string_utf8_throw_encoding_exception(tc, cp);
474
0
        }
475
1.10M
    }
476
13.6k
477
13.6k
    if (output_size)
478
13.6k
        *output_size = (MVMuint64)result_pos;
479
13.6k
    MVM_free(repl_bytes);
480
13.6k
    return (char *)result;
481
13.6k
}
482
483
/* Encodes the specified string to UTF-8. */
484
char * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
485
2.17k
        MVMint32 translate_newlines) {
486
2.17k
    return MVM_string_utf8_encode_substr(tc, str, output_size, 0, -1, NULL,
487
2.17k
        translate_newlines);
488
2.17k
}
489
490
/* Encodes the specified string to a UTF-8 C string. */
491
1.96k
char * MVM_string_utf8_encode_C_string(MVMThreadContext *tc, MVMString *str) {
492
1.96k
    MVMuint64 output_size;
493
1.96k
    char * result;
494
1.96k
    char * utf8_string = MVM_string_utf8_encode(tc, str, &output_size, 0);
495
1.96k
    /* this is almost always called from error-handling code. Don't care if it
496
1.96k
     * contains embedded NULs. XXX TODO: Make sure all uses of this free what it returns */
497
1.96k
    result = MVM_malloc(output_size + 1);
498
1.96k
    memcpy(result, utf8_string, output_size);
499
1.96k
    MVM_free(utf8_string);
500
1.96k
    result[output_size] = (char)0;
501
1.96k
    return result;
502
1.96k
}
503
504
0
/* Throws an exception explaining why cp could not be encoded as UTF-8. The
 * message distinguishes codepoints above 0x10FFFF, surrogates (general
 * category "Cs"), and everything else. */
void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) {
    if(cp > 0x10FFFF) {
        MVM_exception_throw_adhoc(tc,
            "Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)",
            cp, cp, 0x10FFFF, 0x10FFFF);
    }
    else {
        /* Only consult the property database for values inside the Unicode
         * range, and tolerate a lookup that yields no category string. */
        const char *gencat = cp >= 0
            ? MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY)
            : NULL;
        if (gencat && strcmp("Cs", gencat) == 0) {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)",
                cp, cp);
        }
        else {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode codepoint %d (0x%X)",
                cp, cp);
        }
    }
}