/home/travis/build/MoarVM/MoarVM/src/strings/utf8_c8.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* UTF-8 Clean-8 is an encoder/decoder that primarily works as the UTF-8 one. |
4 | | * However, upon encountering a byte sequence that will either not decode as |
5 | | * valid UTF-8, or that would not round-trip due to normalization, it will use |
6 | | * NFG synthetics to keep track of the original bytes involved. This means that |
7 | | * encoding back to UTF-8 Clean-8 will be able to recreate the bytes as they |
8 | | * originally existed. The synthetics contain 4 codepoints: |
9 | | * |
10 | | * * The codepoint 0x10FFFD (which is a private use codepoint) |
11 | | * * The codepoint 'x' |
12 | | * * The upper 4 bits of the non-decodable byte as a hex char (0..9A..F) |
13 | | * * The lower 4 bits of the non-decodable byte as a hex char (0..9A..F) |
14 | | * |
15 | | * Under normal UTF-8 encoding, this means the unrepresentable characters will |
16 | | * come out as something like `?xFF`. |
17 | | * |
18 | | * UTF-8 Clean-8 is used in places where MoarVM receives strings from the |
19 | | * environment, command line arguments, and file system queries. |
20 | | */ |
21 | | |
22 | | /* begin not_gerd section (modified from original) |
23 | | // Copyright 2012 not_gerd |
24 | | // see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122 |
25 | | |
26 | | Permission is granted to use, modify, and / or redistribute at will. |
27 | | |
28 | | This includes removing authorship notices, re-use of code parts in |
29 | | other software (with or without giving credit), and / or creating a |
30 | | commercial product based on it. |
31 | | |
32 | | This permission is not revocable by the author. |
33 | | |
34 | | This software is provided as-is. Use it at your own risk. There is |
35 | | no warranty whatsoever, neither expressed nor implied, and by using |
36 | | this software you accept that the author(s) shall not be held liable |
37 | | for any loss of data, loss of service, or other damages, be they |
38 | | incidental or consequential. Your only option other than accepting |
39 | | this is not to use the software at all. |
40 | | */ |
41 | | |
/* Bit flags returned by classify(). The low five bits say what kind of code
 * point was seen; the next four say how many bytes its UTF-8 form occupies. */
enum {
    /* Kind of code point. */
    CP_CHAR           = 0x001,
    CP_LOW_SURROGATE  = 0x002,
    CP_HIGH_SURROGATE = 0x004,
    CP_NONCHAR        = 0x008,
    CP_OVERFLOW       = 0x010,

    /* Width of the UTF-8 encoding. */
    U8_SINGLE         = 0x020,
    U8_DOUBLE         = 0x040,
    U8_TRIPLE         = 0x080,
    U8_QUAD           = 0x100
};
54 | | |
55 | 90.7k | static unsigned classify(MVMCodepoint cp) { |
56 | 90.7k | if(cp <= 0x7F) |
57 | 90.7k | return CP_CHAR | U8_SINGLE; |
58 | 90.7k | |
59 | 4 | if(cp <= 0x07FF) |
60 | 4 | return CP_CHAR | U8_DOUBLE; |
61 | 4 | |
62 | 0 | if(0xD800 <= cp && cp <= 0xDBFF) |
63 | 0 | return CP_HIGH_SURROGATE | U8_TRIPLE; |
64 | 0 |
|
65 | 0 | if(0xDC00 <= cp && cp <= 0xDFFF) |
66 | 0 | return CP_LOW_SURROGATE | U8_TRIPLE; |
67 | 0 |
|
68 | 0 | if(0xFDD0 <= cp && cp <= 0xFDEF) |
69 | 0 | return CP_NONCHAR | U8_TRIPLE; |
70 | 0 |
|
71 | 0 | if(cp <= 0xFFFD) |
72 | 0 | return CP_CHAR | U8_TRIPLE; |
73 | 0 |
|
74 | 0 | if(cp == 0xFFFE || cp == 0xFFFF) |
75 | 0 | return CP_NONCHAR | U8_TRIPLE; |
76 | 0 |
|
77 | 0 | if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF)) |
78 | 0 | return CP_NONCHAR | U8_QUAD; |
79 | 0 |
|
80 | 0 | if(cp <= 0x10FFFF) |
81 | 0 | return CP_CHAR | U8_QUAD; |
82 | 0 |
|
83 | 0 | if(cp <= 0x1FFFFF) |
84 | 0 | return CP_OVERFLOW | U8_QUAD; |
85 | 0 |
|
86 | 0 | return 0; |
87 | 0 | } |
88 | | |
89 | 90.7k | static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) { |
90 | 90.7k | unsigned cc = classify(cp); |
91 | 90.7k | |
92 | 90.7k | if (!(cc & (CP_CHAR | CP_NONCHAR))) |
93 | 0 | return 0; |
94 | 90.7k | |
95 | 90.7k | if (cc & U8_SINGLE) { |
96 | 90.7k | bp[0] = (MVMuint8)cp; |
97 | 90.7k | return 1; |
98 | 90.7k | } |
99 | 90.7k | |
100 | 4 | if (cc & U8_DOUBLE) { |
101 | 4 | bp[0] = (MVMuint8)(( 6 << 5) | (cp >> 6)); |
102 | 4 | bp[1] = (MVMuint8)(( 2 << 6) | (cp & 0x3F)); |
103 | 4 | return 2; |
104 | 4 | } |
105 | 4 | |
106 | 0 | if (cc & U8_TRIPLE) { |
107 | 0 | bp[0] = (MVMuint8)((14 << 4) | (cp >> 12)); |
108 | 0 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
109 | 0 | bp[2] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
110 | 0 | return 3; |
111 | 0 | } |
112 | 0 |
|
113 | 0 | if (cc & U8_QUAD) { |
114 | 0 | bp[0] = (MVMuint8)((30 << 3) | (cp >> 18)); |
115 | 0 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 12) & 0x3F)); |
116 | 0 | bp[2] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
117 | 0 | bp[3] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
118 | 0 | return 4; |
119 | 0 | } |
120 | 0 |
|
121 | 0 | return 0; |
122 | 0 | } |
123 | | |
124 | | /* end not_gerd section */ |
125 | | |
126 | | #define UTF8_MAXINC (32 * 1024 * 1024) |
127 | | |
128 | 0 | static void ensure_buffer(MVMGrapheme32 **buffer, MVMint32 *bufsize, MVMint32 needed) { |
129 | 0 | while (needed >= *bufsize) |
130 | 0 | *buffer = MVM_realloc(*buffer, sizeof(MVMGrapheme32) * ( |
131 | 0 | *bufsize >= UTF8_MAXINC ? (*bufsize += UTF8_MAXINC) : (*bufsize *= 2) |
132 | 0 | )); |
133 | 0 | } |
134 | | |
135 | | static const MVMuint8 hex_chars[] = { '0', '1', '2', '3', '4', '5', '6', '7', |
136 | | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; |
137 | 10 | static MVMGrapheme32 synthetic_for(MVMThreadContext *tc, MVMuint8 invalid) { |
138 | 10 | if (invalid > 0x7F) { |
139 | 8 | /* A real invalid. */ |
140 | 8 | MVMuint8 high = invalid >> 4; |
141 | 8 | MVMuint8 low = invalid & 0x0F; |
142 | 8 | MVMCodepoint cps[] = { 0x10FFFD, 'x', hex_chars[high], hex_chars[low] }; |
143 | 8 | return MVM_nfg_codes_to_grapheme_utf8_c8(tc, cps, 4); |
144 | 8 | } |
145 | 2 | else { |
146 | 2 | /* Was in things thrown out as invalid by the decoder, but has an |
147 | 2 | * ASCII interpretation, so hand it back as is. */ |
148 | 2 | return invalid; |
149 | 2 | } |
150 | 10 | } |
151 | | |
/* The kind of byte the UTF8-C8 decoder wants to see next. */
typedef enum {
    EXPECT_START,          /* 0: the first byte of a sequence */
    EXPECT_CONTINUATION    /* 1: a 10xxxxxx continuation byte */
} Expecting;
157 | | |
158 | | /* Decode state for the UTF8-C8 decoder. */ |
159 | | typedef struct { |
160 | | /* The UTF-8 we're decoding. */ |
161 | | const MVMuint8 *utf8; |
162 | | |
163 | | /* The index of the current byte we're decoding. */ |
164 | | size_t cur_byte; |
165 | | |
166 | | /* The index of the first unaccepted byte. */ |
167 | | size_t unaccepted_start; |
168 | | |
169 | | /* What kind of byte we're expecting next. */ |
170 | | Expecting expecting; |
171 | | |
172 | | /* The current codepoint we're decoding. */ |
173 | | MVMCodepoint cur_codepoint; |
174 | | |
175 | | /* The result buffer we're decoding into. */ |
176 | | MVMGrapheme32 *result; |
177 | | |
178 | | /* The current position in the result buffer. */ |
179 | | size_t result_pos; |
180 | | |
181 | | /* Buffer of original codepoints, to ensure we will not spit out any |
182 | | * synthetics into the result that will re-order on round-trip. */ |
183 | | MVMCodepoint *orig_codes; |
184 | | |
185 | | /* Position we're at in inserting into orig_codes. */ |
186 | | size_t orig_codes_pos; |
187 | | |
188 | | /* First orig_codes index that did not yet go through the normalizer. */ |
189 | | size_t orig_codes_unnormalized; |
190 | | |
191 | | /* The normalizer we're using to make synthetics that will not cause an |
192 | | * order change on output. */ |
193 | | MVMNormalizer norm; |
194 | | |
195 | | /* Bad bytes from an earlier buffer, for the sake of streaming decode. */ |
196 | | MVMuint8 prev_bad_bytes[4]; |
197 | | MVMint32 num_prev_bad_bytes; |
198 | | } DecodeState; |
199 | | |
200 | | /* Appends a single grapheme to the buffer if it will not cause a mismatch |
201 | | * with the original codepoints upon encoding back to UTF-8. Returns non-zero |
202 | | * in this case. Otherwise, appends synthetics for the bytes the original code |
203 | | * points were encoded as. Since we can end up with index mis-matches, we just |
204 | | * spit out codepoints to catch the normalizer up to everything in the orig |
205 | | * codes buffer. */ |
206 | 766k | static int append_grapheme(MVMThreadContext *tc, DecodeState *state, MVMGrapheme32 g) { |
207 | 766k | if (g == state->orig_codes[state->orig_codes_unnormalized]) { |
208 | 766k | /* Easy case: exact match. */ |
209 | 766k | state->result[state->result_pos++] = g; |
210 | 766k | state->orig_codes_unnormalized++; |
211 | 766k | return 1; |
212 | 766k | } |
213 | 2 | else if (g < 0) { |
214 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
215 | 0 | int mismatch = 0; |
216 | 0 | if (synth->codes[0] == state->orig_codes[state->orig_codes_unnormalized]) { |
217 | 0 | MVMint32 i; |
218 | 0 | for (i = 1; i < synth->num_codes; i++) { |
219 | 0 | size_t orig_idx = state->orig_codes_unnormalized + i; |
220 | 0 | if (state->orig_codes_pos <= orig_idx || |
221 | 0 | state->orig_codes[orig_idx] != synth->codes[i]) { |
222 | 0 | mismatch = 1; |
223 | 0 | break; |
224 | 0 | } |
225 | 0 | } |
226 | 0 | } |
227 | 0 | else { |
228 | 0 | mismatch = 1; |
229 | 0 | } |
230 | 0 | if (!mismatch) { |
231 | 0 | state->result[state->result_pos++] = g; |
232 | 0 | state->orig_codes_unnormalized += synth->num_codes; |
233 | 0 | return 1; |
234 | 0 | } |
235 | 0 | } |
236 | 766k | |
237 | 766k | /* If we get here, then normalization would trash the original bytes. */ |
238 | 2 | { |
239 | 2 | /* Spit out synthetics to keep the bytes as is. */ |
240 | 2 | size_t i, j; |
241 | 6 | for (i = state->orig_codes_unnormalized; i < state->orig_codes_pos; i++) { |
242 | 4 | MVMCodepoint to_encode = state->orig_codes[i]; |
243 | 4 | MVMuint8 encoded[4]; |
244 | 4 | MVMint32 bytes = utf8_encode(encoded, to_encode); |
245 | 10 | for (j = 0; j < bytes; j++) |
246 | 6 | state->result[state->result_pos++] = synthetic_for(tc, encoded[j]); |
247 | 4 | } |
248 | 2 | |
249 | 2 | /* Consider all codes pushed now normalized. */ |
250 | 2 | state->orig_codes_unnormalized = state->orig_codes_pos; |
251 | 2 | |
252 | 2 | /* Put a clean normalizer in place. */ |
253 | 2 | MVM_unicode_normalizer_cleanup(tc, &(state->norm)); |
254 | 2 | MVM_unicode_normalizer_init(tc, &(state->norm), MVM_NORMALIZE_NFG); |
255 | 2 | return 0; |
256 | 766k | } |
257 | 766k | } |
258 | | |
259 | | /* Called when decoding has reached an acceptable codepoint. */ |
260 | 766k | static void process_ok_codepoint(MVMThreadContext *tc, DecodeState *state) { |
261 | 766k | MVMint32 ready; |
262 | 766k | MVMGrapheme32 g; |
263 | 766k | |
264 | 766k | /* Consider the byte range accepted. */ |
265 | 766k | state->unaccepted_start = state->cur_byte + 1; |
266 | 766k | |
267 | 766k | /* Insert into original codepoints list and hand it to the normalizer. */ |
268 | 766k | state->orig_codes[state->orig_codes_pos++] = state->cur_codepoint; |
269 | 766k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, |
270 | 766k | &(state->norm), state->cur_codepoint, &g); |
271 | 766k | |
272 | 766k | /* If the normalizer produced some output... */ |
273 | 766k | if (ready) { |
274 | 747k | if (append_grapheme(tc, state, g)) { |
275 | 748k | while (--ready > 0) { |
276 | 288 | g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm)); |
277 | 288 | if (!append_grapheme(tc, state, g)) |
278 | 0 | break; |
279 | 288 | } |
280 | 747k | } |
281 | 747k | } |
282 | 766k | |
283 | 766k | /* We've no longer any bad bytes to care about from earlier buffers; |
284 | 766k | * they ended up making an acceptable codepoint. */ |
285 | 766k | state->num_prev_bad_bytes = 0; |
286 | 766k | } |
287 | | |
288 | | /* Called when a bad byte has been encountered, or at the end of output. */ |
289 | 18.5k | static void process_bad_bytes(MVMThreadContext *tc, DecodeState *state) { |
290 | 18.5k | size_t i; |
291 | 18.5k | MVMint32 ready; |
292 | 18.5k | |
293 | 18.5k | /* Flush normalization buffer and take from that. */ |
294 | 18.5k | MVM_unicode_normalizer_eof(tc, &(state->norm)); |
295 | 18.5k | ready = MVM_unicode_normalizer_available(tc, &(state->norm)); |
296 | 37.1k | while (ready-- > 0) { |
297 | 18.5k | MVMGrapheme32 g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm)); |
298 | 18.5k | if (!append_grapheme(tc, state, g)) |
299 | 2 | break; |
300 | 18.5k | } |
301 | 18.5k | |
302 | 18.5k | /* Now add in synthetics for bad bytes. */ |
303 | 18.5k | for (i = 0; i < state->num_prev_bad_bytes; i++) |
304 | 0 | state->result[state->result_pos++] = synthetic_for(tc, state->prev_bad_bytes[i]); |
305 | 18.5k | state->num_prev_bad_bytes = 0; |
306 | 18.6k | for (i = state->unaccepted_start; i <= state->cur_byte; i++) |
307 | 4 | state->result[state->result_pos++] = synthetic_for(tc, state->utf8[i]); |
308 | 18.5k | state->unaccepted_start = state->cur_byte + 1; |
309 | 18.5k | } |
310 | | /* Check for if the codepoint is in range. Make sure it's not over 0x10FFFF |
311 | | * and make sure it isn't a Surrogate */ |
312 | 5 | MVM_STATIC_INLINE int in_range (MVMCodepoint cp) { |
313 | 5 | return ( 0 <= cp && cp <= 0x10FFFF) |
314 | 5 | && (cp < 0xD800 || 0xDFFF < cp); /* Surrogates */ |
315 | 5 | } |
316 | | /* Decodes the specified number of bytes of utf8 into an NFG string, creating |
317 | | * a result of the specified type. The type must have the MVMString REPR. */ |
318 | | MVMString * MVM_string_utf8_c8_decode(MVMThreadContext *tc, const MVMObject *result_type, |
319 | 18.5k | const char *utf8, size_t bytes) { |
320 | 18.5k | DecodeState state; |
321 | 18.5k | |
322 | 18.5k | /* Local state for decode loop. */ |
323 | 18.5k | int expected_continuations = 0; |
324 | 18.5k | int min_expected_codepoint; |
325 | 18.5k | |
326 | 18.5k | /* Don't do anything if empty. */ |
327 | 18.5k | if (bytes == 0) |
328 | 0 | return tc->instance->str_consts.empty; |
329 | 18.5k | |
330 | 18.5k | /* Decoding state, in a struct to easily pass to utility routines. |
331 | 18.5k | * Result buffer is a maximum estimate to avoid realloc; we can shrink |
332 | 18.5k | * it at the end. */ |
333 | 18.5k | state.utf8 = (MVMuint8 *)utf8; |
334 | 18.5k | state.cur_byte = 0; |
335 | 18.5k | state.unaccepted_start = 0; |
336 | 18.5k | state.expecting = EXPECT_START; |
337 | 18.5k | state.cur_codepoint = 0; |
338 | 18.5k | state.result = MVM_malloc(sizeof(MVMGrapheme32) * bytes); |
339 | 18.5k | state.result_pos = 0; |
340 | 18.5k | state.orig_codes = MVM_malloc(sizeof(MVMCodepoint) * bytes); |
341 | 18.5k | state.orig_codes_pos = 0; |
342 | 18.5k | state.orig_codes_unnormalized = 0; |
343 | 18.5k | state.num_prev_bad_bytes = 0; |
344 | 18.5k | MVM_unicode_normalizer_init(tc, &(state.norm), MVM_NORMALIZE_NFG); |
345 | 18.5k | |
346 | 785k | while (state.cur_byte < bytes) { |
347 | 766k | MVMuint8 decode_byte = utf8[state.cur_byte]; |
348 | 766k | switch (state.expecting) { |
349 | 766k | case EXPECT_START: |
350 | 766k | if ((decode_byte & 0x80) == 0) { |
351 | 766k | /* Single byte sequence. */ |
352 | 766k | state.cur_codepoint = decode_byte; |
353 | 766k | process_ok_codepoint(tc, &state); |
354 | 766k | } |
355 | 9 | else if ((decode_byte & 0xE0) == 0xC0) { |
356 | 7 | state.cur_codepoint = decode_byte & 0x1F; |
357 | 7 | state.expecting = EXPECT_CONTINUATION; |
358 | 7 | expected_continuations = 1; |
359 | 7 | min_expected_codepoint = 0x80; |
360 | 7 | } |
361 | 2 | else if ((decode_byte & 0xF0) == 0xE0) { |
362 | 0 | state.cur_codepoint = decode_byte & 0x0F; |
363 | 0 | state.expecting = EXPECT_CONTINUATION; |
364 | 0 | expected_continuations = 2; |
365 | 0 | min_expected_codepoint = 0x800; |
366 | 0 | } |
367 | 2 | else if ((decode_byte & 0xF8) == 0xF0) { |
368 | 0 | state.cur_codepoint = decode_byte & 0x07; |
369 | 0 | state.expecting = EXPECT_CONTINUATION; |
370 | 0 | expected_continuations = 3; |
371 | 0 | min_expected_codepoint = 0x10000; |
372 | 0 | } |
373 | 2 | else { |
374 | 2 | /* Invalid byte sequence. */ |
375 | 2 | process_bad_bytes(tc, &state); |
376 | 2 | } |
377 | 766k | break; |
378 | 5 | case EXPECT_CONTINUATION: |
379 | 5 | if ((decode_byte & 0xC0) == 0x80) { |
380 | 5 | state.cur_codepoint = (state.cur_codepoint << 6) |
381 | 5 | | (decode_byte & 0x3F); |
382 | 5 | expected_continuations--; |
383 | 5 | if (expected_continuations == 0) { |
384 | 5 | if (min_expected_codepoint <= state.cur_codepoint && in_range(state.cur_codepoint)) |
385 | 5 | process_ok_codepoint(tc, &state); |
386 | 5 | else |
387 | 0 | process_bad_bytes(tc, &state); |
388 | 5 | state.expecting = EXPECT_START; |
389 | 5 | } |
390 | 5 | } |
391 | 0 | else { |
392 | 0 | /* Invalid byte sequence. */ |
393 | 0 | process_bad_bytes(tc, &state); |
394 | 0 | state.expecting = EXPECT_START; |
395 | 0 | } |
396 | 5 | break; |
397 | 766k | } |
398 | 766k | state.cur_byte++; |
399 | 766k | } |
400 | 18.5k | |
401 | 18.5k | /* Handle anything dangling off the end. */ |
402 | 18.5k | state.cur_byte--; /* So we don't read 1 past the end. */ |
403 | 18.5k | process_bad_bytes(tc, &state); |
404 | 18.5k | |
405 | 18.5k | MVM_free(state.orig_codes); |
406 | 18.5k | MVM_unicode_normalizer_cleanup(tc, &(state.norm)); |
407 | 18.5k | |
408 | 18.5k | { |
409 | 18.5k | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
410 | 18.5k | result->body.storage.blob_32 = state.result; |
411 | 18.5k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
412 | 18.5k | result->body.num_graphs = state.result_pos; |
413 | 18.5k | return result; |
414 | 18.5k | } |
415 | 18.5k | } |
416 | | |
417 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
418 | | * buffers, or until a stopper is reached. */ |
419 | | MVMuint32 MVM_string_utf8_c8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
420 | | const MVMint32 *stopper_chars, |
421 | | MVMDecodeStreamSeparators *seps, |
422 | 0 | MVMint32 eof) { |
423 | 0 | /* Local state for decode loop. */ |
424 | 0 | MVMDecodeStreamBytes *cur_bytes; |
425 | 0 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
426 | 0 | MVMint32 last_accept_pos = ds->bytes_head_pos; |
427 | 0 | DecodeState state; |
428 | 0 | int expected_continuations = 0; |
429 | 0 | int min_expected_codepoint; |
430 | 0 | MVMuint32 reached_stopper = 0; |
431 | 0 | MVMint32 result_graphs = 0; |
432 | 0 |
|
433 | 0 | /* If there's no buffers, we're done. */ |
434 | 0 | if (!ds->bytes_head) |
435 | 0 | return 0; |
436 | 0 | last_accept_pos = ds->bytes_head_pos; |
437 | 0 |
|
438 | 0 | /* If we're asked for zero chars, also done. */ |
439 | 0 | if (stopper_chars && *stopper_chars == 0) |
440 | 0 | return 1; |
441 | 0 |
|
442 | 0 | /* Otherwise set up decode state, stealing normalizer of the decode |
443 | 0 | * stream and re-instating any past orig_codes. */ |
444 | 0 | state.expecting = EXPECT_START; |
445 | 0 | state.cur_codepoint = 0; |
446 | 0 | state.num_prev_bad_bytes = 0; |
447 | 0 | memcpy(&(state.norm), &(ds->norm), sizeof(MVMNormalizer)); |
448 | 0 | if (ds->decoder_state) { |
449 | 0 | MVMCodepoint *saved = (MVMCodepoint *)ds->decoder_state; |
450 | 0 | state.orig_codes = MVM_malloc( |
451 | 0 | sizeof(MVMCodepoint) * (saved[0] + ds->bytes_head->length) |
452 | 0 | ); |
453 | 0 | state.orig_codes_pos = saved[0]; |
454 | 0 | state.orig_codes_unnormalized = 0; |
455 | 0 | memcpy(state.orig_codes, saved + 1, saved[0] * sizeof(MVMCodepoint)); |
456 | 0 | MVM_free(ds->decoder_state); |
457 | 0 | ds->decoder_state = NULL; |
458 | 0 | } |
459 | 0 | else { |
460 | 0 | state.orig_codes = NULL; |
461 | 0 | state.orig_codes_pos = 0; |
462 | 0 | state.orig_codes_unnormalized = 0; |
463 | 0 | } |
464 | 0 |
|
465 | 0 | /* Decode each of the buffers. */ |
466 | 0 | cur_bytes = ds->bytes_head; |
467 | 0 | reached_stopper = 0; |
468 | 0 | while (cur_bytes && !reached_stopper) { |
469 | 0 | /* Set up decode state for this buffer. */ |
470 | 0 | MVMint32 bytes = ds->bytes_head->length; |
471 | 0 | state.result = MVM_malloc(bytes * sizeof(MVMGrapheme32)); |
472 | 0 | state.orig_codes = MVM_realloc(state.orig_codes, |
473 | 0 | sizeof(MVMCodepoint) * (state.orig_codes_pos + bytes)); |
474 | 0 | state.result_pos = 0; |
475 | 0 | state.utf8 = (const MVMuint8*)cur_bytes->bytes; |
476 | 0 | state.cur_byte = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
477 | 0 | state.unaccepted_start = state.cur_byte; |
478 | 0 |
|
479 | 0 | /* Process this buffer. */ |
480 | 0 | while (state.cur_byte < bytes) { |
481 | 0 | /* Process a byte. */ |
482 | 0 | MVMuint8 decode_byte = state.utf8[state.cur_byte]; |
483 | 0 | MVMint32 maybe_new_graph = 0; |
484 | 0 | switch (state.expecting) { |
485 | 0 | case EXPECT_START: |
486 | 0 | if ((decode_byte & 0x80) == 0) { |
487 | 0 | /* Single byte sequence. */ |
488 | 0 | state.cur_codepoint = decode_byte; |
489 | 0 | process_ok_codepoint(tc, &state); |
490 | 0 | maybe_new_graph = 1; |
491 | 0 | } |
492 | 0 | else if ((decode_byte & 0xE0) == 0xC0) { |
493 | 0 | state.cur_codepoint = decode_byte & 0x1F; |
494 | 0 | state.expecting = EXPECT_CONTINUATION; |
495 | 0 | expected_continuations = 1; |
496 | 0 | min_expected_codepoint = 0x80; |
497 | 0 | } |
498 | 0 | else if ((decode_byte & 0xF0) == 0xE0) { |
499 | 0 | state.cur_codepoint = decode_byte & 0x0F; |
500 | 0 | state.expecting = EXPECT_CONTINUATION; |
501 | 0 | expected_continuations = 2; |
502 | 0 | min_expected_codepoint = 0x800; |
503 | 0 | } |
504 | 0 | else if ((decode_byte & 0xF8) == 0xF0) { |
505 | 0 | state.cur_codepoint = decode_byte & 0x07; |
506 | 0 | state.expecting = EXPECT_CONTINUATION; |
507 | 0 | expected_continuations = 3; |
508 | 0 | min_expected_codepoint = 0x10000; |
509 | 0 | } |
510 | 0 | else { |
511 | 0 | /* Invalid byte sequence. */ |
512 | 0 | process_bad_bytes(tc, &state); |
513 | 0 | maybe_new_graph = 1; |
514 | 0 | } |
515 | 0 | break; |
516 | 0 | case EXPECT_CONTINUATION: |
517 | 0 | if ((decode_byte & 0xC0) == 0x80) { |
518 | 0 | state.cur_codepoint = (state.cur_codepoint << 6) |
519 | 0 | | (decode_byte & 0x3F); |
520 | 0 | expected_continuations--; |
521 | 0 | if (expected_continuations == 0) { |
522 | 0 | if (state.cur_codepoint >= min_expected_codepoint) |
523 | 0 | process_ok_codepoint(tc, &state); |
524 | 0 | else |
525 | 0 | process_bad_bytes(tc, &state); |
526 | 0 | maybe_new_graph = 1; |
527 | 0 | state.expecting = EXPECT_START; |
528 | 0 | } |
529 | 0 | } |
530 | 0 | else { |
531 | 0 | /* Invalid byte sequence. */ |
532 | 0 | process_bad_bytes(tc, &state); |
533 | 0 | maybe_new_graph = 1; |
534 | 0 | state.expecting = EXPECT_START; |
535 | 0 | } |
536 | 0 | break; |
537 | 0 | } |
538 | 0 | state.cur_byte++; |
539 | 0 |
|
540 | 0 | /* See if we've reached a stopper. */ |
541 | 0 | if (maybe_new_graph && state.result_pos > 0) { |
542 | 0 | if (stopper_chars) { |
543 | 0 | if (result_graphs + state.result_pos >= *stopper_chars) { |
544 | 0 | reached_stopper = 1; |
545 | 0 | break; |
546 | 0 | } |
547 | 0 | } |
548 | 0 | if (MVM_string_decode_stream_maybe_sep(tc, seps, |
549 | 0 | state.result[state.result_pos - 1])) { |
550 | 0 | reached_stopper = 1; |
551 | 0 | break; |
552 | 0 | } |
553 | 0 | } |
554 | 0 | } |
555 | 0 |
|
556 | 0 | /* If we're at EOF and this is the last buffer, force out last bytes. */ |
557 | 0 | if (eof && !reached_stopper && !cur_bytes->next) { |
558 | 0 | state.cur_byte--; /* So we don't read 1 past the end. */ |
559 | 0 | process_bad_bytes(tc, &state); |
560 | 0 | } |
561 | 0 |
|
562 | 0 | /* Attach what we successfully parsed as a result buffer, and trim away |
563 | 0 | * what we chewed through. */ |
564 | 0 | if (state.result_pos) |
565 | 0 | MVM_string_decodestream_add_chars(tc, ds, state.result, state.result_pos); |
566 | 0 | else |
567 | 0 | MVM_free(state.result); |
568 | 0 | result_graphs += state.result_pos; |
569 | 0 |
|
570 | 0 | /* Update our accepted position. */ |
571 | 0 | if (state.unaccepted_start > 0) { |
572 | 0 | last_accept_bytes = cur_bytes; |
573 | 0 | last_accept_pos = state.unaccepted_start; |
574 | 0 | } |
575 | 0 |
|
576 | 0 | /* If there were bytes we didn't accept, hold on to them in case we |
577 | 0 | * need to emit them as bad bytes. */ |
578 | 0 | if (state.unaccepted_start != state.cur_byte && cur_bytes->next) { |
579 | 0 | int i; |
580 | 0 | for (i = state.unaccepted_start; i < state.cur_byte; i++) |
581 | 0 | state.prev_bad_bytes[state.num_prev_bad_bytes++] = state.utf8[i]; |
582 | 0 | } |
583 | 0 |
|
584 | 0 | cur_bytes = cur_bytes->next; |
585 | 0 | } |
586 | 0 |
|
587 | 0 | /* Eat the bytes we decoded. */ |
588 | 0 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
589 | 0 |
|
590 | 0 | /* Persist current normalizer. */ |
591 | 0 | memcpy(&(ds->norm), &(state.norm), sizeof(MVMNormalizer)); |
592 | 0 |
|
593 | 0 | /* Stash away any leftover codepoints we'll need to examine. */ |
594 | 0 | if (state.orig_codes_pos && state.orig_codes_pos != state.orig_codes_unnormalized) { |
595 | 0 | size_t diff = state.orig_codes_pos - state.orig_codes_unnormalized; |
596 | 0 | MVMCodepoint *saved = MVM_malloc(sizeof(MVMCodepoint) * (1 + diff)); |
597 | 0 | saved[0] = diff; |
598 | 0 | memcpy(saved + 1, state.orig_codes + state.orig_codes_unnormalized, |
599 | 0 | diff * sizeof(MVMCodepoint)); |
600 | 0 | ds->decoder_state = saved; |
601 | 0 | } |
602 | 0 | MVM_free(state.orig_codes); |
603 | 0 |
|
604 | 0 | return reached_stopper; |
605 | 0 | } |
606 | | |
607 | | /* Encodes the specified string to UTF-8. */ |
608 | | static void emit_cp(MVMThreadContext *tc, MVMCodepoint cp, MVMuint8 **result, |
609 | | size_t *result_pos, size_t *result_limit, |
610 | 90.7k | MVMuint8 *repl_bytes, MVMuint64 repl_length) { |
611 | 90.7k | MVMint32 bytes; |
612 | 90.7k | if (*result_pos >= *result_limit) { |
613 | 0 | *result_limit *= 2; |
614 | 0 | *result = MVM_realloc(*result, *result_limit + 4); |
615 | 0 | } |
616 | 90.7k | bytes = utf8_encode(*result + *result_pos, cp); |
617 | 90.7k | if (bytes) |
618 | 90.7k | *result_pos += bytes; |
619 | 0 | else if (repl_bytes) { |
620 | 0 | if (repl_length >= *result_limit || *result_pos >= *result_limit - repl_length) { |
621 | 0 | *result_limit += repl_length; |
622 | 0 | *result = MVM_realloc(*result, *result_limit + 4); |
623 | 0 | } |
624 | 0 | memcpy(*result + *result_pos, repl_bytes, repl_length); |
625 | 0 | *result_pos += repl_length; |
626 | 0 | } |
627 | 0 | else { |
628 | 0 | MVM_free(*result); |
629 | 0 | MVM_free(repl_bytes); |
630 | 0 | MVM_string_utf8_throw_encoding_exception(tc, cp); |
631 | 0 | } |
632 | 90.7k | } |
633 | 16 | static int hex2int(MVMThreadContext *tc, MVMCodepoint cp) { |
634 | 16 | if (cp >= '0' && cp <= '9') |
635 | 2 | return cp - '0'; |
636 | 14 | else if (cp >= 'A' && cp <= 'F') |
637 | 14 | return 10 + (cp - 'A'); |
638 | 14 | else |
639 | 0 | MVM_exception_throw_adhoc(tc, "UTF-8 C-8 encoding encountered corrupt synthetic"); |
640 | 16 | } |
641 | | char * MVM_string_utf8_c8_encode_substr(MVMThreadContext *tc, |
642 | 4.27k | MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement) { |
643 | 4.27k | MVMuint8 *result; |
644 | 4.27k | size_t result_pos, result_limit; |
645 | 4.27k | MVMGraphemeIter gi; |
646 | 4.27k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
647 | 4.27k | MVMuint8 *repl_bytes = NULL; |
648 | 4.27k | MVMuint64 repl_length; |
649 | 4.27k | |
650 | 4.27k | if (start < 0 || start > strgraphs) |
651 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
652 | 4.27k | if (length == -1) |
653 | 4 | length = strgraphs; |
654 | 4.27k | if (length < 0 || start + length > strgraphs) |
655 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
656 | 4.27k | |
657 | 4.27k | if (replacement) |
658 | 4 | repl_bytes = (MVMuint8 *) MVM_string_utf8_c8_encode_substr(tc, replacement, &repl_length, 0, -1, NULL); |
659 | 4.27k | |
660 | 4.27k | /* Guesstimate that we'll be within 2 bytes for most chars most of the |
661 | 4.27k | * time, and give ourselves 4 bytes breathing space. */ |
662 | 4.27k | result_limit = 2 * length; |
663 | 4.27k | result = MVM_malloc(result_limit + 4); |
664 | 4.27k | result_pos = 0; |
665 | 4.27k | |
666 | 4.27k | /* We iterate graphemes, looking out for any synthetics. If we find a |
667 | 4.27k | * UTF-8 C-8 synthetic, then we spit out the raw byte. If we find any |
668 | 4.27k | * other synthetic, we iterate its codepoints. */ |
669 | 4.27k | MVM_string_gi_init(tc, &gi, str); |
670 | 95.0k | while (MVM_string_gi_has_more(tc, &gi)) { |
671 | 90.7k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
672 | 90.7k | if (g >= 0) { |
673 | 90.7k | emit_cp(tc, g, &result, &result_pos, &result_limit, repl_bytes, repl_length); |
674 | 90.7k | } |
675 | 8 | else { |
676 | 8 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
677 | 8 | if (synth->is_utf8_c8) { |
678 | 8 | /* UTF-8 C-8 synthetic; emit the byte. */ |
679 | 8 | if (result_pos >= result_limit) { |
680 | 0 | result_limit *= 2; |
681 | 0 | result = MVM_realloc(result, result_limit + 1); |
682 | 0 | } |
683 | 8 | result[result_pos++] = (hex2int(tc, synth->codes[2]) << 4) + |
684 | 8 | hex2int(tc, synth->codes[3]); |
685 | 8 | } |
686 | 0 | else { |
687 | 0 | MVMint32 i; |
688 | 0 | for (i = 0; i < synth->num_codes; i++) |
689 | 0 | emit_cp(tc, synth->codes[i], &result, &result_pos, &result_limit, repl_bytes, repl_length); |
690 | 0 | } |
691 | 8 | } |
692 | 90.7k | } |
693 | 4.27k | |
694 | 4.27k | if (output_size) |
695 | 4.27k | *output_size = (MVMuint64)result_pos; |
696 | 4.27k | MVM_free(repl_bytes); |
697 | 4.27k | return (char *)result; |
698 | 4.27k | } |
699 | | |
700 | | /* Encodes the specified string to UTF-8 C-8. */ |
701 | 4.26k | char * MVM_string_utf8_c8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size) { |
702 | 4.26k | return MVM_string_utf8_c8_encode_substr(tc, str, output_size, 0, |
703 | 4.26k | MVM_string_graphs(tc, str), NULL); |
704 | 4.26k | } |
705 | | |
706 | | /* Encodes the specified string to a UTF-8 C-8 C string. */ |
707 | 4.26k | char * MVM_string_utf8_c8_encode_C_string(MVMThreadContext *tc, MVMString *str) { |
708 | 4.26k | MVMuint64 output_size; |
709 | 4.26k | char *result; |
710 | 4.26k | char *utf8_string = MVM_string_utf8_c8_encode(tc, str, &output_size); |
711 | 4.26k | result = MVM_malloc(output_size + 1); |
712 | 4.26k | memcpy(result, utf8_string, output_size); |
713 | 4.26k | MVM_free(utf8_string); |
714 | 4.26k | result[output_size] = (char)0; |
715 | 4.26k | return result; |
716 | 4.26k | } |