/home/travis/build/MoarVM/MoarVM/src/strings/utf8_c8.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* UTF-8 Clean-8 is an encoder/decoder that primarily works as the UTF-8 one. |
4 | | * However, upon encountering a byte sequence that will either not decode as |
5 | | * valid UTF-8, or that would not round-trip due to normalization, it will use |
6 | | * NFG synthetics to keep track of the original bytes involved. This means that |
7 | | * encoding back to UTF-8 Clean-8 will be able to recreate the bytes as they |
8 | | * originally existed. The synthetics contain 4 codepoints: |
9 | | * |
10 | | * * The codepoint 0x10FFFD (which is a private use codepoint) |
11 | | * * The codepoint 'x' |
12 | | * * The upper 4 bits of the non-decodable byte as a hex char (0..9A..F) |
13 | | * * The lower 4 bits of the non-decodable byte as a hex char (0..9A..F) |
14 | | * |
15 | | * Under normal UTF-8 encoding, this means the unrepresentable characters will |
16 | | * come out as something like `?xFF`. |
17 | | * |
18 | | * UTF-8 Clean-8 is used in places where MoarVM receives strings from the |
19 | | * environment, command line arguments, and file system queries. |
20 | | */ |
21 | | |
22 | | /* begin not_gerd section (modified from original) |
23 | | // Copyright 2012 not_gerd |
24 | | // see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122 |
25 | | |
26 | | Permission is granted to use, modify, and / or redistribute at will. |
27 | | |
28 | | This includes removing authorship notices, re-use of code parts in |
29 | | other software (with or without giving credit), and / or creating a |
30 | | commercial product based on it. |
31 | | |
32 | | This permission is not revocable by the author. |
33 | | |
34 | | This software is provided as-is. Use it at your own risk. There is |
35 | | no warranty whatsoever, neither expressed nor implied, and by using |
36 | | this software you accept that the author(s) shall not be held liable |
37 | | for any loss of data, loss of service, or other damages, be they |
38 | | incidental or consequential. Your only option other than accepting |
39 | | this is not to use the software at all. |
40 | | */ |
41 | | |
/* Bit flags produced by classify(). The CP_* bits say what sort of value a
 * codepoint is; the U8_* bits say how many bytes its UTF-8 form takes. */
enum {
    CP_CHAR           = 0x001,
    CP_LOW_SURROGATE  = 0x002,
    CP_HIGH_SURROGATE = 0x004,
    CP_NONCHAR        = 0x008,
    CP_OVERFLOW       = 0x010,

    U8_SINGLE         = 0x020,
    U8_DOUBLE         = 0x040,
    U8_TRIPLE         = 0x080,
    U8_QUAD           = 0x100
};
54 | | |
55 | 216k | static unsigned classify(MVMCodepoint cp) { |
56 | 216k | if(cp <= 0x7F) |
57 | 216k | return CP_CHAR | U8_SINGLE; |
58 | 216k | |
59 | 0 | if(cp <= 0x07FF) |
60 | 0 | return CP_CHAR | U8_DOUBLE; |
61 | 0 |
|
62 | 0 | if(0xD800 <= cp && cp <= 0xDBFF) |
63 | 0 | return CP_HIGH_SURROGATE | U8_TRIPLE; |
64 | 0 |
|
65 | 0 | if(0xDC00 <= cp && cp <= 0xDFFF) |
66 | 0 | return CP_LOW_SURROGATE | U8_TRIPLE; |
67 | 0 |
|
68 | 0 | if(0xFDD0 <= cp && cp <= 0xFDEF) |
69 | 0 | return CP_NONCHAR | U8_TRIPLE; |
70 | 0 |
|
71 | 0 | if(cp <= 0xFFFD) |
72 | 0 | return CP_CHAR | U8_TRIPLE; |
73 | 0 |
|
74 | 0 | if(cp == 0xFFFE || cp == 0xFFFF) |
75 | 0 | return CP_NONCHAR | U8_TRIPLE; |
76 | 0 |
|
77 | 0 | if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF)) |
78 | 0 | return CP_NONCHAR | U8_QUAD; |
79 | 0 |
|
80 | 0 | if(cp <= 0x10FFFF) |
81 | 0 | return CP_CHAR | U8_QUAD; |
82 | 0 |
|
83 | 0 | if(cp <= 0x1FFFFF) |
84 | 0 | return CP_OVERFLOW | U8_QUAD; |
85 | 0 |
|
86 | 0 | return 0; |
87 | 0 | } |
88 | | |
89 | 216k | static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) { |
90 | 216k | unsigned cc = classify(cp); |
91 | 216k | |
92 | 216k | if (!(cc & (CP_CHAR | CP_NONCHAR))) |
93 | 0 | return 0; |
94 | 216k | |
95 | 216k | if (cc & U8_SINGLE) { |
96 | 216k | bp[0] = (MVMuint8)cp; |
97 | 216k | return 1; |
98 | 216k | } |
99 | 216k | |
100 | 0 | if (cc & U8_DOUBLE) { |
101 | 0 | bp[0] = (MVMuint8)(( 6 << 5) | (cp >> 6)); |
102 | 0 | bp[1] = (MVMuint8)(( 2 << 6) | (cp & 0x3F)); |
103 | 0 | return 2; |
104 | 0 | } |
105 | 0 |
|
106 | 0 | if (cc & U8_TRIPLE) { |
107 | 0 | bp[0] = (MVMuint8)((14 << 4) | (cp >> 12)); |
108 | 0 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
109 | 0 | bp[2] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
110 | 0 | return 3; |
111 | 0 | } |
112 | 0 |
|
113 | 0 | if (cc & U8_QUAD) { |
114 | 0 | bp[0] = (MVMuint8)((30 << 3) | (cp >> 18)); |
115 | 0 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 12) & 0x3F)); |
116 | 0 | bp[2] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
117 | 0 | bp[3] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
118 | 0 | return 4; |
119 | 0 | } |
120 | 0 |
|
121 | 0 | return 0; |
122 | 0 | } |
123 | | |
124 | | /* end not_gerd section */ |
125 | | |
126 | | #define UTF8_MAXINC (32 * 1024 * 1024) |
127 | | |
128 | 0 | static void ensure_buffer(MVMGrapheme32 **buffer, MVMint32 *bufsize, MVMint32 needed) { |
129 | 0 | while (needed >= *bufsize) |
130 | 0 | *buffer = MVM_realloc(*buffer, sizeof(MVMGrapheme32) * ( |
131 | 0 | *bufsize >= UTF8_MAXINC ? (*bufsize += UTF8_MAXINC) : (*bufsize *= 2) |
132 | 0 | )); |
133 | 0 | } |
134 | | |
135 | | static const MVMuint8 hex_chars[] = { '0', '1', '2', '3', '4', '5', '6', '7', |
136 | | '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' }; |
137 | 0 | static MVMGrapheme32 synthetic_for(MVMThreadContext *tc, MVMuint8 invalid) { |
138 | 0 | if (invalid > 0x7F) { |
139 | 0 | /* A real invalid. */ |
140 | 0 | MVMuint8 high = invalid >> 4; |
141 | 0 | MVMuint8 low = invalid & 0x0F; |
142 | 0 | MVMCodepoint cps[] = { 0x10FFFD, 'x', hex_chars[high], hex_chars[low] }; |
143 | 0 | return MVM_nfg_codes_to_grapheme_utf8_c8(tc, cps, 4); |
144 | 0 | } |
145 | 0 | else { |
146 | 0 | /* Was in things thrown out as invalid by the decoder, but has an |
147 | 0 | * ASCII interpretation, so hand it back as is. */ |
148 | 0 | return invalid; |
149 | 0 | } |
150 | 0 | } |
151 | | |
/* The kind of byte the UTF8-C8 decoder wants to see next: the first byte of
 * a sequence, or a continuation byte of the sequence in progress. */
typedef enum {
    EXPECT_START,
    EXPECT_CONTINUATION
} Expecting;
157 | | |
158 | | /* Decode state for the UTF8-C8 decoder. */ |
159 | | typedef struct { |
160 | | /* The UTF-8 we're decoding. */ |
161 | | const MVMuint8 *utf8; |
162 | | |
163 | | /* The index of the current byte we're decoding. */ |
164 | | size_t cur_byte; |
165 | | |
166 | | /* The index of the first unaccepted byte. */ |
167 | | size_t unaccepted_start; |
168 | | |
169 | | /* What kind of byte we're expecting next. */ |
170 | | Expecting expecting; |
171 | | |
172 | | /* The current codepoint we're decoding. */ |
173 | | MVMCodepoint cur_codepoint; |
174 | | |
175 | | /* The result buffer we're decoding into. */ |
176 | | MVMGrapheme32 *result; |
177 | | |
178 | | /* The current position in the result buffer. */ |
179 | | size_t result_pos; |
180 | | |
181 | | /* Buffer of original codepoints, to ensure we will not spit out any |
182 | | * synthetics into the result that will re-order on round-trip. */ |
183 | | MVMCodepoint *orig_codes; |
184 | | |
185 | | /* Position we're at in inserting into orig_codes. */ |
186 | | size_t orig_codes_pos; |
187 | | |
188 | | /* First orig_codes index that did not yet go through the normalizer. */ |
189 | | size_t orig_codes_unnormalized; |
190 | | |
191 | | /* The normalizer we're using to make synthetics that will not cause an |
192 | | * order change on output. */ |
193 | | MVMNormalizer norm; |
194 | | |
195 | | /* Bad bytes from an earlier buffer, for the sake of streaming decode. */ |
196 | | MVMuint8 prev_bad_bytes[4]; |
197 | | MVMint32 num_prev_bad_bytes; |
198 | | } DecodeState; |
199 | | |
200 | | /* Appends a single grapheme to the buffer if it will not cause a mismatch |
201 | | * with the original codepoints upon encoding back to UTF-8. Returns non-zero |
202 | | * in this case. Otherwise, appends synthetics for the bytes the original code |
203 | | * points were encoded as. Since we can end up with index mis-matches, we just |
204 | | * spit out codepoints to catch the normalizer up to everything in the orig |
205 | | * codes buffer. */ |
206 | 624k | static int append_grapheme(MVMThreadContext *tc, DecodeState *state, MVMGrapheme32 g) { |
207 | 624k | if (g == state->orig_codes[state->orig_codes_unnormalized]) { |
208 | 624k | /* Easy case: exact match. */ |
209 | 624k | state->result[state->result_pos++] = g; |
210 | 624k | state->orig_codes_unnormalized++; |
211 | 624k | return 1; |
212 | 624k | } |
213 | 0 | else if (g < 0) { |
214 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
215 | 0 | int mismatch = 0; |
216 | 0 | if (synth->base == state->orig_codes[state->orig_codes_unnormalized]) { |
217 | 0 | MVMint32 i; |
218 | 0 | for (i = 0; i < synth->num_combs; i++) { |
219 | 0 | size_t orig_idx = state->orig_codes_unnormalized + i + 1; |
220 | 0 | if (orig_idx >= state->orig_codes_pos || |
221 | 0 | state->orig_codes[orig_idx] != synth->combs[i]) { |
222 | 0 | mismatch = 1; |
223 | 0 | break; |
224 | 0 | } |
225 | 0 | } |
226 | 0 | } |
227 | 0 | else { |
228 | 0 | mismatch = 1; |
229 | 0 | } |
230 | 0 | if (!mismatch) { |
231 | 0 | state->result[state->result_pos++] = g; |
232 | 0 | state->orig_codes_unnormalized += 1 + synth->num_combs; |
233 | 0 | return 1; |
234 | 0 | } |
235 | 0 | } |
236 | 624k | |
237 | 624k | /* If we get here, then normalization would trash the original bytes. */ |
238 | 0 | { |
239 | 0 | /* Spit out synthetics to keep the bytes as is. */ |
240 | 0 | size_t i, j; |
241 | 0 | for (i = state->orig_codes_unnormalized; i < state->orig_codes_pos; i++) { |
242 | 0 | MVMCodepoint to_encode = state->orig_codes[i]; |
243 | 0 | MVMuint8 encoded[4]; |
244 | 0 | MVMint32 bytes = utf8_encode(encoded, to_encode); |
245 | 0 | for (j = 0; j < bytes; j++) |
246 | 0 | state->result[state->result_pos++] = synthetic_for(tc, encoded[j]); |
247 | 0 | } |
248 | 0 |
|
249 | 0 | /* Consider all codes pushed now normalized. */ |
250 | 0 | state->orig_codes_unnormalized = state->orig_codes_pos; |
251 | 0 |
|
252 | 0 | /* Put a clean normalizer in place. */ |
253 | 0 | MVM_unicode_normalizer_cleanup(tc, &(state->norm)); |
254 | 0 | MVM_unicode_normalizer_init(tc, &(state->norm), MVM_NORMALIZE_NFG); |
255 | 0 | return 0; |
256 | 624k | } |
257 | 624k | } |
258 | | |
259 | | /* Called when decoding has reached an acceptable codepoint. */ |
260 | 624k | static void process_ok_codepoint(MVMThreadContext *tc, DecodeState *state) { |
261 | 624k | MVMint32 ready; |
262 | 624k | MVMGrapheme32 g; |
263 | 624k | |
264 | 624k | /* Consider the byte range accepted. */ |
265 | 624k | state->unaccepted_start = state->cur_byte + 1; |
266 | 624k | |
267 | 624k | /* Insert into original codepoints list and hand it to the normalizer. */ |
268 | 624k | state->orig_codes[state->orig_codes_pos++] = state->cur_codepoint; |
269 | 624k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, |
270 | 624k | &(state->norm), state->cur_codepoint, &g); |
271 | 624k | |
272 | 624k | /* If the normalizer produced some output... */ |
273 | 624k | if (ready) { |
274 | 608k | if (append_grapheme(tc, state, g)) { |
275 | 608k | while (--ready > 0) { |
276 | 0 | g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm)); |
277 | 0 | if (!append_grapheme(tc, state, g)) |
278 | 0 | break; |
279 | 0 | } |
280 | 608k | } |
281 | 608k | } |
282 | 624k | |
283 | 624k | /* We've no longer any bad bytes to care about from earlier buffers; |
284 | 624k | * they ended up making an acceptable codepoint. */ |
285 | 624k | state->num_prev_bad_bytes = 0; |
286 | 624k | } |
287 | | |
288 | | /* Called when a bad byte has been encountered, or at the end of output. */ |
289 | 15.6k | static void process_bad_bytes(MVMThreadContext *tc, DecodeState *state) { |
290 | 15.6k | size_t i; |
291 | 15.6k | MVMint32 ready; |
292 | 15.6k | |
293 | 15.6k | /* Flush normalization buffer and take from that. */ |
294 | 15.6k | MVM_unicode_normalizer_eof(tc, &(state->norm)); |
295 | 15.6k | ready = MVM_unicode_normalizer_available(tc, &(state->norm)); |
296 | 31.2k | while (ready-- > 0) { |
297 | 15.6k | MVMGrapheme32 g = MVM_unicode_normalizer_get_grapheme(tc, &(state->norm)); |
298 | 15.6k | if (!append_grapheme(tc, state, g)) |
299 | 0 | break; |
300 | 15.6k | } |
301 | 15.6k | |
302 | 15.6k | /* Now add in synthetics for bad bytes. */ |
303 | 15.6k | for (i = 0; i < state->num_prev_bad_bytes; i++) |
304 | 0 | state->result[state->result_pos++] = synthetic_for(tc, state->prev_bad_bytes[i]); |
305 | 15.6k | state->num_prev_bad_bytes = 0; |
306 | 15.6k | for (i = state->unaccepted_start; i <= state->cur_byte; i++) |
307 | 0 | state->result[state->result_pos++] = synthetic_for(tc, state->utf8[i]); |
308 | 15.6k | state->unaccepted_start = state->cur_byte + 1; |
309 | 15.6k | } |
310 | | |
311 | | /* Decodes the specified number of bytes of utf8 into an NFG string, creating |
312 | | * a result of the specified type. The type must have the MVMString REPR. */ |
313 | | MVMString * MVM_string_utf8_c8_decode(MVMThreadContext *tc, const MVMObject *result_type, |
314 | 15.6k | const char *utf8, size_t bytes) { |
315 | 15.6k | DecodeState state; |
316 | 15.6k | |
317 | 15.6k | /* Local state for decode loop. */ |
318 | 15.6k | int expected_continuations = 0; |
319 | 15.6k | int min_expected_codepoint; |
320 | 15.6k | |
321 | 15.6k | /* Don't do anything if empty. */ |
322 | 15.6k | if (bytes == 0) |
323 | 0 | return tc->instance->str_consts.empty; |
324 | 15.6k | |
325 | 15.6k | /* Decoding state, in a struct to easily pass to utility routines. |
326 | 15.6k | * Result buffer is a maximum estimate to avoid realloc; we can shrink |
327 | 15.6k | * it at the end. */ |
328 | 15.6k | state.utf8 = (MVMuint8 *)utf8; |
329 | 15.6k | state.cur_byte = 0; |
330 | 15.6k | state.unaccepted_start = 0; |
331 | 15.6k | state.expecting = EXPECT_START; |
332 | 15.6k | state.cur_codepoint = 0; |
333 | 15.6k | state.result = MVM_malloc(sizeof(MVMGrapheme32) * bytes); |
334 | 15.6k | state.result_pos = 0; |
335 | 15.6k | state.orig_codes = MVM_malloc(sizeof(MVMCodepoint) * bytes); |
336 | 15.6k | state.orig_codes_pos = 0; |
337 | 15.6k | state.orig_codes_unnormalized = 0; |
338 | 15.6k | state.num_prev_bad_bytes = 0; |
339 | 15.6k | MVM_unicode_normalizer_init(tc, &(state.norm), MVM_NORMALIZE_NFG); |
340 | 15.6k | |
341 | 639k | while (state.cur_byte < bytes) { |
342 | 624k | MVMuint8 decode_byte = utf8[state.cur_byte]; |
343 | 624k | switch (state.expecting) { |
344 | 624k | case EXPECT_START: |
345 | 624k | if ((decode_byte & 0x80) == 0) { |
346 | 624k | /* Single byte sequence. */ |
347 | 624k | state.cur_codepoint = decode_byte; |
348 | 624k | process_ok_codepoint(tc, &state); |
349 | 624k | } |
350 | 0 | else if ((decode_byte & 0xE0) == 0xC0) { |
351 | 0 | state.cur_codepoint = decode_byte & 0x1F; |
352 | 0 | state.expecting = EXPECT_CONTINUATION; |
353 | 0 | expected_continuations = 1; |
354 | 0 | min_expected_codepoint = 0x80; |
355 | 0 | } |
356 | 0 | else if ((decode_byte & 0xF0) == 0xE0) { |
357 | 0 | state.cur_codepoint = decode_byte & 0x0F; |
358 | 0 | state.expecting = EXPECT_CONTINUATION; |
359 | 0 | expected_continuations = 2; |
360 | 0 | min_expected_codepoint = 0x800; |
361 | 0 | } |
362 | 0 | else if ((decode_byte & 0xF8) == 0xF0) { |
363 | 0 | state.cur_codepoint = decode_byte & 0x07; |
364 | 0 | state.expecting = EXPECT_CONTINUATION; |
365 | 0 | expected_continuations = 3; |
366 | 0 | min_expected_codepoint = 0x10000; |
367 | 0 | } |
368 | 0 | else { |
369 | 0 | /* Invalid byte sequence. */ |
370 | 0 | process_bad_bytes(tc, &state); |
371 | 0 | } |
372 | 624k | break; |
373 | 0 | case EXPECT_CONTINUATION: |
374 | 0 | if ((decode_byte & 0xC0) == 0x80) { |
375 | 0 | state.cur_codepoint = (state.cur_codepoint << 6) |
376 | 0 | | (decode_byte & 0x3F); |
377 | 0 | expected_continuations--; |
378 | 0 | if (expected_continuations == 0) { |
379 | 0 | if (state.cur_codepoint >= min_expected_codepoint) |
380 | 0 | process_ok_codepoint(tc, &state); |
381 | 0 | else |
382 | 0 | process_bad_bytes(tc, &state); |
383 | 0 | state.expecting = EXPECT_START; |
384 | 0 | } |
385 | 0 | } |
386 | 0 | else { |
387 | 0 | /* Invalid byte sequence. */ |
388 | 0 | process_bad_bytes(tc, &state); |
389 | 0 | state.expecting = EXPECT_START; |
390 | 0 | } |
391 | 0 | break; |
392 | 624k | } |
393 | 624k | state.cur_byte++; |
394 | 624k | } |
395 | 15.6k | |
396 | 15.6k | /* Handle anything dangling off the end. */ |
397 | 15.6k | state.cur_byte--; /* So we don't read 1 past the end. */ |
398 | 15.6k | process_bad_bytes(tc, &state); |
399 | 15.6k | |
400 | 15.6k | MVM_free(state.orig_codes); |
401 | 15.6k | MVM_unicode_normalizer_cleanup(tc, &(state.norm)); |
402 | 15.6k | |
403 | 15.6k | { |
404 | 15.6k | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
405 | 15.6k | result->body.storage.blob_32 = state.result; |
406 | 15.6k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
407 | 15.6k | result->body.num_graphs = state.result_pos; |
408 | 15.6k | return result; |
409 | 15.6k | } |
410 | 15.6k | } |
411 | | |
412 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
413 | | * buffers, or until a stopper is reached. */ |
414 | | MVMuint32 MVM_string_utf8_c8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
415 | | const MVMint32 *stopper_chars, |
416 | | MVMDecodeStreamSeparators *seps, |
417 | 0 | MVMint32 eof) { |
418 | 0 | /* Local state for decode loop. */ |
419 | 0 | MVMDecodeStreamBytes *cur_bytes; |
420 | 0 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
421 | 0 | MVMint32 last_accept_pos = ds->bytes_head_pos; |
422 | 0 | DecodeState state; |
423 | 0 | int expected_continuations = 0; |
424 | 0 | int min_expected_codepoint; |
425 | 0 | MVMuint32 reached_stopper = 0; |
426 | 0 | MVMint32 result_graphs = 0; |
427 | 0 |
|
428 | 0 | /* If there's no buffers, we're done. */ |
429 | 0 | if (!ds->bytes_head) |
430 | 0 | return 0; |
431 | 0 | last_accept_pos = ds->bytes_head_pos; |
432 | 0 |
|
433 | 0 | /* If we're asked for zero chars, also done. */ |
434 | 0 | if (stopper_chars && *stopper_chars == 0) |
435 | 0 | return 1; |
436 | 0 |
|
437 | 0 | /* Otherwise set up decode state, stealing normalizer of the decode |
438 | 0 | * stream and re-instating any past orig_codes. */ |
439 | 0 | state.expecting = EXPECT_START; |
440 | 0 | state.cur_codepoint = 0; |
441 | 0 | state.num_prev_bad_bytes = 0; |
442 | 0 | memcpy(&(state.norm), &(ds->norm), sizeof(MVMNormalizer)); |
443 | 0 | if (ds->decoder_state) { |
444 | 0 | MVMCodepoint *saved = (MVMCodepoint *)ds->decoder_state; |
445 | 0 | state.orig_codes = MVM_malloc( |
446 | 0 | sizeof(MVMCodepoint) * (saved[0] + ds->bytes_head->length) |
447 | 0 | ); |
448 | 0 | state.orig_codes_pos = saved[0]; |
449 | 0 | state.orig_codes_unnormalized = 0; |
450 | 0 | memcpy(state.orig_codes, saved + 1, saved[0] * sizeof(MVMCodepoint)); |
451 | 0 | MVM_free(ds->decoder_state); |
452 | 0 | ds->decoder_state = NULL; |
453 | 0 | } |
454 | 0 | else { |
455 | 0 | state.orig_codes = NULL; |
456 | 0 | state.orig_codes_pos = 0; |
457 | 0 | state.orig_codes_unnormalized = 0; |
458 | 0 | } |
459 | 0 |
|
460 | 0 | /* Decode each of the buffers. */ |
461 | 0 | cur_bytes = ds->bytes_head; |
462 | 0 | reached_stopper = 0; |
463 | 0 | while (cur_bytes && !reached_stopper) { |
464 | 0 | /* Set up decode state for this buffer. */ |
465 | 0 | MVMint32 bytes = ds->bytes_head->length; |
466 | 0 | state.result = MVM_malloc(bytes * sizeof(MVMGrapheme32)); |
467 | 0 | state.orig_codes = MVM_realloc(state.orig_codes, |
468 | 0 | sizeof(MVMCodepoint) * (state.orig_codes_pos + bytes)); |
469 | 0 | state.result_pos = 0; |
470 | 0 | state.utf8 = (const MVMuint8*)cur_bytes->bytes; |
471 | 0 | state.cur_byte = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
472 | 0 | state.unaccepted_start = state.cur_byte; |
473 | 0 |
|
474 | 0 | /* Process this buffer. */ |
475 | 0 | while (state.cur_byte < bytes) { |
476 | 0 | /* Process a byte. */ |
477 | 0 | MVMuint8 decode_byte = state.utf8[state.cur_byte]; |
478 | 0 | MVMint32 maybe_new_graph = 0; |
479 | 0 | switch (state.expecting) { |
480 | 0 | case EXPECT_START: |
481 | 0 | if ((decode_byte & 0x80) == 0) { |
482 | 0 | /* Single byte sequence. */ |
483 | 0 | state.cur_codepoint = decode_byte; |
484 | 0 | process_ok_codepoint(tc, &state); |
485 | 0 | maybe_new_graph = 1; |
486 | 0 | } |
487 | 0 | else if ((decode_byte & 0xE0) == 0xC0) { |
488 | 0 | state.cur_codepoint = decode_byte & 0x1F; |
489 | 0 | state.expecting = EXPECT_CONTINUATION; |
490 | 0 | expected_continuations = 1; |
491 | 0 | min_expected_codepoint = 0x80; |
492 | 0 | } |
493 | 0 | else if ((decode_byte & 0xF0) == 0xE0) { |
494 | 0 | state.cur_codepoint = decode_byte & 0x0F; |
495 | 0 | state.expecting = EXPECT_CONTINUATION; |
496 | 0 | expected_continuations = 2; |
497 | 0 | min_expected_codepoint = 0x800; |
498 | 0 | } |
499 | 0 | else if ((decode_byte & 0xF8) == 0xF0) { |
500 | 0 | state.cur_codepoint = decode_byte & 0x07; |
501 | 0 | state.expecting = EXPECT_CONTINUATION; |
502 | 0 | expected_continuations = 3; |
503 | 0 | min_expected_codepoint = 0x10000; |
504 | 0 | } |
505 | 0 | else { |
506 | 0 | /* Invalid byte sequence. */ |
507 | 0 | process_bad_bytes(tc, &state); |
508 | 0 | maybe_new_graph = 1; |
509 | 0 | } |
510 | 0 | break; |
511 | 0 | case EXPECT_CONTINUATION: |
512 | 0 | if ((decode_byte & 0xC0) == 0x80) { |
513 | 0 | state.cur_codepoint = (state.cur_codepoint << 6) |
514 | 0 | | (decode_byte & 0x3F); |
515 | 0 | expected_continuations--; |
516 | 0 | if (expected_continuations == 0) { |
517 | 0 | if (state.cur_codepoint >= min_expected_codepoint) |
518 | 0 | process_ok_codepoint(tc, &state); |
519 | 0 | else |
520 | 0 | process_bad_bytes(tc, &state); |
521 | 0 | maybe_new_graph = 1; |
522 | 0 | state.expecting = EXPECT_START; |
523 | 0 | } |
524 | 0 | } |
525 | 0 | else { |
526 | 0 | /* Invalid byte sequence. */ |
527 | 0 | process_bad_bytes(tc, &state); |
528 | 0 | maybe_new_graph = 1; |
529 | 0 | state.expecting = EXPECT_START; |
530 | 0 | } |
531 | 0 | break; |
532 | 0 | } |
533 | 0 | state.cur_byte++; |
534 | 0 |
|
535 | 0 | /* See if we've reached a stopper. */ |
536 | 0 | if (maybe_new_graph && state.result_pos > 0) { |
537 | 0 | if (stopper_chars) { |
538 | 0 | if (result_graphs + state.result_pos >= *stopper_chars) { |
539 | 0 | reached_stopper = 1; |
540 | 0 | break; |
541 | 0 | } |
542 | 0 | } |
543 | 0 | if (MVM_string_decode_stream_maybe_sep(tc, seps, |
544 | 0 | state.result[state.result_pos - 1])) { |
545 | 0 | reached_stopper = 1; |
546 | 0 | break; |
547 | 0 | } |
548 | 0 | } |
549 | 0 | } |
550 | 0 |
|
551 | 0 | /* If we're at EOF and this is the last buffer, force out last bytes. */ |
552 | 0 | if (eof && !reached_stopper && !cur_bytes->next) { |
553 | 0 | state.cur_byte--; /* So we don't read 1 past the end. */ |
554 | 0 | process_bad_bytes(tc, &state); |
555 | 0 | } |
556 | 0 |
|
557 | 0 | /* Attach what we successfully parsed as a result buffer, and trim away |
558 | 0 | * what we chewed through. */ |
559 | 0 | if (state.result_pos) |
560 | 0 | MVM_string_decodestream_add_chars(tc, ds, state.result, state.result_pos); |
561 | 0 | else |
562 | 0 | MVM_free(state.result); |
563 | 0 | result_graphs += state.result_pos; |
564 | 0 |
|
565 | 0 | /* Update our accepted position. */ |
566 | 0 | if (state.unaccepted_start > 0) { |
567 | 0 | last_accept_bytes = cur_bytes; |
568 | 0 | last_accept_pos = state.unaccepted_start; |
569 | 0 | } |
570 | 0 |
|
571 | 0 | /* If there were bytes we didn't accept, hold on to them in case we |
572 | 0 | * need to emit them as bad bytes. */ |
573 | 0 | if (state.unaccepted_start != state.cur_byte && cur_bytes->next) { |
574 | 0 | int i; |
575 | 0 | for (i = state.unaccepted_start; i < state.cur_byte; i++) |
576 | 0 | state.prev_bad_bytes[state.num_prev_bad_bytes++] = state.utf8[i]; |
577 | 0 | } |
578 | 0 |
|
579 | 0 | cur_bytes = cur_bytes->next; |
580 | 0 | } |
581 | 0 |
|
582 | 0 | /* Eat the bytes we decoded. */ |
583 | 0 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
584 | 0 |
|
585 | 0 | /* Persist current normalizer. */ |
586 | 0 | memcpy(&(ds->norm), &(state.norm), sizeof(MVMNormalizer)); |
587 | 0 |
|
588 | 0 | /* Stash away any leftover codepoints we'll need to examine. */ |
589 | 0 | if (state.orig_codes_pos && state.orig_codes_pos != state.orig_codes_unnormalized) { |
590 | 0 | size_t diff = state.orig_codes_pos - state.orig_codes_unnormalized; |
591 | 0 | MVMCodepoint *saved = MVM_malloc(sizeof(MVMCodepoint) * (1 + diff)); |
592 | 0 | saved[0] = diff; |
593 | 0 | memcpy(saved + 1, state.orig_codes + state.orig_codes_unnormalized, |
594 | 0 | diff * sizeof(MVMCodepoint)); |
595 | 0 | ds->decoder_state = saved; |
596 | 0 | } |
597 | 0 | MVM_free(state.orig_codes); |
598 | 0 |
|
599 | 0 | return reached_stopper; |
600 | 0 | } |
601 | | |
602 | | /* Encodes the specified string to UTF-8. */ |
603 | | static void emit_cp(MVMThreadContext *tc, MVMCodepoint cp, MVMuint8 **result, |
604 | | size_t *result_pos, size_t *result_limit, |
605 | 216k | MVMuint8 *repl_bytes, MVMuint64 repl_length) { |
606 | 216k | MVMint32 bytes; |
607 | 216k | if (*result_pos >= *result_limit) { |
608 | 0 | *result_limit *= 2; |
609 | 0 | *result = MVM_realloc(*result, *result_limit + 4); |
610 | 0 | } |
611 | 216k | bytes = utf8_encode(*result + *result_pos, cp); |
612 | 216k | if (bytes) |
613 | 216k | *result_pos += bytes; |
614 | 0 | else if (repl_bytes) { |
615 | 0 | if (repl_length >= *result_limit || *result_pos >= *result_limit - repl_length) { |
616 | 0 | *result_limit += repl_length; |
617 | 0 | *result = MVM_realloc(*result, *result_limit + 4); |
618 | 0 | } |
619 | 0 | memcpy(*result + *result_pos, repl_bytes, repl_length); |
620 | 0 | *result_pos += repl_length; |
621 | 0 | } |
622 | 0 | else { |
623 | 0 | MVM_free(*result); |
624 | 0 | MVM_free(repl_bytes); |
625 | 0 | MVM_string_utf8_throw_encoding_exception(tc, cp); |
626 | 0 | } |
627 | 216k | } |
628 | 0 | static int hex2int(MVMThreadContext *tc, MVMCodepoint cp) { |
629 | 0 | if (cp >= '0' && cp <= '9') |
630 | 0 | return cp - '0'; |
631 | 0 | else if (cp >= 'A' && cp <= 'F') |
632 | 0 | return 10 + (cp - 'A'); |
633 | 0 | else |
634 | 0 | MVM_exception_throw_adhoc(tc, "UTF-8 C-8 encoding encountered corrupt synthetic"); |
635 | 0 | } |
636 | | char * MVM_string_utf8_c8_encode_substr(MVMThreadContext *tc, |
637 | 10.5k | MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement) { |
638 | 10.5k | MVMuint8 *result; |
639 | 10.5k | size_t result_pos, result_limit; |
640 | 10.5k | MVMGraphemeIter gi; |
641 | 10.5k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
642 | 10.5k | MVMuint8 *repl_bytes = NULL; |
643 | 10.5k | MVMuint64 repl_length; |
644 | 10.5k | |
645 | 10.5k | if (start < 0 || start > strgraphs) |
646 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
647 | 10.5k | if (length == -1) |
648 | 0 | length = strgraphs; |
649 | 10.5k | if (length < 0 || start + length > strgraphs) |
650 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
651 | 10.5k | |
652 | 10.5k | if (replacement) |
653 | 0 | repl_bytes = (MVMuint8 *) MVM_string_utf8_c8_encode_substr(tc, replacement, &repl_length, 0, -1, NULL); |
654 | 10.5k | |
655 | 10.5k | /* Guesstimate that we'll be within 2 bytes for most chars most of the |
656 | 10.5k | * time, and give ourselves 4 bytes breathing space. */ |
657 | 10.5k | result_limit = 2 * length; |
658 | 10.5k | result = MVM_malloc(result_limit + 4); |
659 | 10.5k | result_pos = 0; |
660 | 10.5k | |
661 | 10.5k | /* We iterate graphemes, looking out for any synthetics. If we find a |
662 | 10.5k | * UTF-8 C-8 synthetic, then we spit out the raw byte. If we find any |
663 | 10.5k | * other synthetic, we iterate its codepoints. */ |
664 | 10.5k | MVM_string_gi_init(tc, &gi, str); |
665 | 226k | while (MVM_string_gi_has_more(tc, &gi)) { |
666 | 216k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
667 | 216k | if (g >= 0) { |
668 | 216k | emit_cp(tc, g, &result, &result_pos, &result_limit, repl_bytes, repl_length); |
669 | 216k | } |
670 | 0 | else { |
671 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
672 | 0 | if (synth->is_utf8_c8) { |
673 | 0 | /* UTF-8 C-8 synthetic; emit the byte. */ |
674 | 0 | if (result_pos >= result_limit) { |
675 | 0 | result_limit *= 2; |
676 | 0 | result = MVM_realloc(result, result_limit + 1); |
677 | 0 | } |
678 | 0 | result[result_pos++] = (hex2int(tc, synth->combs[1]) << 4) + |
679 | 0 | hex2int(tc, synth->combs[2]); |
680 | 0 | } |
681 | 0 | else { |
682 | 0 | MVMint32 i; |
683 | 0 | emit_cp(tc, synth->base, &result, &result_pos, &result_limit, repl_bytes, repl_length); |
684 | 0 | for (i = 0; i < synth->num_combs; i++) |
685 | 0 | emit_cp(tc, synth->combs[i], &result, &result_pos, &result_limit, repl_bytes, repl_length); |
686 | 0 | } |
687 | 0 | } |
688 | 216k | } |
689 | 10.5k | |
690 | 10.5k | if (output_size) |
691 | 10.5k | *output_size = (MVMuint64)result_pos; |
692 | 10.5k | MVM_free(repl_bytes); |
693 | 10.5k | return (char *)result; |
694 | 10.5k | } |
695 | | |
696 | | /* Encodes the specified string to UTF-8 C-8. */ |
697 | 10.5k | char * MVM_string_utf8_c8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size) { |
698 | 10.5k | return MVM_string_utf8_c8_encode_substr(tc, str, output_size, 0, |
699 | 10.5k | MVM_string_graphs(tc, str), NULL); |
700 | 10.5k | } |
701 | | |
702 | | /* Encodes the specified string to a UTF-8 C-8 C string. */ |
703 | 10.5k | char * MVM_string_utf8_c8_encode_C_string(MVMThreadContext *tc, MVMString *str) { |
704 | 10.5k | MVMuint64 output_size; |
705 | 10.5k | char *result; |
706 | 10.5k | char *utf8_string = MVM_string_utf8_c8_encode(tc, str, &output_size); |
707 | 10.5k | result = MVM_malloc(output_size + 1); |
708 | 10.5k | memcpy(result, utf8_string, output_size); |
709 | 10.5k | MVM_free(utf8_string); |
710 | 10.5k | result[output_size] = (char)0; |
711 | 10.5k | return result; |
712 | 10.5k | } |