/home/travis/build/MoarVM/MoarVM/src/strings/utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* The below section has an MIT-style license, included here. |
4 | | |
5 | | // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
6 | | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
7 | | * |
8 | | * Permission is hereby granted, free of charge, to any person obtaining |
9 | | * a copy of this software and associated documentation files (the |
10 | | * "Software"), to deal in the Software without restriction, including |
11 | | * without limitation the rights to use, copy, modify, merge, publish, |
12 | | * distribute, sublicense, and/or sell copies of the Software, and to |
13 | | * permit persons to whom the Software is furnished to do so, subject |
14 | | * to the following conditions: |
15 | | * |
16 | | * The above copyright notice and this permission notice shall be |
17 | | * included in all copies or substantial portions of the Software. |
18 | | * |
19 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
20 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
21 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
22 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
23 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
24 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
25 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
26 | | * SOFTWARE. |
27 | | */ |
28 | 1.52M | #define UTF8_ACCEPT 0 /* automaton state meaning "a complete codepoint was just decoded"; also the start state for a new sequence */ |
29 | 0 | #define UTF8_REJECT 12 /* automaton state meaning "a malformed sequence was found" */ |
30 | | |
31 | | static const MVMuint8 utf8d[] = { |
32 | | // The first part of the table maps bytes to character classes, which |
33 | | // reduces the size of the transition table and creates bitmasks. |
34 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
35 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
36 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
37 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
38 | | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
39 | | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
40 | | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
41 | | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
42 | | |
43 | | // The second part is a transition table that maps a combination |
44 | | // of a state of the automaton and a character class to a state. States are stored as multiples of 12 (one 12-entry row per state), so the lookup index is 256 + state + class. |
45 | | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, |
46 | | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, |
47 | | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, |
48 | | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, |
49 | | 12,36,12,12,12,12,12,12,12,12,12,12, |
50 | | }; |
51 | | |
52 | | static MVMint32 |
53 | 801k | decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) { |
54 | 801k | MVMint32 type = utf8d[byte]; |
55 | 801k | |
56 | 801k | *codep = (*state != UTF8_ACCEPT) ? |
57 | 107k | (byte & 0x3fu) | (*codep << 6) : |
58 | 694k | (0xff >> type) & (byte); |
59 | 801k | |
60 | 801k | *state = utf8d[256 + *state + type]; |
61 | 801k | return *state; |
62 | 801k | } |
63 | | /* end Bjoern Hoehrmann section (some things were changed from the original) */ |
64 | | |
65 | | /* begin not_gerd section (modified from original) |
66 | | // Copyright 2012 not_gerd |
67 | | // see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122 |
68 | | |
69 | | Permission is granted to use, modify, and / or redistribute at will. |
70 | | |
71 | | This includes removing authorship notices, re-use of code parts in |
72 | | other software (with or without giving credit), and / or creating a |
73 | | commercial product based on it. |
74 | | |
75 | | This permission is not revocable by the author. |
76 | | |
77 | | This software is provided as-is. Use it at your own risk. There is |
78 | | no warranty whatsoever, neither expressed nor implied, and by using |
79 | | this software you accept that the author(s) shall not be held liable |
80 | | for any loss of data, loss of service, or other damages, be they |
81 | | incidental or consequential. Your only option other than accepting |
82 | | this is not to use the software at all. |
83 | | */ |
84 | | |
/* Bit flags returned by classify(): the low five bits describe what kind of
 * codepoint it is, the next four how many UTF-8 bytes it occupies. */
enum {
    CP_CHAR           = 0x001,
    CP_LOW_SURROGATE  = 0x002,
    CP_HIGH_SURROGATE = 0x004,
    CP_NONCHAR        = 0x008,
    CP_OVERFLOW       = 0x010,

    U8_SINGLE         = 0x020,
    U8_DOUBLE         = 0x040,
    U8_TRIPLE         = 0x080,
    U8_QUAD           = 0x100
};
97 | | |
98 | 1.10M | static unsigned classify(MVMCodepoint cp) { |
99 | 1.10M | if(cp <= 0x7F) |
100 | 1.10M | return CP_CHAR | U8_SINGLE; |
101 | 1.10M | |
102 | 3.61k | if(cp <= 0x07FF) |
103 | 2.98k | return CP_CHAR | U8_DOUBLE; |
104 | 3.61k | |
105 | 632 | if(0xD800 <= cp && cp <= 0xDBFF) |
106 | 0 | return CP_HIGH_SURROGATE | U8_TRIPLE; |
107 | 632 | |
108 | 632 | if(0xDC00 <= cp && cp <= 0xDFFF) |
109 | 0 | return CP_LOW_SURROGATE | U8_TRIPLE; |
110 | 632 | |
111 | 632 | if(0xFDD0 <= cp && cp <= 0xFDEF) |
112 | 0 | return CP_NONCHAR | U8_TRIPLE; |
113 | 632 | |
114 | 632 | if(cp <= 0xFFFD) |
115 | 621 | return CP_CHAR | U8_TRIPLE; |
116 | 632 | |
117 | 11 | if(cp == 0xFFFE || cp == 0xFFFF) |
118 | 0 | return CP_NONCHAR | U8_TRIPLE; |
119 | 11 | |
120 | 11 | if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF)) |
121 | 0 | return CP_NONCHAR | U8_QUAD; |
122 | 11 | |
123 | 11 | if(cp <= 0x10FFFF) |
124 | 11 | return CP_CHAR | U8_QUAD; |
125 | 11 | |
126 | 0 | if(cp <= 0x1FFFFF) |
127 | 0 | return CP_OVERFLOW | U8_QUAD; |
128 | 0 |
|
129 | 0 | return 0; |
130 | 0 | } |
131 | | |
132 | 1.10M | static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) { |
133 | 1.10M | unsigned cc = classify(cp); |
134 | 1.10M | |
135 | 1.10M | if (!(cc & (CP_CHAR | CP_NONCHAR))) |
136 | 0 | return 0; |
137 | 1.10M | |
138 | 1.10M | if (cc & U8_SINGLE) { |
139 | 1.10M | bp[0] = (MVMuint8)cp; |
140 | 1.10M | return 1; |
141 | 1.10M | } |
142 | 1.10M | |
143 | 3.61k | if (cc & U8_DOUBLE) { |
144 | 2.98k | bp[0] = (MVMuint8)(( 6 << 5) | (cp >> 6)); |
145 | 2.98k | bp[1] = (MVMuint8)(( 2 << 6) | (cp & 0x3F)); |
146 | 2.98k | return 2; |
147 | 2.98k | } |
148 | 3.61k | |
149 | 632 | if (cc & U8_TRIPLE) { |
150 | 621 | bp[0] = (MVMuint8)((14 << 4) | (cp >> 12)); |
151 | 621 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
152 | 621 | bp[2] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
153 | 621 | return 3; |
154 | 621 | } |
155 | 632 | |
156 | 11 | if (cc & U8_QUAD) { |
157 | 11 | bp[0] = (MVMuint8)((30 << 3) | (cp >> 18)); |
158 | 11 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 12) & 0x3F)); |
159 | 11 | bp[2] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
160 | 11 | bp[3] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
161 | 11 | return 4; |
162 | 11 | } |
163 | 11 | |
164 | 0 | return 0; |
165 | 11 | } |
166 | | |
167 | | /* end not_gerd section */ |
168 | | |
169 | 0 | #define UTF8_MAXINC (32 * 1024 * 1024) /* decode-buffer growth cap: below this many elements the buffer doubles, at or above it it grows by this many elements per step */ |
170 | | |
171 | | /* Decodes the specified number of bytes of utf8 into an NFG string, creating |
172 | | * a result of the specified type. The type must have the MVMString REPR. */ |
173 | 25.9k | MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) { |
174 | 25.9k | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
175 | 25.9k | MVMint32 count = 0; |
176 | 25.9k | MVMCodepoint codepoint; |
177 | 25.9k | MVMint32 line_ending = 0; |
178 | 25.9k | MVMint32 state = 0; |
179 | 25.9k | MVMint32 bufsize = bytes; |
180 | 25.9k | MVMGrapheme32 lowest_graph = 0x7fffffff; |
181 | 25.9k | MVMGrapheme32 highest_graph = -0x7fffffff; |
182 | 25.9k | MVMGrapheme32 *buffer = MVM_malloc(sizeof(MVMGrapheme32) * bufsize); |
183 | 25.9k | size_t orig_bytes; |
184 | 25.9k | const char *orig_utf8; |
185 | 25.9k | MVMint32 line; |
186 | 25.9k | MVMint32 col; |
187 | 25.9k | MVMint32 ready; |
188 | 25.9k | |
189 | 25.9k | /* Need to normalize to NFG as we decode. */ |
190 | 25.9k | MVMNormalizer norm; |
191 | 25.9k | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
192 | 25.9k | |
193 | 25.9k | orig_bytes = bytes; |
194 | 25.9k | orig_utf8 = utf8; |
195 | 25.9k | |
196 | 343k | for (; bytes; ++utf8, --bytes) { |
197 | 317k | switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) { |
198 | 210k | case UTF8_ACCEPT: { /* got a codepoint */ |
199 | 210k | MVMGrapheme32 g; |
200 | 210k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g); |
201 | 210k | if (ready) { |
202 | 184k | while (count + ready >= bufsize) { /* if the buffer's full make a bigger one */ |
203 | 0 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * ( |
204 | 0 | bufsize >= UTF8_MAXINC ? (bufsize += UTF8_MAXINC) : (bufsize *= 2) |
205 | 0 | )); |
206 | 0 | } |
207 | 184k | buffer[count++] = g; |
208 | 149k | lowest_graph = g < lowest_graph ? g : lowest_graph; |
209 | 102k | highest_graph = g > highest_graph ? g : highest_graph; |
210 | 184k | while (--ready > 0) { |
211 | 299 | g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
212 | 288 | lowest_graph = g < lowest_graph ? g : lowest_graph; |
213 | 283 | highest_graph = g > highest_graph ? g : highest_graph; |
214 | 299 | buffer[count++] = g; |
215 | 299 | } |
216 | 184k | } |
217 | 210k | break; |
218 | 210k | } |
219 | 0 | case UTF8_REJECT: |
220 | 0 | /* found a malformed sequence; parse it again this time tracking |
221 | 0 | * line and col numbers. */ |
222 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); /* Since we'll throw. */ |
223 | 0 | bytes = orig_bytes; utf8 = orig_utf8; state = 0; line = 1; col = 1; |
224 | 0 | for (; bytes; ++utf8, --bytes) { |
225 | 0 | switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) { |
226 | 0 | case UTF8_ACCEPT: |
227 | 0 | /* this could be reorganized into several nested ugly if/else :/ */ |
228 | 0 | if (!line_ending && (codepoint == 10 || codepoint == 13)) { |
229 | 0 | /* Detect the style of line endings. |
230 | 0 | * Select whichever comes first. |
231 | 0 | * First or only part of first line ending. */ |
232 | 0 | line_ending = codepoint; |
233 | 0 | col = 1; line++; |
234 | 0 | } |
235 | 0 | else if (line_ending && codepoint == line_ending) { |
236 | 0 | /* first or only part of next line ending */ |
237 | 0 | col = 1; line++; |
238 | 0 | } |
239 | 0 | else if (codepoint == 10 || codepoint == 13) { |
240 | 0 | /* second part of line ending; ignore */ |
241 | 0 | } |
242 | 0 | else /* non-line ending codepoint */ |
243 | 0 | col++; |
244 | 0 | break; |
245 | 0 | case UTF8_REJECT: |
246 | 0 | MVM_free(buffer); |
247 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col); |
248 | 0 | } |
249 | 0 | } |
250 | 0 | MVM_free(buffer); |
251 | 0 | MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!"); |
252 | 0 | break; |
253 | 317k | } |
254 | 317k | } |
255 | 25.9k | if (state != UTF8_ACCEPT) { |
256 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
257 | 0 | MVM_free(buffer); |
258 | 0 | MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string"); |
259 | 0 | } |
260 | 25.9k | |
261 | 25.9k | /* Get any final graphemes from the normalizer, and clean it up. */ |
262 | 25.9k | MVM_unicode_normalizer_eof(tc, &norm); |
263 | 25.9k | ready = MVM_unicode_normalizer_available(tc, &norm); |
264 | 25.9k | if (ready) { |
265 | 25.8k | if (count + ready >= bufsize) { |
266 | 19.9k | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready)); |
267 | 19.9k | } |
268 | 51.6k | while (ready--) { |
269 | 25.8k | MVMGrapheme32 g; |
270 | 25.8k | g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
271 | 25.4k | lowest_graph = g < lowest_graph ? g : lowest_graph; |
272 | 12.9k | highest_graph = g > highest_graph ? g : highest_graph; |
273 | 25.8k | buffer[count++] = g; |
274 | 25.8k | } |
275 | 25.8k | } |
276 | 25.9k | MVM_unicode_normalizer_cleanup(tc, &norm); |
277 | 25.9k | |
278 | 25.9k | /* If we're lucky, we can fit our string in 8 bits per grapheme. |
279 | 25.9k | * That happens when our lowest value is bigger than -129 and our |
280 | 25.9k | * highest value is lower than 128. */ |
281 | 25.9k | if (lowest_graph >= -128 && highest_graph < 128) { |
282 | 20.0k | MVMGrapheme8 *new_buffer = MVM_malloc(sizeof(MVMGrapheme8) * count); |
283 | 174k | for (ready = 0; ready < count; ready++) { |
284 | 154k | new_buffer[ready] = buffer[ready]; |
285 | 154k | } |
286 | 20.0k | MVM_free(buffer); |
287 | 20.0k | result->body.storage.blob_8 = new_buffer; |
288 | 20.0k | result->body.storage_type = MVM_STRING_GRAPHEME_8; |
289 | 5.97k | } else { |
290 | 5.97k | /* just keep the same buffer as the MVMString's buffer. Later |
291 | 5.97k | * we can add heuristics to resize it if we have enough free |
292 | 5.97k | * memory */ |
293 | 5.97k | if (bufsize - count > 4) { |
294 | 5.35k | buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32)); |
295 | 5.35k | } |
296 | 5.97k | result->body.storage.blob_32 = buffer; |
297 | 5.97k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
298 | 5.97k | } |
299 | 25.9k | result->body.num_graphs = count; |
300 | 25.9k | |
301 | 25.9k | return result; |
302 | 25.9k | } |
303 | | |
304 | 177 | static MVMint32 its_the_bom(const char *utf8) { |
305 | 177 | const MVMuint8 *uns_utf8 = (const MVMuint8 *)utf8; |
306 | 0 | return uns_utf8[0] == 0xEF && uns_utf8[1] == 0xBB && uns_utf8[2] == 0xBF; |
307 | 177 | } |
308 | | |
309 | | /* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */ |
310 | 4 | MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) { |
311 | 4 | if (bytes >= 3 && its_the_bom(utf8)) { |
312 | 0 | utf8 += 3; |
313 | 0 | bytes -= 3; |
314 | 0 | } |
315 | 4 | return MVM_string_utf8_decode(tc, result_type, utf8, bytes); |
316 | 4 | } |
317 | | |
318 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
319 | | * buffers, or until a stopper is reached. */ |
320 | | MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
321 | | const MVMint32 *stopper_chars, |
322 | 242 | MVMDecodeStreamSeparators *seps) { |
323 | 242 | MVMint32 count = 0, total = 0; |
324 | 242 | MVMint32 state = 0; |
325 | 242 | MVMCodepoint codepoint = 0; |
326 | 242 | MVMint32 bufsize; |
327 | 242 | MVMGrapheme32 *buffer; |
328 | 242 | MVMDecodeStreamBytes *cur_bytes; |
329 | 242 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
330 | 242 | MVMint32 last_accept_pos, ready, at_start; |
331 | 242 | MVMuint32 reached_stopper; |
332 | 242 | |
333 | 242 | /* If there's no buffers, we're done. */ |
334 | 242 | if (!ds->bytes_head) |
335 | 24 | return 0; |
336 | 218 | last_accept_pos = ds->bytes_head_pos; |
337 | 218 | |
338 | 218 | /* If we're asked for zero chars, also done. */ |
339 | 218 | if (stopper_chars && *stopper_chars == 0) |
340 | 0 | return 1; |
341 | 218 | |
342 | 218 | /* Rough starting-size estimate is number of bytes in the head buffer. */ |
343 | 218 | bufsize = ds->bytes_head->length; |
344 | 218 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
345 | 218 | |
346 | 218 | /* Decode each of the buffers. */ |
347 | 218 | cur_bytes = ds->bytes_head; |
348 | 218 | at_start = ds->abs_byte_pos == 0; |
349 | 218 | reached_stopper = 0; |
350 | 402 | while (cur_bytes) { |
351 | 219 | /* Process this buffer. */ |
352 | 218 | MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
353 | 219 | char *bytes = cur_bytes->bytes; |
354 | 219 | if (at_start) { |
355 | 180 | /* We're right at the start of the stream of things to decode. See |
356 | 180 | * if we have a BOM, and skip over it if so. */ |
357 | 180 | if (pos + 3 <= cur_bytes->length) { |
358 | 176 | if (its_the_bom(bytes + pos)) { |
359 | 0 | pos += 3; |
360 | 0 | last_accept_bytes = cur_bytes; |
361 | 0 | last_accept_pos = pos; |
362 | 0 | } |
363 | 176 | } |
364 | 180 | at_start = 0; |
365 | 180 | } |
366 | 484k | while (pos < cur_bytes->length) { |
367 | 483k | switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) { |
368 | 483k | case UTF8_ACCEPT: { |
369 | 483k | MVMint32 first = 1; |
370 | 483k | MVMGrapheme32 g; |
371 | 483k | last_accept_bytes = cur_bytes; |
372 | 483k | last_accept_pos = pos; |
373 | 483k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &(ds->norm), codepoint, &g); |
374 | 967k | while (ready--) { |
375 | 483k | if (first) |
376 | 468k | first = 0; |
377 | 483k | else |
378 | 14.9k | g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm)); |
379 | 483k | if (count == bufsize) { |
380 | 0 | /* Valid character, but we filled the buffer. Attach this |
381 | 0 | * one to the buffers linked list, and continue with a new |
382 | 0 | * one. */ |
383 | 0 | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
384 | 0 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
385 | 0 | count = 0; |
386 | 0 | } |
387 | 483k | buffer[count++] = g; |
388 | 483k | total++; |
389 | 483k | if (stopper_chars && *stopper_chars == total) { |
390 | 5 | reached_stopper = 1; |
391 | 5 | goto done; |
392 | 5 | } |
393 | 483k | if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) { |
394 | 30 | reached_stopper = 1; |
395 | 30 | goto done; |
396 | 30 | } |
397 | 483k | } |
398 | 483k | break; |
399 | 483k | } |
400 | 0 | case UTF8_REJECT: |
401 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8"); |
402 | 0 | break; |
403 | 483k | } |
404 | 483k | } |
405 | 184 | cur_bytes = cur_bytes->next; |
406 | 184 | } |
407 | 218 | done: |
408 | 218 | |
409 | 218 | /* Attach what we successfully parsed as a result buffer, and trim away |
410 | 218 | * what we chewed through. */ |
411 | 218 | if (count) { |
412 | 214 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
413 | 214 | } |
414 | 4 | else { |
415 | 4 | MVM_free(buffer); |
416 | 4 | } |
417 | 218 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
418 | 218 | |
419 | 218 | return reached_stopper; |
420 | 218 | } |
421 | | |
422 | | /* Encodes the specified string to UTF-8. */ |
423 | | char * MVM_string_utf8_encode_substr(MVMThreadContext *tc, |
424 | | MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, |
425 | 13.6k | MVMString *replacement, MVMint32 translate_newlines) { |
426 | 13.6k | MVMuint8 *result; |
427 | 13.6k | size_t result_pos, result_limit; |
428 | 13.6k | MVMCodepointIter ci; |
429 | 13.6k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
430 | 13.6k | MVMuint8 *repl_bytes = NULL; |
431 | 13.6k | MVMuint64 repl_length; |
432 | 13.6k | |
433 | 13.6k | if (start < 0 || start > strgraphs) |
434 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
435 | 13.6k | if (length == -1) |
436 | 13.5k | length = strgraphs; |
437 | 13.6k | if (length < 0 || start + length > strgraphs) |
438 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
439 | 13.6k | |
440 | 13.6k | if (replacement) |
441 | 0 | repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc, |
442 | 0 | replacement, &repl_length, 0, -1, NULL, translate_newlines); |
443 | 13.6k | |
444 | 13.6k | /* Guesstimate that we'll be within 2 bytes for most chars most of the |
445 | 13.6k | * time, and give ourselves 4 bytes breathing space. */ |
446 | 13.6k | result_limit = 2 * length; |
447 | 13.6k | result = MVM_malloc(result_limit + 4); |
448 | 13.6k | result_pos = 0; |
449 | 13.6k | |
450 | 13.6k | /* Iterate the codepoints and encode them. */ |
451 | 13.6k | MVM_string_ci_init(tc, &ci, str, translate_newlines); |
452 | 1.12M | while (MVM_string_ci_has_more(tc, &ci)) { |
453 | 1.10M | MVMint32 bytes; |
454 | 1.10M | MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci); |
455 | 1.10M | if (result_pos >= result_limit) { |
456 | 4 | result_limit *= 2; |
457 | 4 | result = MVM_realloc(result, result_limit + 4); |
458 | 4 | } |
459 | 1.10M | bytes = utf8_encode(result + result_pos, cp); |
460 | 1.10M | if (bytes) |
461 | 1.10M | result_pos += bytes; |
462 | 0 | else if (replacement) { |
463 | 0 | if (repl_length >= result_limit || result_pos >= result_limit - repl_length) { |
464 | 0 | result_limit += repl_length; |
465 | 0 | result = MVM_realloc(result, result_limit + 4); |
466 | 0 | } |
467 | 0 | memcpy(result + result_pos, repl_bytes, repl_length); |
468 | 0 | result_pos += repl_length; |
469 | 0 | } |
470 | 0 | else { |
471 | 0 | MVM_free(result); |
472 | 0 | MVM_free(repl_bytes); |
473 | 0 | MVM_string_utf8_throw_encoding_exception(tc, cp); |
474 | 0 | } |
475 | 1.10M | } |
476 | 13.6k | |
477 | 13.6k | if (output_size) |
478 | 13.6k | *output_size = (MVMuint64)result_pos; |
479 | 13.6k | MVM_free(repl_bytes); |
480 | 13.6k | return (char *)result; |
481 | 13.6k | } |
482 | | |
483 | | /* Encodes the specified string to UTF-8. */ |
484 | | char * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, |
485 | 2.17k | MVMint32 translate_newlines) { |
486 | 2.17k | return MVM_string_utf8_encode_substr(tc, str, output_size, 0, -1, NULL, |
487 | 2.17k | translate_newlines); |
488 | 2.17k | } |
489 | | |
490 | | /* Encodes the specified string to a UTF-8 C string. */ |
491 | 1.96k | char * MVM_string_utf8_encode_C_string(MVMThreadContext *tc, MVMString *str) { |
492 | 1.96k | MVMuint64 output_size; |
493 | 1.96k | char * result; |
494 | 1.96k | char * utf8_string = MVM_string_utf8_encode(tc, str, &output_size, 0); |
495 | 1.96k | /* this is almost always called from error-handling code. Don't care if it |
496 | 1.96k | * contains embedded NULs. XXX TODO: Make sure all uses of this free what it returns */ |
497 | 1.96k | result = MVM_malloc(output_size + 1); |
498 | 1.96k | memcpy(result, utf8_string, output_size); |
499 | 1.96k | MVM_free(utf8_string); |
500 | 1.96k | result[output_size] = (char)0; |
501 | 1.96k | return result; |
502 | 1.96k | } |
503 | | |
504 | 0 | void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) { |
505 | 0 | const char *gencat = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY); |
506 | 0 | if(cp > 0x10FFFF) { |
507 | 0 | MVM_exception_throw_adhoc(tc, |
508 | 0 | "Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)", |
509 | 0 | cp, cp, 0x10FFFF, 0x10FFFF); |
510 | 0 | } |
511 | 0 | else if (strcmp("Cs", gencat) == 0) { |
512 | 0 | MVM_exception_throw_adhoc(tc, |
513 | 0 | "Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)", |
514 | 0 | cp, cp); |
515 | 0 | } |
516 | 0 | else { |
517 | 0 | MVM_exception_throw_adhoc(tc, |
518 | 0 | "Error encoding UTF-8 string: could not encode codepoint %d (0x%X)", |
519 | 0 | cp, cp); |
520 | 0 | } |
521 | 0 | } |