/home/travis/build/MoarVM/MoarVM/src/strings/utf8.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* The below section has an MIT-style license, included here. |
4 | | |
5 | | // Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de> |
6 | | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. |
7 | | * |
8 | | * Permission is hereby granted, free of charge, to any person obtaining |
9 | | * a copy of this software and associated documentation files (the |
10 | | * "Software"), to deal in the Software without restriction, including |
11 | | * without limitation the rights to use, copy, modify, merge, publish, |
12 | | * distribute, sublicense, and/or sell copies of the Software, and to |
13 | | * permit persons to whom the Software is furnished to do so, subject |
14 | | * to the following conditions: |
15 | | * |
16 | | * The above copyright notice and this permission notice shall be |
17 | | * included in all copies or substantial portions of the Software. |
18 | | * |
19 | | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
20 | | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
21 | | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
22 | | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS |
23 | | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN |
24 | | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN |
25 | | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
26 | | * SOFTWARE. |
27 | | */ |
/* State values of the decoding automaton. decode_utf8_byte() compares the
 * current state against UTF8_ACCEPT, and callers switch on its return value
 * against UTF8_ACCEPT and UTF8_REJECT. */
28 | 1.82M | #define UTF8_ACCEPT 0 |
29 | 0 | #define UTF8_REJECT 12 |
30 | | |
31 | | static const MVMuint8 utf8d[] = { |
32 | | // The first part of the table (256 entries) maps bytes to character classes |
33 | | // to reduce the size of the transition table and create bitmasks. |
34 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
35 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
36 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
37 | | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
38 | | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, |
39 | | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
40 | | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, |
41 | | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, |
42 | | |
43 | | // The second part is a transition table that maps a combination |
44 | | // of a state of the automaton and a character class to a state; it is read as utf8d[256 + state + class]. |
45 | | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, |
46 | | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, |
47 | | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, |
48 | | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, |
49 | | 12,36,12,12,12,12,12,12,12,12,12,12, |
50 | | }; |
51 | | |
52 | | MVM_STATIC_INLINE MVMint32 |
53 | 956k | decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) { |
54 | 956k | const MVMint32 type = utf8d[byte]; |
55 | 956k | |
56 | 956k | *codep = (*state != UTF8_ACCEPT) ? |
57 | 118k | (byte & 0x3fu) | (*codep << 6) : |
58 | 837k | (0xff >> type) & (byte); |
59 | 956k | |
60 | 956k | *state = utf8d[256 + *state + type]; |
61 | 956k | return *state; |
62 | 956k | } |
63 | | /* end Bjoern Hoehrmann section (some things were changed from the original) */ |
64 | | |
65 | | /* begin not_gerd section (modified from original) |
66 | | // Copyright 2012 not_gerd |
67 | | // see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122 |
68 | | |
69 | | Permission is granted to use, modify, and / or redistribute at will. |
70 | | |
71 | | This includes removing authorship notices, re-use of code parts in |
72 | | other software (with or without giving credit), and / or creating a |
73 | | commercial product based on it. |
74 | | |
75 | | This permission is not revocable by the author. |
76 | | |
77 | | This software is provided as-is. Use it at your own risk. There is |
78 | | no warranty whatsoever, neither expressed nor implied, and by using |
79 | | this software you accept that the author(s) shall not be held liable |
80 | | for any loss of data, loss of service, or other damages, be they |
81 | | incidental or consequential. Your only option other than accepting |
82 | | this is not to use the software at all. |
83 | | */ |
84 | | |
/* Bit flags returned by classify(). The CP_* bits describe the kind of
 * codepoint; the U8_* bits give the length of its UTF-8 encoding. */
enum {
    CP_CHAR           = 0x001,
    CP_LOW_SURROGATE  = 0x002,
    CP_HIGH_SURROGATE = 0x004,
    CP_NONCHAR        = 0x008,
    CP_OVERFLOW       = 0x010,

    U8_SINGLE         = 0x020,
    U8_DOUBLE         = 0x040,
    U8_TRIPLE         = 0x080,
    U8_QUAD           = 0x100
};
97 | | |
98 | 1.35M | static unsigned classify(MVMCodepoint cp) { |
99 | 1.35M | if(cp <= 0x7F) |
100 | 1.35M | return CP_CHAR | U8_SINGLE; |
101 | 1.35M | |
102 | 3.98k | if(cp <= 0x07FF) |
103 | 3.13k | return CP_CHAR | U8_DOUBLE; |
104 | 3.98k | |
105 | 858 | if(0xD800 <= cp && cp <= 0xDBFF) |
106 | 0 | return CP_HIGH_SURROGATE | U8_TRIPLE; |
107 | 858 | |
108 | 858 | if(0xDC00 <= cp && cp <= 0xDFFF) |
109 | 0 | return CP_LOW_SURROGATE | U8_TRIPLE; |
110 | 858 | |
111 | 858 | if(0xFDD0 <= cp && cp <= 0xFDEF) |
112 | 0 | return CP_NONCHAR | U8_TRIPLE; |
113 | 858 | |
114 | 858 | if(cp <= 0xFFFD) |
115 | 845 | return CP_CHAR | U8_TRIPLE; |
116 | 858 | |
117 | 13 | if(cp == 0xFFFE || cp == 0xFFFF) |
118 | 0 | return CP_NONCHAR | U8_TRIPLE; |
119 | 13 | |
120 | 13 | if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF)) |
121 | 0 | return CP_NONCHAR | U8_QUAD; |
122 | 13 | |
123 | 13 | if(cp <= 0x10FFFF) |
124 | 13 | return CP_CHAR | U8_QUAD; |
125 | 13 | |
126 | 0 | if(cp <= 0x1FFFFF) |
127 | 0 | return CP_OVERFLOW | U8_QUAD; |
128 | 0 |
|
129 | 0 | return 0; |
130 | 0 | } |
131 | | |
132 | 1.35M | static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) { |
133 | 1.35M | unsigned cc = classify(cp); |
134 | 1.35M | |
135 | 1.35M | if (!(cc & (CP_CHAR | CP_NONCHAR))) |
136 | 0 | return 0; |
137 | 1.35M | |
138 | 1.35M | if (cc & U8_SINGLE) { |
139 | 1.35M | bp[0] = (MVMuint8)cp; |
140 | 1.35M | return 1; |
141 | 1.35M | } |
142 | 1.35M | |
143 | 3.98k | if (cc & U8_DOUBLE) { |
144 | 3.13k | bp[0] = (MVMuint8)(( 6 << 5) | (cp >> 6)); |
145 | 3.13k | bp[1] = (MVMuint8)(( 2 << 6) | (cp & 0x3F)); |
146 | 3.13k | return 2; |
147 | 3.13k | } |
148 | 3.98k | |
149 | 858 | if (cc & U8_TRIPLE) { |
150 | 845 | bp[0] = (MVMuint8)((14 << 4) | (cp >> 12)); |
151 | 845 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
152 | 845 | bp[2] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
153 | 845 | return 3; |
154 | 845 | } |
155 | 858 | |
156 | 13 | if (cc & U8_QUAD) { |
157 | 13 | bp[0] = (MVMuint8)((30 << 3) | (cp >> 18)); |
158 | 13 | bp[1] = (MVMuint8)(( 2 << 6) | ((cp >> 12) & 0x3F)); |
159 | 13 | bp[2] = (MVMuint8)(( 2 << 6) | ((cp >> 6) & 0x3F)); |
160 | 13 | bp[3] = (MVMuint8)(( 2 << 6) | ( cp & 0x3F)); |
161 | 13 | return 4; |
162 | 13 | } |
163 | 13 | |
164 | 0 | return 0; |
165 | 13 | } |
166 | | |
167 | | /* end not_gerd section */ |
168 | | |
169 | 0 | #define UTF8_MAXINC (32 * 1024 * 1024) |
170 | | |
171 | | /* Decodes the specified number of bytes of utf8 into an NFG string, creating |
172 | | * a result of the specified type. The type must have the MVMString REPR. */ |
173 | 27.7k | MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) { |
174 | 27.7k | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
175 | 27.7k | MVMint32 count = 0; |
176 | 27.7k | MVMCodepoint codepoint; |
177 | 27.7k | MVMint32 line_ending = 0; |
178 | 27.7k | MVMint32 state = 0; |
179 | 27.7k | MVMint32 bufsize = bytes; |
180 | 27.7k | MVMGrapheme32 *buffer = MVM_malloc(sizeof(MVMGrapheme32) * bufsize); |
181 | 27.7k | size_t orig_bytes; |
182 | 27.7k | const char *orig_utf8; |
183 | 27.7k | MVMint32 line; |
184 | 27.7k | MVMint32 col; |
185 | 27.7k | MVMint32 ready; |
186 | 27.7k | |
187 | 27.7k | /* Need to normalize to NFG as we decode. */ |
188 | 27.7k | MVMNormalizer norm; |
189 | 27.7k | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
190 | 27.7k | |
191 | 27.7k | orig_bytes = bytes; |
192 | 27.7k | orig_utf8 = utf8; |
193 | 27.7k | |
194 | 377k | for (; bytes; ++utf8, --bytes) { |
195 | 350k | switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8), UTF8_ACCEPT)) { |
196 | 231k | case UTF8_ACCEPT: { /* got a codepoint */ |
197 | 231k | MVMGrapheme32 g; |
198 | 231k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g); |
199 | 231k | if (ready) { |
200 | 203k | while (count + ready > bufsize) { /* if the buffer's full make a bigger one */ |
201 | 0 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * ( |
202 | 0 | bufsize >= UTF8_MAXINC ? (bufsize += UTF8_MAXINC) : (bufsize *= 2) |
203 | 0 | )); |
204 | 0 | } |
205 | 203k | buffer[count++] = g; |
206 | 203k | while (--ready > 0) { |
207 | 340 | buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
208 | 340 | } |
209 | 203k | } |
210 | 231k | break; |
211 | 231k | } |
212 | 0 | case UTF8_REJECT: |
213 | 0 | /* found a malformed sequence; parse it again this time tracking |
214 | 0 | * line and col numbers. */ |
215 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); /* Since we'll throw. */ |
216 | 0 | bytes = orig_bytes; utf8 = orig_utf8; state = 0; line = 1; col = 1; |
217 | 0 | for (; bytes; ++utf8, --bytes) { |
218 | 0 | switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) { |
219 | 0 | case UTF8_ACCEPT: |
220 | 0 | /* this could be reorganized into several nested ugly if/else :/ */ |
221 | 0 | if (!line_ending && (codepoint == 10 || codepoint == 13)) { |
222 | 0 | /* Detect the style of line endings. |
223 | 0 | * Select whichever comes first. |
224 | 0 | * First or only part of first line ending. */ |
225 | 0 | line_ending = codepoint; |
226 | 0 | col = 1; line++; |
227 | 0 | } |
228 | 0 | else if (line_ending && codepoint == line_ending) { |
229 | 0 | /* first or only part of next line ending */ |
230 | 0 | col = 1; line++; |
231 | 0 | } |
232 | 0 | else if (codepoint == 10 || codepoint == 13) { |
233 | 0 | /* second part of line ending; ignore */ |
234 | 0 | } |
235 | 0 | else /* non-line ending codepoint */ |
236 | 0 | col++; |
237 | 0 | break; |
238 | 0 | case UTF8_REJECT: |
239 | 0 | MVM_free(buffer); |
240 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col); |
241 | 0 | } |
242 | 0 | } |
243 | 0 | MVM_free(buffer); |
244 | 0 | MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!"); |
245 | 0 | break; |
246 | 350k | } |
247 | 350k | } |
248 | 27.7k | if (state != UTF8_ACCEPT) { |
249 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
250 | 0 | MVM_free(buffer); |
251 | 0 | MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string"); |
252 | 0 | } |
253 | 27.7k | |
254 | 27.7k | /* Get any final graphemes from the normalizer, and clean it up. */ |
255 | 27.7k | MVM_unicode_normalizer_eof(tc, &norm); |
256 | 27.7k | ready = MVM_unicode_normalizer_available(tc, &norm); |
257 | 27.7k | if (ready) { |
258 | 27.4k | if (count + ready > bufsize) { |
259 | 0 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready)); |
260 | 0 | } |
261 | 54.9k | while (ready--) { |
262 | 27.4k | buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
263 | 27.4k | } |
264 | 27.4k | } |
265 | 27.7k | MVM_unicode_normalizer_cleanup(tc, &norm); |
266 | 27.7k | |
267 | 27.7k | /* If we're lucky, we can fit our string in 8 bits per grapheme. */ |
268 | 27.7k | if (MVM_string_buf32_can_fit_into_8bit(buffer, count)) { |
269 | 21.2k | MVMGrapheme8 *new_buffer = MVM_malloc(sizeof(MVMGrapheme8) * count); |
270 | 21.2k | MVM_VECTORIZE_LOOP |
271 | 191k | for (ready = 0; ready < count; ready++) { |
272 | 169k | new_buffer[ready] = buffer[ready]; |
273 | 169k | } |
274 | 21.2k | MVM_free(buffer); |
275 | 21.2k | result->body.storage.blob_8 = new_buffer; |
276 | 21.2k | result->body.storage_type = MVM_STRING_GRAPHEME_8; |
277 | 6.51k | } else { |
278 | 6.51k | /* just keep the same buffer as the MVMString's buffer. Later |
279 | 6.51k | * we can add heuristics to resize it if we have enough free |
280 | 6.51k | * memory */ |
281 | 6.51k | if (bufsize - count > 4) { |
282 | 5.94k | buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32)); |
283 | 5.94k | } |
284 | 6.51k | result->body.storage.blob_32 = buffer; |
285 | 6.51k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
286 | 6.51k | } |
287 | 27.7k | result->body.num_graphs = count; |
288 | 27.7k | |
289 | 27.7k | return result; |
290 | 27.7k | } |
291 | | |
292 | 198 | static MVMint32 its_the_bom(const char *utf8) { |
293 | 198 | const MVMuint8 *uns_utf8 = (const MVMuint8 *)utf8; |
294 | 0 | return uns_utf8[0] == 0xEF && uns_utf8[1] == 0xBB && uns_utf8[2] == 0xBF; |
295 | 198 | } |
296 | | |
297 | | /* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */ |
298 | 6 | MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) { |
299 | 6 | if (bytes >= 3 && its_the_bom(utf8)) { |
300 | 0 | utf8 += 3; |
301 | 0 | bytes -= 3; |
302 | 0 | } |
303 | 6 | return MVM_string_utf8_decode(tc, result_type, utf8, bytes); |
304 | 6 | } |
305 | | |
306 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
307 | | * buffers, or until a stopper is reached. */ |
308 | | MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
309 | | const MVMint32 *stopper_chars, |
310 | 271 | MVMDecodeStreamSeparators *seps) { |
311 | 271 | MVMint32 count = 0, total = 0; |
312 | 271 | MVMint32 state = 0; |
313 | 271 | MVMCodepoint codepoint = 0; |
314 | 271 | MVMCodepoint lag_codepoint = -1; |
315 | 271 | MVMint32 bufsize; |
316 | 271 | MVMGrapheme32 *buffer = NULL; |
317 | 271 | MVMDecodeStreamBytes *cur_bytes = NULL; |
318 | 271 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
319 | 271 | MVMDecodeStreamBytes *lag_last_accept_bytes = NULL; |
320 | 271 | MVMint32 last_accept_pos, lag_last_accept_pos, ready, at_start; |
321 | 271 | MVMuint32 reached_stopper; |
322 | 271 | MVMuint32 can_fast_path; |
323 | 271 | |
324 | 271 | /* If there's no buffers, we're done. */ |
325 | 271 | if (!ds->bytes_head) |
326 | 28 | return 0; |
327 | 243 | last_accept_pos = ds->bytes_head_pos; |
328 | 243 | |
329 | 243 | /* If we're asked for zero chars, also done. */ |
330 | 243 | if (stopper_chars && *stopper_chars == 0) |
331 | 0 | return 1; |
332 | 243 | |
333 | 243 | /* If there's nothing hanging around in the normalization buffer, we can |
334 | 243 | * use the fast path. */ |
335 | 243 | can_fast_path = MVM_unicode_normalizer_empty(tc, &(ds->norm)); |
336 | 243 | |
337 | 243 | bufsize = ds->result_size_guess; |
338 | 243 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
339 | 243 | |
340 | 243 | /* Decode each of the buffers. */ |
341 | 243 | cur_bytes = ds->bytes_head; |
342 | 243 | at_start = ds->abs_byte_pos == 0; |
343 | 243 | reached_stopper = 0; |
344 | 449 | while (cur_bytes) { |
345 | 248 | /* Process this buffer. */ |
346 | 243 | MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
347 | 248 | char *bytes = cur_bytes->bytes; |
348 | 248 | if (at_start) { |
349 | 200 | /* We're right at the start of the stream of things to decode. See |
350 | 200 | * if we have a BOM, and skip over it if so. */ |
351 | 200 | if (pos + 3 <= cur_bytes->length) { |
352 | 195 | if (its_the_bom(bytes + pos)) { |
353 | 0 | pos += 3; |
354 | 0 | last_accept_bytes = cur_bytes; |
355 | 0 | last_accept_pos = pos; |
356 | 0 | } |
357 | 195 | } |
358 | 200 | at_start = 0; |
359 | 200 | } |
360 | 248 | |
361 | 248 | /* We have both a fast path and a slow path for UTF-8 decoding. The |
362 | 248 | * fast path covers the common case where we have no chars that are |
363 | 248 | * significant to normalization, and so we can skip the normalizer. |
364 | 248 | * This is true of the ASCII and Latin-1 ranges of UTF-8, with the |
365 | 248 | * exception of \r. Note that since the following codepoint may be |
366 | 248 | * the one that causes us to need to compose, we need a lag of 1 |
367 | 248 | * codepoint. */ |
368 | 248 | if (can_fast_path) { |
369 | 240 | /* Lift the no lag codepoint case out of the hot loop below, |
370 | 240 | * to save on a couple of branches. */ |
371 | 240 | MVMCodepoint first_significant = ds->norm.first_significant; |
372 | 480 | while (lag_codepoint == -1 && pos < cur_bytes->length) { |
373 | 242 | switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) { |
374 | 236 | case UTF8_ACCEPT: { |
375 | 236 | if (codepoint == '\r' || codepoint >= first_significant) { |
376 | 2 | can_fast_path = 0; |
377 | 2 | last_accept_bytes = cur_bytes; |
378 | 2 | last_accept_pos = pos; |
379 | 2 | goto slow_path; |
380 | 2 | } |
381 | 234 | lag_codepoint = codepoint; |
382 | 234 | lag_last_accept_bytes = cur_bytes; |
383 | 234 | lag_last_accept_pos = pos; |
384 | 234 | break; |
385 | 236 | } |
386 | 0 | case UTF8_REJECT: |
387 | 0 | MVM_free(buffer); |
388 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8"); |
389 | 0 | break; |
390 | 242 | } |
391 | 242 | } |
392 | 240 | |
393 | 566k | while (pos < cur_bytes->length) { |
394 | 566k | switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) { |
395 | 566k | case UTF8_ACCEPT: { |
396 | 566k | /* If we hit something that needs the normalizer, we put |
397 | 566k | * any lagging codepoint into its buffer and jump to it. */ |
398 | 566k | if (codepoint == '\r' || codepoint >= first_significant) { |
399 | 20 | MVM_unicode_normalizer_push_codepoints(tc, &(ds->norm), |
400 | 20 | &lag_codepoint, 1); |
401 | 20 | lag_codepoint = -1; /* Invalidate, we used it. */ |
402 | 20 | can_fast_path = 0; |
403 | 20 | last_accept_bytes = cur_bytes; |
404 | 20 | last_accept_pos = pos; |
405 | 20 | goto slow_path; |
406 | 20 | } |
407 | 566k | |
408 | 566k | /* As we have a lagging codepoint, and this one does not |
409 | 566k | * need normalization, then we know we can spit out the |
410 | 566k | * lagging one. */ |
411 | 566k | if (count == bufsize) { |
412 | 8.76k | /* Valid character, but we filled the buffer. Attach this |
413 | 8.76k | * one to the buffers linked list, and continue with a new |
414 | 8.76k | * one. */ |
415 | 8.76k | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
416 | 8.76k | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
417 | 8.76k | count = 0; |
418 | 8.76k | } |
419 | 566k | buffer[count++] = lag_codepoint; |
420 | 566k | total++; |
421 | 566k | if (MVM_string_decode_stream_maybe_sep(tc, seps, lag_codepoint) || |
422 | 566k | stopper_chars && *stopper_chars == total) { |
423 | 32 | reached_stopper = 1; |
424 | 32 | last_accept_bytes = lag_last_accept_bytes; |
425 | 32 | last_accept_pos = lag_last_accept_pos; |
426 | 32 | goto done; |
427 | 32 | } |
428 | 566k | |
429 | 566k | /* The current state becomes the lagged state. */ |
430 | 566k | lag_codepoint = codepoint; |
431 | 566k | lag_last_accept_bytes = cur_bytes; |
432 | 566k | lag_last_accept_pos = pos; |
433 | 566k | break; |
434 | 566k | } |
435 | 0 | case UTF8_REJECT: |
436 | 0 | MVM_free(buffer); |
437 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8"); |
438 | 0 | break; |
439 | 566k | } |
440 | 566k | } |
441 | 238 | |
442 | 238 | /* If we fall out of the loop and have a lagged codepoint, but |
443 | 238 | * no next buffer, then we fall into the slow path to process it |
444 | 238 | * correctly. */ |
445 | 186 | if (lag_codepoint != -1 && !cur_bytes->next) { |
446 | 182 | codepoint = lag_codepoint; |
447 | 182 | lag_codepoint = -1; |
448 | 182 | can_fast_path = 0; |
449 | 182 | last_accept_bytes = lag_last_accept_bytes; |
450 | 182 | last_accept_pos = lag_last_accept_pos; |
451 | 182 | goto slow_path; |
452 | 182 | } |
453 | 186 | } |
454 | 8 | else { |
455 | 39.3k | while (pos < cur_bytes->length) { |
456 | 39.1k | switch(MVM_EXPECT(decode_utf8_byte(&state, &codepoint, bytes[pos++]), UTF8_ACCEPT)) { |
457 | 38.7k | case UTF8_ACCEPT: { |
458 | 38.7k | MVMGrapheme32 g; |
459 | 38.7k | MVMint32 first; |
460 | 38.7k | last_accept_bytes = cur_bytes; |
461 | 38.7k | last_accept_pos = pos; |
462 | 39.0k | slow_path: |
463 | 39.0k | first = 1; |
464 | 39.0k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, |
465 | 39.0k | &(ds->norm), codepoint, &g); |
466 | 77.9k | while (ready--) { |
467 | 38.9k | if (first) |
468 | 37.8k | first = 0; |
469 | 38.9k | else |
470 | 1.14k | g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm)); |
471 | 38.9k | if (count == bufsize) { |
472 | 612 | /* Valid character, but we filled the buffer. Attach this |
473 | 612 | * one to the buffers linked list, and continue with a new |
474 | 612 | * one. */ |
475 | 612 | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
476 | 612 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
477 | 612 | count = 0; |
478 | 612 | } |
479 | 38.9k | buffer[count++] = g; |
480 | 38.9k | total++; |
481 | 38.9k | if (MVM_string_decode_stream_maybe_sep(tc, seps, g) || |
482 | 38.9k | stopper_chars && *stopper_chars == total) { |
483 | 10 | reached_stopper = 1; |
484 | 10 | goto done; |
485 | 10 | } |
486 | 38.9k | } |
487 | 38.9k | break; |
488 | 39.0k | } |
489 | 0 | case UTF8_REJECT: |
490 | 0 | MVM_free(buffer); |
491 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-8"); |
492 | 0 | break; |
493 | 39.1k | } |
494 | 39.1k | } |
495 | 8 | } |
496 | 206 | cur_bytes = cur_bytes->next; |
497 | 206 | } |
498 | 243 | done: |
499 | 243 | |
500 | 243 | /* Attach what we successfully parsed as a result buffer, and trim away |
501 | 243 | * what we chewed through. */ |
502 | 243 | if (count) { |
503 | 239 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
504 | 239 | } |
505 | 4 | else { |
506 | 4 | MVM_free(buffer); |
507 | 4 | } |
508 | 243 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
509 | 243 | |
510 | 243 | return reached_stopper; |
511 | 243 | } |
512 | | |
513 | | /* Encodes the specified string to UTF-8. */ |
514 | | char * MVM_string_utf8_encode_substr(MVMThreadContext *tc, |
515 | | MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, |
516 | 16.0k | MVMString *replacement, MVMint32 translate_newlines) { |
517 | 16.0k | MVMuint8 *result = NULL; |
518 | 16.0k | size_t result_pos, result_limit; |
519 | 16.0k | MVMCodepointIter ci; |
520 | 16.0k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
521 | 16.0k | MVMuint8 *repl_bytes = NULL; |
522 | 16.0k | MVMuint64 repl_length; |
523 | 16.0k | |
524 | 16.0k | if (start < 0 || start > strgraphs) |
525 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
526 | 16.0k | if (length == -1) |
527 | 3.02k | length = strgraphs; |
528 | 16.0k | if (length < 0 || start + length > strgraphs) |
529 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
530 | 16.0k | |
531 | 16.0k | if (replacement) |
532 | 0 | repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc, |
533 | 0 | replacement, &repl_length, 0, -1, NULL, translate_newlines); |
534 | 16.0k | |
535 | 16.0k | /* Guesstimate that we'll be within 2 bytes for most chars most of the |
536 | 16.0k | * time, and give ourselves 4 bytes breathing space. */ |
537 | 16.0k | result_limit = 2 * length; |
538 | 16.0k | result = MVM_malloc(result_limit + 4); |
539 | 16.0k | result_pos = 0; |
540 | 16.0k | |
541 | 16.0k | /* Iterate the codepoints and encode them. */ |
542 | 16.0k | MVM_string_ci_init(tc, &ci, str, translate_newlines, 0); |
543 | 1.37M | while (MVM_string_ci_has_more(tc, &ci)) { |
544 | 1.35M | MVMint32 bytes; |
545 | 1.35M | MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci); |
546 | 1.35M | if (result_pos >= result_limit) { |
547 | 9 | result_limit *= 2; |
548 | 9 | result = MVM_realloc(result, result_limit + 4); |
549 | 9 | } |
550 | 1.35M | bytes = utf8_encode(result + result_pos, cp); |
551 | 1.35M | if (bytes) |
552 | 1.35M | result_pos += bytes; |
553 | 0 | else if (replacement) { |
554 | 0 | if (repl_length >= result_limit || result_pos >= result_limit - repl_length) { |
555 | 0 | result_limit += repl_length; |
556 | 0 | result = MVM_realloc(result, result_limit + 4); |
557 | 0 | } |
558 | 0 | memcpy(result + result_pos, repl_bytes, repl_length); |
559 | 0 | result_pos += repl_length; |
560 | 0 | } |
561 | 0 | else { |
562 | 0 | MVM_free(result); |
563 | 0 | MVM_free(repl_bytes); |
564 | 0 | MVM_string_utf8_throw_encoding_exception(tc, cp); |
565 | 0 | } |
566 | 1.35M | } |
567 | 16.0k | |
568 | 16.0k | if (output_size) |
569 | 16.0k | *output_size = (MVMuint64)result_pos; |
570 | 16.0k | MVM_free(repl_bytes); |
571 | 16.0k | return (char *)result; |
572 | 16.0k | } |
573 | | |
574 | | /* Encodes the specified string to UTF-8. */ |
575 | | char * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, |
576 | 3.02k | MVMint32 translate_newlines) { |
577 | 3.02k | return MVM_string_utf8_encode_substr(tc, str, output_size, 0, -1, NULL, |
578 | 3.02k | translate_newlines); |
579 | 3.02k | } |
580 | | |
581 | | /* Encodes the specified string to a UTF-8 C string. */ |
582 | 2.46k | char * MVM_string_utf8_encode_C_string(MVMThreadContext *tc, MVMString *str) { |
583 | 2.46k | MVMuint64 output_size; |
584 | 2.46k | char * result = NULL; |
585 | 2.46k | char * utf8_string = MVM_string_utf8_encode(tc, str, &output_size, 0); |
586 | 2.46k | /* this is almost always called from error-handling code. Don't care if it |
587 | 2.46k | * contains embedded NULs. XXX TODO: Make sure all uses of this free what it returns */ |
588 | 2.46k | result = MVM_malloc(output_size + 1); |
589 | 2.46k | memcpy(result, utf8_string, output_size); |
590 | 2.46k | MVM_free(utf8_string); |
591 | 2.46k | result[output_size] = (char)0; |
592 | 2.46k | return result; |
593 | 2.46k | } |
594 | | |
595 | | /* Encodes the specified string to a UTF-8 C string if it is not NULL. */ |
596 | 0 | char * MVM_string_utf8_maybe_encode_C_string(MVMThreadContext *tc, MVMString *str) { |
597 | 0 | return str ? MVM_string_utf8_encode_C_string(tc, str) : NULL; |
598 | 0 | } |
599 | | |
600 | 0 | void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) { |
601 | 0 | const char *gencat = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY); |
602 | 0 | if(cp > 0x10FFFF) { |
603 | 0 | MVM_exception_throw_adhoc(tc, |
604 | 0 | "Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)", |
605 | 0 | cp, cp, 0x10FFFF, 0x10FFFF); |
606 | 0 | } |
607 | 0 | else if (strcmp("Cs", gencat) == 0) { |
608 | 0 | MVM_exception_throw_adhoc(tc, |
609 | 0 | "Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)", |
610 | 0 | cp, cp); |
611 | 0 | } |
612 | 0 | else { |
613 | 0 | MVM_exception_throw_adhoc(tc, |
614 | 0 | "Error encoding UTF-8 string: could not encode codepoint %d (0x%X)", |
615 | 0 | cp, cp); |
616 | 0 | } |
617 | 0 | } |