/home/travis/build/MoarVM/MoarVM/src/strings/decode_stream.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* A decode stream represents an on-going decoding process, from bytes into |
4 | | * characters. Bytes can be contributed to the decode stream, and chars can be |
5 | | * obtained. Byte buffers and decoded char buffers are kept in linked lists. |
6 | | * Note that characters may start at the end of one byte buffer and finish in |
7 | | * the next, which is taken care of by the logic in here and the decoders |
8 | | * themselves. Additionally, normalization may be applied using the normalizer |
9 | | * in the decode stream, at the discretion of the encoding in question (some, |
10 | | * such as ASCII and Latin-1, are normalized by definition). |
11 | | */ |
12 | | |
13 | 85 | #define DECODE_NOT_EOF 0 |
14 | 188 | #define DECODE_EOF 1 |
15 | | |
16 | | /* Creates a new decoding stream. */ |
17 | | MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, |
18 | 661 | MVMint64 abs_byte_pos, MVMint32 translate_newlines) { |
19 | 661 | MVMDecodeStream *ds = MVM_calloc(1, sizeof(MVMDecodeStream)); |
20 | 661 | ds->encoding = encoding; |
21 | 661 | ds->abs_byte_pos = abs_byte_pos; |
22 | 661 | MVM_unicode_normalizer_init(tc, &(ds->norm), MVM_NORMALIZE_NFG); |
23 | 661 | if (translate_newlines) |
24 | 647 | MVM_unicode_normalizer_translate_newlines(tc, &(ds->norm)); |
25 | 661 | ds->result_size_guess = 64; |
26 | 661 | return ds; |
27 | 661 | } |
28 | | |
29 | | /* Adds another byte buffer into the decoding stream. */ |
30 | 213 | void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, char *bytes, MVMint32 length) { |
31 | 213 | if (length > 0) { |
32 | 212 | MVMDecodeStreamBytes *new_bytes = MVM_calloc(1, sizeof(MVMDecodeStreamBytes)); |
33 | 212 | new_bytes->bytes = bytes; |
34 | 212 | new_bytes->length = length; |
35 | 212 | if (ds->bytes_tail) |
36 | 7 | ds->bytes_tail->next = new_bytes; |
37 | 212 | ds->bytes_tail = new_bytes; |
38 | 212 | if (!ds->bytes_head) |
39 | 205 | ds->bytes_head = new_bytes; |
40 | 212 | } |
41 | 1 | else { |
42 | 1 | /* It's empty, so free the buffer right away and don't add. */ |
43 | 1 | MVM_free(bytes); |
44 | 1 | } |
45 | 213 | } |
46 | | |
47 | | /* Adds another char result buffer into the decoding stream. */ |
48 | 9.64k | void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length) { |
49 | 9.64k | MVMDecodeStreamChars *new_chars; |
50 | 9.64k | if (ds->chars_reuse) { |
51 | 28 | new_chars = ds->chars_reuse; |
52 | 28 | ds->chars_reuse = NULL; |
53 | 28 | } |
54 | 9.61k | else { |
55 | 9.61k | new_chars = MVM_malloc(sizeof(MVMDecodeStreamChars)); |
56 | 9.61k | } |
57 | 9.64k | new_chars->chars = chars; |
58 | 9.64k | new_chars->length = length; |
59 | 9.64k | new_chars->next = NULL; |
60 | 9.64k | if (ds->chars_tail) |
61 | 9.41k | ds->chars_tail->next = new_chars; |
62 | 9.64k | ds->chars_tail = new_chars; |
63 | 9.64k | if (!ds->chars_head) |
64 | 230 | ds->chars_head = new_chars; |
65 | 9.64k | } |
66 | | |
67 | | /* Internal function to free a chars result structure, putting it into the |
68 | | * re-use slot if it's empty. */ |
69 | 9.64k | static void free_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamChars *chars) { |
70 | 9.64k | if (ds->chars_reuse) |
71 | 9.41k | MVM_free(chars); |
72 | 9.64k | else |
73 | 230 | ds->chars_reuse = chars; |
74 | 9.64k | } |
75 | | |
76 | | /* Throws away byte buffers no longer needed. */ |
77 | 245 | void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos) { |
78 | 251 | while (ds->bytes_head != bytes) { |
79 | 6 | MVMDecodeStreamBytes *discard = ds->bytes_head; |
80 | 6 | ds->abs_byte_pos += discard->length - ds->bytes_head_pos; |
81 | 6 | ds->bytes_head = discard->next; |
82 | 6 | ds->bytes_head_pos = 0; |
83 | 6 | MVM_free(discard->bytes); |
84 | 6 | MVM_free(discard); |
85 | 6 | } |
86 | 245 | if (!ds->bytes_head) { |
87 | 0 | if (MVM_LIKELY(pos == 0)) |
88 | 0 | return; |
89 | 0 | /* Guard against null pointer dereference below. */ |
90 | 0 | else |
91 | 0 | MVM_exception_throw_adhoc(tc, |
92 | 0 | "Unknown error encountered in MVM_string_decodestream_discard_to"); |
93 | 0 | } |
94 | 245 | if (ds->bytes_head->length == pos) { |
95 | 203 | /* We ate all of the new head buffer too; also free it. */ |
96 | 203 | MVMDecodeStreamBytes *discard = ds->bytes_head; |
97 | 203 | ds->abs_byte_pos += discard->length - ds->bytes_head_pos; |
98 | 203 | ds->bytes_head = discard->next; |
99 | 203 | ds->bytes_head_pos = 0; |
100 | 203 | MVM_free(discard->bytes); |
101 | 203 | MVM_free(discard); |
102 | 203 | if (ds->bytes_head == NULL) |
103 | 203 | ds->bytes_tail = NULL; |
104 | 203 | } |
105 | 42 | else { |
106 | 42 | ds->abs_byte_pos += pos - ds->bytes_head_pos; |
107 | 42 | ds->bytes_head_pos = pos; |
108 | 42 | } |
109 | 245 | } |
110 | | |
111 | | /* Does a decode run, selected by encoding. Returns one of the RUN_DECODE_* |
112 | | * values below, which is non-zero if we actually decoded more chars. */ |
113 | 109 | #define RUN_DECODE_NOTHING_DECODED 0 |
114 | 199 | #define RUN_DECODE_STOPPER_NOT_REACHED 1 |
115 | 92 | #define RUN_DECODE_STOPPER_REACHED 2 |
116 | 273 | static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec, MVMint32 eof) { |
117 | 273 | MVMDecodeStreamChars *prev_chars_tail = ds->chars_tail; |
118 | 273 | MVMuint32 reached_stopper; |
119 | 273 | switch (ds->encoding) { |
120 | 271 | case MVM_encoding_type_utf8: |
121 | 271 | reached_stopper = MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec); |
122 | 271 | break; |
123 | 1 | case MVM_encoding_type_ascii: |
124 | 1 | reached_stopper = MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec); |
125 | 1 | break; |
126 | 1 | case MVM_encoding_type_latin1: |
127 | 1 | reached_stopper = MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec); |
128 | 1 | break; |
129 | 0 | case MVM_encoding_type_windows1252: |
130 | 0 | reached_stopper = MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec); |
131 | 0 | break; |
132 | 0 | case MVM_encoding_type_windows1251: |
133 | 0 | reached_stopper = MVM_string_windows1251_decodestream(tc, ds, stopper_chars, sep_spec); |
134 | 0 | break; |
135 | 0 | case MVM_encoding_type_shiftjis: |
136 | 0 | reached_stopper = MVM_string_shiftjis_decodestream(tc, ds, stopper_chars, sep_spec); |
137 | 0 | break; |
138 | 0 | case MVM_encoding_type_utf8_c8: |
139 | 0 | reached_stopper = MVM_string_utf8_c8_decodestream(tc, ds, stopper_chars, sep_spec, eof); |
140 | 0 | break; |
141 | 0 | default: |
142 | 0 | MVM_exception_throw_adhoc(tc, "Streaming decode NYI for encoding %d", |
143 | 0 | (int)ds->encoding); |
144 | 273 | } |
145 | 273 | if (ds->chars_tail == prev_chars_tail) |
146 | 32 | return RUN_DECODE_NOTHING_DECODED; |
147 | 241 | else if (reached_stopper) |
148 | 42 | return RUN_DECODE_STOPPER_REACHED; |
149 | 241 | else |
150 | 199 | return RUN_DECODE_STOPPER_NOT_REACHED; |
151 | 273 | } |
152 | | |
153 | | /* In situations where we have hit EOF, we need to decode what's left and flush |
154 | | * the normalization buffer also. */ |
155 | 217 | static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) { |
156 | 217 | /* Decode all the things. */ |
157 | 217 | if (ds->bytes_head) |
158 | 188 | run_decode(tc, ds, NULL, NULL, DECODE_EOF); |
159 | 217 | |
160 | 217 | /* If there's some things left in the normalization buffer, take them. */ |
161 | 217 | MVM_unicode_normalizer_eof(tc, &(ds->norm)); |
162 | 217 | if (MVM_unicode_normalizer_available(tc, &(ds->norm))) { |
163 | 27 | MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm)); |
164 | 27 | MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32)); |
165 | 27 | MVMint32 count = 0; |
166 | 54 | while (ready--) |
167 | 27 | buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm)); |
168 | 27 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
169 | 27 | } |
170 | 217 | } |
171 | | |
172 | | /* Gets the specified number of characters. If we are not yet able to decode |
173 | | * that many, returns NULL. This may mean more input buffers are needed. The |
174 | | * exclude parameter specifies a number of chars that should be taken from the |
175 | | * input buffer, but not included in the result string (for chomping a line |
176 | | * separator). */ |
177 | 16 | static MVMint32 missing_chars(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 wanted) { |
178 | 16 | MVMint32 got = 0; |
179 | 16 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
180 | 23 | while (cur_chars && got < wanted) { |
181 | 7 | if (cur_chars == ds->chars_head) |
182 | 7 | got += cur_chars->length - ds->chars_head_pos; |
183 | 7 | else |
184 | 0 | got += cur_chars->length; |
185 | 7 | cur_chars = cur_chars->next; |
186 | 7 | } |
187 | 11 | return got >= wanted ? 0 : wanted - got; |
188 | 16 | } |
189 | 28 | static MVMString * take_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint32 exclude) { |
190 | 28 | MVMString *result; |
191 | 28 | MVMint32 found = 0; |
192 | 28 | MVMint32 result_found = 0; |
193 | 28 | |
194 | 28 | MVMint32 result_chars = chars - exclude; |
195 | 28 | if (result_chars < 0) |
196 | 0 | MVM_exception_throw_adhoc(tc, "DecodeStream take_chars: chars - exclude < 0 should never happen"); |
197 | 28 | |
198 | 28 | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
199 | 28 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
200 | 28 | result->body.num_graphs = result_chars; |
201 | 28 | |
202 | 28 | /* In the best case, the head char buffer has exactly what we need. This |
203 | 28 | * will typically happen when in a steady state of decoding lines. */ |
204 | 28 | if (ds->chars_head->length == chars && ds->chars_head_pos == 0) { |
205 | 18 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
206 | 18 | result->body.storage.blob_32 = cur_chars->chars; |
207 | 18 | ds->chars_head = cur_chars->next; |
208 | 18 | if (ds->chars_head == NULL) |
209 | 18 | ds->chars_tail = NULL; |
210 | 18 | free_chars(tc, ds, cur_chars); |
211 | 18 | } |
212 | 28 | |
213 | 28 | /* Otherwise, need to take and copy. */ |
214 | 10 | else { |
215 | 10 | result->body.storage.blob_32 = MVM_malloc(result_chars * sizeof(MVMGrapheme32)); |
216 | 40 | while (found < chars) { |
217 | 30 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
218 | 30 | MVMint32 available = cur_chars->length - ds->chars_head_pos; |
219 | 30 | if (available <= chars - found) { |
220 | 30 | /* We need all that's left in this buffer and likely |
221 | 30 | * more. */ |
222 | 30 | MVMDecodeStreamChars *next_chars = cur_chars->next; |
223 | 30 | if (available <= result_chars - result_found) { |
224 | 23 | memcpy(result->body.storage.blob_32 + result_found, |
225 | 23 | cur_chars->chars + ds->chars_head_pos, |
226 | 23 | available * sizeof(MVMGrapheme32)); |
227 | 23 | result_found += available; |
228 | 23 | } |
229 | 7 | else { |
230 | 7 | MVMint32 to_copy = result_chars - result_found; |
231 | 7 | memcpy(result->body.storage.blob_32 + result_found, |
232 | 7 | cur_chars->chars + ds->chars_head_pos, |
233 | 7 | to_copy * sizeof(MVMGrapheme32)); |
234 | 7 | result_found += to_copy; |
235 | 7 | } |
236 | 30 | found += available; |
237 | 30 | MVM_free(cur_chars->chars); |
238 | 30 | free_chars(tc, ds, cur_chars); |
239 | 30 | ds->chars_head = next_chars; |
240 | 30 | ds->chars_head_pos = 0; |
241 | 30 | if (ds->chars_head == NULL) |
242 | 10 | ds->chars_tail = NULL; |
243 | 30 | } |
244 | 0 | else { |
245 | 0 | /* There's enough in this buffer to satisfy us, and we'll leave |
246 | 0 | * some behind. */ |
247 | 0 | MVMint32 take = chars - found; |
248 | 0 | MVMint32 to_copy = result_chars - result_found; |
249 | 0 | memcpy(result->body.storage.blob_32 + result_found, |
250 | 0 | cur_chars->chars + ds->chars_head_pos, |
251 | 0 | to_copy * sizeof(MVMGrapheme32)); |
252 | 0 | result_found += to_copy; |
253 | 0 | found += take; |
254 | 0 | ds->chars_head_pos += take; |
255 | 0 | } |
256 | 30 | } |
257 | 10 | } |
258 | 28 | return result; |
259 | 28 | } |
260 | | MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, |
261 | 8 | MVMint32 chars, MVMint64 eof) { |
262 | 8 | MVMint32 missing; |
263 | 8 | |
264 | 8 | /* If we request nothing, give empty string. */ |
265 | 8 | if (chars == 0) |
266 | 0 | return tc->instance->str_consts.empty; |
267 | 8 | |
268 | 8 | /* If we don't already have enough chars, try and decode more. */ |
269 | 8 | missing = missing_chars(tc, ds, chars); |
270 | 8 | ds->result_size_guess = missing; |
271 | 8 | if (missing) |
272 | 8 | run_decode(tc, ds, &missing, NULL, DECODE_NOT_EOF); |
273 | 8 | |
274 | 8 | /* If we've got enough, assemble a string. Otherwise, flag EOF and retry, |
275 | 8 | * falling back to returning what's available. */ |
276 | 8 | if (missing_chars(tc, ds, chars) == 0) { |
277 | 5 | return take_chars(tc, ds, chars, 0); |
278 | 5 | } |
279 | 3 | else if (eof) { |
280 | 0 | reached_eof(tc, ds); |
281 | 0 | return missing_chars(tc, ds, chars) == 0 |
282 | 0 | ? take_chars(tc, ds, chars, 0) |
283 | 0 | : MVM_string_decodestream_get_all(tc, ds); |
284 | 0 | } |
285 | 3 | else { |
286 | 3 | return NULL; |
287 | 3 | } |
288 | 8 | } |
289 | | |
290 | | /* Gets characters up until one of the specified separators is encountered. If |
291 | | * we do not encounter it, returns 0. This may mean more input buffers are needed |
292 | | * or that we reached the end of the stream. Note that it assumes the separator |
293 | | * will exist near the end of the buffer, if it occurs at all, due to decode |
294 | | * streams looking for stoppers. */ |
295 | | static MVMint32 have_separator(MVMThreadContext *tc, MVMDecodeStreamChars *start_chars, MVMint32 start_pos, |
296 | 18 | MVMDecodeStreamSeparators *sep_spec, MVMint32 sep_idx, MVMint32 sep_graph_pos) { |
297 | 18 | MVMint32 sep_pos = 1; |
298 | 18 | MVMint32 sep_length = sep_spec->sep_lengths[sep_idx]; |
299 | 18 | MVMDecodeStreamChars *cur_chars = start_chars; |
300 | 34 | while (cur_chars) { |
301 | 18 | MVMint32 start = cur_chars == start_chars ? start_pos : 0; |
302 | 24 | MVMint32 i; |
303 | 24 | for (i = start; i < cur_chars->length; i++) { |
304 | 8 | if (cur_chars->chars[i] != sep_spec->sep_graphemes[sep_graph_pos]) |
305 | 0 | return 0; |
306 | 8 | sep_pos++; |
307 | 8 | if (sep_pos == sep_length) |
308 | 8 | return 1; |
309 | 0 | sep_graph_pos++; |
310 | 0 | } |
311 | 16 | cur_chars = cur_chars->next; |
312 | 16 | } |
313 | 10 | return 0; |
314 | 18 | } |
315 | | static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds, |
316 | | MVMDecodeStreamSeparators *sep_spec, MVMint32 *sep_length, |
317 | 98 | int eof) { |
318 | 98 | MVMint32 sep_loc = 0; |
319 | 98 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
320 | 98 | |
321 | 98 | /* First, skip over any buffers we need not consider. */ |
322 | 98 | MVMint32 max_sep_length = sep_spec->max_sep_length; |
323 | 126 | while (cur_chars && cur_chars->next) { |
324 | 37 | if (cur_chars->next->length < max_sep_length) |
325 | 9 | break; |
326 | 28 | sep_loc += cur_chars->length; |
327 | 28 | cur_chars = cur_chars->next; |
328 | 28 | } |
329 | 98 | |
330 | 98 | /* Now scan for the separator. */ |
331 | 128 | while (cur_chars) { |
332 | 53 | MVMint32 i, j; |
333 | 53 | MVMint32 start; |
334 | 53 | if (eof) { |
335 | 10 | start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0; |
336 | 14 | } |
337 | 39 | else { |
338 | 39 | start = cur_chars->length - max_sep_length; |
339 | 39 | if (cur_chars == ds->chars_head) { |
340 | 23 | if (start >= ds->chars_head_pos) |
341 | 22 | sep_loc += start - ds->chars_head_pos; |
342 | 23 | else |
343 | 1 | start = ds->chars_head_pos; |
344 | 23 | } |
345 | 16 | else { |
346 | 16 | if (start >= 0) |
347 | 16 | sep_loc += start; |
348 | 16 | else |
349 | 0 | start = 0; |
350 | 16 | } |
351 | 39 | } |
352 | 119 | for (i = start; i < cur_chars->length; i++) { |
353 | 89 | MVMint32 sep_graph_pos = 0; |
354 | 89 | MVMGrapheme32 cur_char = cur_chars->chars[i]; |
355 | 89 | sep_loc++; |
356 | 207 | for (j = 0; j < sep_spec->num_seps; j++) { |
357 | 141 | if (sep_spec->sep_graphemes[sep_graph_pos] == cur_char) { |
358 | 33 | if (sep_spec->sep_lengths[j] == 1) { |
359 | 15 | *sep_length = 1; |
360 | 15 | return sep_loc; |
361 | 15 | } |
362 | 18 | else if (have_separator(tc, cur_chars, i + 1, sep_spec, j, sep_graph_pos + 1)) { |
363 | 8 | *sep_length = sep_spec->sep_lengths[j]; |
364 | 8 | sep_loc += sep_spec->sep_lengths[j] - 1; |
365 | 8 | return sep_loc; |
366 | 8 | } |
367 | 33 | } |
368 | 118 | sep_graph_pos += sep_spec->sep_lengths[j]; |
369 | 118 | } |
370 | 89 | } |
371 | 30 | cur_chars = cur_chars->next; |
372 | 30 | } |
373 | 75 | return 0; |
374 | 98 | } |
375 | | MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, |
376 | 50 | MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { |
377 | 50 | MVMint32 sep_loc, sep_length; |
378 | 50 | |
379 | 50 | /* Look for separator, trying more decoding if it fails. We get the place |
380 | 50 | * just beyond the separator, so can use take_chars to get what's needed. |
381 | 50 | * Note that decoders are only responsible for finding the final char of |
382 | 50 | * the separator, so we may need to loop a few times around this. */ |
383 | 50 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0); |
384 | 100 | while (!sep_loc) { |
385 | 77 | MVMuint32 decode_outcome = run_decode(tc, ds, NULL, sep_spec, DECODE_NOT_EOF); |
386 | 77 | if (decode_outcome == RUN_DECODE_NOTHING_DECODED) |
387 | 27 | break; |
388 | 50 | if (decode_outcome == RUN_DECODE_STOPPER_REACHED) |
389 | 37 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0); |
390 | 50 | } |
391 | 50 | if (sep_loc) { |
392 | 23 | /* Use this line length as a guesstimate of the next, unless it's tiny |
393 | 23 | * in which case we treat it as an outlier (probably an empty line or |
394 | 23 | * some such). Also double it and round down to a multiple of 16. */ |
395 | 23 | if (sep_loc > 32) |
396 | 2 | ds->result_size_guess = (sep_loc << 1) & ~0xF; |
397 | 13 | return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0); |
398 | 23 | } |
399 | 27 | else { |
400 | 27 | return NULL; |
401 | 27 | } |
402 | 50 | } |
403 | | |
404 | | /* Variant of MVM_string_decodestream_get_until_sep that is called when we |
405 | | * reach EOF. Trims the final separator if there is one, or returns the last |
406 | | * line without the EOF marker. */ |
407 | | MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds, |
408 | 11 | MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { |
409 | 11 | MVMint32 sep_loc, sep_length; |
410 | 11 | |
411 | 11 | /* Decode anything remaining and flush normalization buffer. */ |
412 | 11 | reached_eof(tc, ds); |
413 | 11 | |
414 | 11 | /* Look for separator, which should by now be at the end, and chomp it |
415 | 11 | * off if needed. */ |
416 | 11 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 1); |
417 | 11 | if (sep_loc) |
418 | 0 | return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0); |
419 | 11 | |
420 | 11 | /* Otherwise, take all remaining chars. */ |
421 | 11 | return MVM_string_decodestream_get_all(tc, ds); |
422 | 11 | } |
423 | | |
424 | | /* Produces a string consisting of the characters available now in all decoded |
425 | | * buffers. */ |
426 | 207 | static MVMString * get_all_in_buffer(MVMThreadContext *tc, MVMDecodeStream *ds) { |
427 | 207 | MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
428 | 207 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
429 | 207 | |
430 | 207 | /* If there's no codepoint buffer, then return the empty string. */ |
431 | 207 | if (!ds->chars_head) { |
432 | 5 | result->body.storage.blob_32 = NULL; |
433 | 5 | result->body.num_graphs = 0; |
434 | 5 | } |
435 | 207 | |
436 | 207 | /* If there's exactly one resulting codepoint buffer and we swallowed none |
437 | 207 | * of it, just use it. */ |
438 | 202 | else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) { |
439 | 19 | /* Set up result string. */ |
440 | 19 | result->body.storage.blob_32 = ds->chars_head->chars; |
441 | 19 | result->body.num_graphs = ds->chars_head->length; |
442 | 19 | |
443 | 19 | /* Don't free the buffer's memory itself, just the holder, as we |
444 | 19 | * stole that for the buffer into the string above. */ |
445 | 19 | free_chars(tc, ds, ds->chars_head); |
446 | 19 | ds->chars_head = ds->chars_tail = NULL; |
447 | 19 | } |
448 | 202 | |
449 | 202 | /* Otherwise, need to assemble all the things. */ |
450 | 183 | else { |
451 | 183 | /* Calculate length. */ |
452 | 183 | MVMint32 length = 0, pos = 0; |
453 | 183 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
454 | 9.75k | while (cur_chars) { |
455 | 9.57k | if (cur_chars == ds->chars_head) |
456 | 183 | length += cur_chars->length - ds->chars_head_pos; |
457 | 9.57k | else |
458 | 9.39k | length += cur_chars->length; |
459 | 9.57k | cur_chars = cur_chars->next; |
460 | 9.57k | } |
461 | 183 | |
462 | 183 | /* Allocate a result buffer of the right size. */ |
463 | 183 | result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32)); |
464 | 183 | result->body.num_graphs = length; |
465 | 183 | |
466 | 183 | /* Copy all the things into the target, freeing as we go. */ |
467 | 183 | cur_chars = ds->chars_head; |
468 | 9.75k | while (cur_chars) { |
469 | 9.57k | MVMDecodeStreamChars *next_chars = cur_chars->next; |
470 | 9.57k | if (cur_chars == ds->chars_head) { |
471 | 183 | MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos; |
472 | 183 | memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos, |
473 | 183 | to_copy * sizeof(MVMGrapheme32)); |
474 | 183 | pos += to_copy; |
475 | 183 | } |
476 | 9.39k | else { |
477 | 9.39k | memcpy(result->body.storage.blob_32 + pos, cur_chars->chars, |
478 | 9.39k | cur_chars->length * sizeof(MVMGrapheme32)); |
479 | 9.39k | pos += cur_chars->length; |
480 | 9.39k | } |
481 | 9.57k | MVM_free(cur_chars->chars); |
482 | 9.57k | free_chars(tc, ds, cur_chars); |
483 | 9.57k | cur_chars = next_chars; |
484 | 9.57k | } |
485 | 183 | ds->chars_head = ds->chars_tail = NULL; |
486 | 183 | } |
487 | 207 | |
488 | 207 | return result; |
489 | 207 | } |
490 | | |
491 | | /* Decodes all the buffers, signals EOF to flush any normalization buffers, and |
492 | | * returns a string of all decoded chars. */ |
493 | 206 | MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) { |
494 | 206 | reached_eof(tc, ds); |
495 | 206 | return get_all_in_buffer(tc, ds); |
496 | 206 | } |
497 | | |
498 | | /* Decodes all the buffers we have, and returns a string of all decoded chars. |
499 | | * There may still be more to read after this, due to incomplete multi-byte |
500 | | * or multi-codepoint sequences that are not yet completely processed. */ |
501 | 1 | MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds) { |
502 | 1 | if (ds->bytes_head) { |
503 | 0 | ds->result_size_guess = ds->bytes_head->length; |
504 | 0 | run_decode(tc, ds, NULL, NULL, DECODE_NOT_EOF); |
505 | 0 | } |
506 | 1 | return get_all_in_buffer(tc, ds); |
507 | 1 | } |
508 | | |
509 | | /* Checks if we have the number of bytes requested. */ |
510 | 0 | MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes) { |
511 | 0 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
512 | 0 | MVMint32 found = 0; |
513 | 0 | while (cur_bytes) { |
514 | 0 | found += cur_bytes == ds->bytes_head |
515 | 0 | ? cur_bytes->length - ds->bytes_head_pos |
516 | 0 | : cur_bytes->length; |
517 | 0 | if (found >= bytes) |
518 | 0 | return 1; |
519 | 0 | cur_bytes = cur_bytes->next; |
520 | 0 | } |
521 | 0 | return 0; |
522 | 0 | } |
523 | | |
524 | | /* Gets the number of bytes available. */ |
525 | 16 | MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds) { |
526 | 16 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
527 | 16 | MVMint32 available = 0; |
528 | 25 | while (cur_bytes) { |
529 | 9 | available += cur_bytes == ds->bytes_head |
530 | 6 | ? cur_bytes->length - ds->bytes_head_pos |
531 | 3 | : cur_bytes->length; |
532 | 9 | cur_bytes = cur_bytes->next; |
533 | 9 | } |
534 | 16 | return available; |
535 | 16 | } |
536 | | |
537 | | /* Copies up to the requested number of bytes into the supplied buffer, and |
538 | | * returns the number of bytes we actually copied. Takes from the start |
539 | | * of the stream. */ |
540 | 1 | MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes) { |
541 | 1 | MVMint32 taken = 0; |
542 | 1 | *buf = NULL; |
543 | 3 | while (taken < bytes && ds->bytes_head) { |
544 | 2 | /* Take what we can. */ |
545 | 2 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
546 | 2 | MVMint32 required = bytes - taken; |
547 | 2 | MVMint32 available = cur_bytes->length - ds->bytes_head_pos; |
548 | 2 | if (available <= required) { |
549 | 2 | /* Take everything in this buffer and remove it. */ |
550 | 2 | if (!*buf) |
551 | 1 | *buf = MVM_malloc(cur_bytes->next ? bytes : available); |
552 | 2 | memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, available); |
553 | 2 | taken += available; |
554 | 2 | ds->bytes_head = cur_bytes->next; |
555 | 2 | ds->bytes_head_pos = 0; |
556 | 2 | MVM_free(cur_bytes->bytes); |
557 | 2 | MVM_free(cur_bytes); |
558 | 2 | } |
559 | 0 | else { |
560 | 0 | /* Just take what we need. */ |
561 | 0 | if (!*buf) |
562 | 0 | *buf = MVM_malloc(required); |
563 | 0 | memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, required); |
564 | 0 | taken += required; |
565 | 0 | ds->bytes_head_pos += required; |
566 | 0 | } |
567 | 2 | } |
568 | 1 | if (ds->bytes_head == NULL) |
569 | 1 | ds->bytes_tail = NULL; |
570 | 1 | ds->abs_byte_pos += taken; |
571 | 1 | return taken; |
572 | 1 | } |
573 | | |
574 | | /* Gets the absolute byte offset (the amount we started with plus what we've |
575 | | * chewed and handed back in decoded characters). */ |
576 | 0 | MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds) { |
577 | 0 | return ds->abs_byte_pos; |
578 | 0 | } |
579 | | |
580 | | /* Checks if the decode stream is empty. */ |
581 | 24 | MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) { |
582 | 21 | return !ds->bytes_head && !ds->chars_head && MVM_unicode_normalizer_empty(tc, &(ds->norm)); |
583 | 24 | } |
584 | | |
585 | | /* Destroys a decoding stream, freeing all associated memory (including the |
586 | | * buffers). */ |
587 | 91 | void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds) { |
588 | 91 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
589 | 91 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
590 | 91 | while (cur_bytes) { |
591 | 0 | MVMDecodeStreamBytes *next_bytes = cur_bytes->next; |
592 | 0 | MVM_free(cur_bytes->bytes); |
593 | 0 | MVM_free(cur_bytes); |
594 | 0 | cur_bytes = next_bytes; |
595 | 0 | } |
596 | 91 | while (cur_chars) { |
597 | 0 | MVMDecodeStreamChars *next_chars = cur_chars->next; |
598 | 0 | MVM_free(cur_chars->chars); |
599 | 0 | MVM_free(cur_chars); |
600 | 0 | cur_chars = next_chars; |
601 | 0 | } |
602 | 91 | MVM_unicode_normalizer_cleanup(tc, &(ds->norm)); |
603 | 91 | MVM_free(ds->decoder_state); |
604 | 91 | MVM_free(ds->chars_reuse); |
605 | 91 | MVM_free(ds); |
606 | 91 | } |
607 | | |
608 | | /* Calculates and caches various bits of information about separators, for |
609 | | * faster line reading. */ |
610 | 1.31k | static void cache_sep_info(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
611 | 1.31k | MVMGrapheme32 *final_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32)); |
612 | 1.31k | MVMint32 max_final_grapheme = -1; |
613 | 1.31k | MVMint32 max_sep_length = 1; |
614 | 1.31k | MVMint32 cur_sep_pos = 0; |
615 | 1.31k | MVMint32 i; |
616 | 3.94k | for (i = 0; i < sep_spec->num_seps; i++) { |
617 | 2.63k | MVMint32 length = sep_spec->sep_lengths[i]; |
618 | 2.63k | if (length > max_sep_length) |
619 | 4 | max_sep_length = length; |
620 | 2.63k | cur_sep_pos += length; |
621 | 2.63k | final_graphemes[i] = sep_spec->sep_graphemes[cur_sep_pos - 1]; |
622 | 2.63k | if (final_graphemes[i] > max_final_grapheme) |
623 | 1.31k | max_final_grapheme = final_graphemes[i]; |
624 | 2.63k | } |
625 | 1.31k | sep_spec->max_sep_length = max_sep_length; |
626 | 1.31k | sep_spec->final_graphemes = final_graphemes; |
627 | 1.31k | sep_spec->max_final_grapheme = max_final_grapheme; |
628 | 1.31k | } |
629 | | |
630 | | /* Sets a decode stream separator to its default value. */ |
631 | 661 | void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
632 | 661 | sep_spec->num_seps = 2; |
633 | 661 | sep_spec->sep_lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32)); |
634 | 661 | sep_spec->sep_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32)); |
635 | 661 | |
636 | 661 | sep_spec->sep_lengths[0] = 1; |
637 | 661 | sep_spec->sep_graphemes[0] = '\n'; |
638 | 661 | |
639 | 661 | sep_spec->sep_lengths[1] = 1; |
640 | 661 | sep_spec->sep_graphemes[1] = MVM_nfg_crlf_grapheme(tc); |
641 | 661 | |
642 | 661 | cache_sep_info(tc, sep_spec); |
643 | 661 | } |
644 | | |
645 | | /* Takes a string and sets it up as a decode stream separator. */ |
646 | | void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, |
647 | 655 | MVMString **seps, MVMint32 num_seps) { |
648 | 655 | MVMGraphemeIter gi; |
649 | 655 | MVMint32 i, graph_length, graph_pos; |
650 | 655 | |
651 | 655 | if (num_seps > 0xFFF) |
652 | 0 | MVM_exception_throw_adhoc(tc, "Too many line separators"); |
653 | 655 | |
654 | 655 | MVM_free(sep_spec->sep_lengths); |
655 | 655 | MVM_free(sep_spec->sep_graphemes); |
656 | 655 | MVM_free(sep_spec->final_graphemes); |
657 | 655 | |
658 | 655 | sep_spec->num_seps = num_seps; |
659 | 655 | sep_spec->sep_lengths = MVM_malloc(num_seps * sizeof(MVMint32)); |
660 | 655 | graph_length = 0; |
661 | 1.96k | for (i = 0; i < num_seps; i++) { |
662 | 1.30k | MVMuint32 num_graphs = MVM_string_graphs(tc, seps[i]); |
663 | 1.30k | if (num_graphs > 0xFFFF) |
664 | 0 | MVM_exception_throw_adhoc(tc, "Line separator too long"); |
665 | 1.30k | sep_spec->sep_lengths[i] = num_graphs; |
666 | 1.30k | graph_length += num_graphs; |
667 | 1.30k | } |
668 | 655 | |
669 | 655 | sep_spec->sep_graphemes = MVM_malloc(graph_length * sizeof(MVMGrapheme32)); |
670 | 655 | graph_pos = 0; |
671 | 1.96k | for (i = 0; i < num_seps; i++) { |
672 | 1.30k | MVM_string_gi_init(tc, &gi, seps[i]); |
673 | 2.62k | while (MVM_string_gi_has_more(tc, &gi)) |
674 | 1.31k | sep_spec->sep_graphemes[graph_pos++] = MVM_string_gi_get_grapheme(tc, &gi); |
675 | 1.30k | } |
676 | 655 | |
677 | 655 | cache_sep_info(tc, sep_spec); |
678 | 655 | } |
679 | | |
680 | | /* Cleans up memory associated with a stream separator set. */ |
681 | 91 | void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
682 | 91 | MVM_free(sep_spec->sep_lengths); |
683 | 91 | MVM_free(sep_spec->sep_graphemes); |
684 | 91 | MVM_free(sep_spec->final_graphemes); |
685 | 91 | } |