/home/travis/build/MoarVM/MoarVM/src/strings/decode_stream.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* A decode stream represents an on-going decoding process, from bytes into |
4 | | * characters. Bytes can be contributed to the decode stream, and chars can be |
5 | | * obtained. Byte buffers and decoded char buffers are kept in linked lists. |
6 | | * Note that characters may start at the end of one byte buffer and finish in |
7 | | * the next, which is taken care of by the logic in here and the decoders |
8 | | * themselves. Additionally, normalization may be applied using the normalizer |
9 | | * in the decode stream, at the discretion of the encoding in question (some, |
10 | | * such as ASCII and Latin-1, are normalized by definition). |
11 | | */ |
12 | | |
/* Values passed as the `eof` argument of run_decode, saying whether the
 * decode run is being made because the end of the input was reached. */
#define DECODE_NOT_EOF 0
#define DECODE_EOF 1
15 | | |
16 | | /* Creates a new decoding stream. */ |
17 | | MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, |
18 | 193 | MVMint64 abs_byte_pos, MVMint32 translate_newlines) { |
19 | 193 | MVMDecodeStream *ds = MVM_calloc(1, sizeof(MVMDecodeStream)); |
20 | 193 | ds->encoding = encoding; |
21 | 193 | ds->abs_byte_pos = abs_byte_pos; |
22 | 193 | MVM_unicode_normalizer_init(tc, &(ds->norm), MVM_NORMALIZE_NFG); |
23 | 193 | if (translate_newlines) |
24 | 187 | MVM_unicode_normalizer_translate_newlines(tc, &(ds->norm)); |
25 | 193 | return ds; |
26 | 193 | } |
27 | | |
28 | | /* Adds another byte buffer into the decoding stream. */ |
29 | 375 | void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, char *bytes, MVMint32 length) { |
30 | 375 | if (length > 0) { |
31 | 194 | MVMDecodeStreamBytes *new_bytes = MVM_calloc(1, sizeof(MVMDecodeStreamBytes)); |
32 | 194 | new_bytes->bytes = bytes; |
33 | 194 | new_bytes->length = length; |
34 | 194 | if (ds->bytes_tail) |
35 | 3 | ds->bytes_tail->next = new_bytes; |
36 | 194 | ds->bytes_tail = new_bytes; |
37 | 194 | if (!ds->bytes_head) |
38 | 191 | ds->bytes_head = new_bytes; |
39 | 194 | } |
40 | 181 | else { |
41 | 181 | /* It's empty, so free the buffer right away and don't add. */ |
42 | 181 | MVM_free(bytes); |
43 | 181 | } |
44 | 375 | } |
45 | | |
46 | | /* Adds another char result buffer into the decoding stream. */ |
47 | 242 | void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length) { |
48 | 242 | MVMDecodeStreamChars *new_chars = MVM_calloc(1, sizeof(MVMDecodeStreamChars)); |
49 | 242 | new_chars->chars = chars; |
50 | 242 | new_chars->length = length; |
51 | 242 | if (ds->chars_tail) |
52 | 33 | ds->chars_tail->next = new_chars; |
53 | 242 | ds->chars_tail = new_chars; |
54 | 242 | if (!ds->chars_head) |
55 | 209 | ds->chars_head = new_chars; |
56 | 242 | } |
57 | | |
58 | | /* Throws away byte buffers no longer needed. */ |
59 | 220 | void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos) { |
60 | 222 | while (ds->bytes_head != bytes) { |
61 | 2 | MVMDecodeStreamBytes *discard = ds->bytes_head; |
62 | 2 | ds->abs_byte_pos += discard->length - ds->bytes_head_pos; |
63 | 2 | ds->bytes_head = discard->next; |
64 | 2 | ds->bytes_head_pos = 0; |
65 | 2 | MVM_free(discard->bytes); |
66 | 2 | MVM_free(discard); |
67 | 2 | } |
68 | 220 | if (!ds->bytes_head && pos == 0) |
69 | 0 | return; |
70 | 220 | if (ds->bytes_head->length == pos) { |
71 | 185 | /* We ate all of the new head buffer too; also free it. */ |
72 | 185 | MVMDecodeStreamBytes *discard = ds->bytes_head; |
73 | 185 | ds->abs_byte_pos += discard->length - ds->bytes_head_pos; |
74 | 185 | ds->bytes_head = discard->next; |
75 | 185 | ds->bytes_head_pos = 0; |
76 | 185 | MVM_free(discard->bytes); |
77 | 185 | MVM_free(discard); |
78 | 185 | if (ds->bytes_head == NULL) |
79 | 185 | ds->bytes_tail = NULL; |
80 | 185 | } |
81 | 35 | else { |
82 | 35 | ds->abs_byte_pos += pos - ds->bytes_head_pos; |
83 | 35 | ds->bytes_head_pos = pos; |
84 | 35 | } |
85 | 220 | } |
86 | | |
87 | | /* Does a decode run, selected by encoding. Returns non-zero if we actually |
88 | | * decoded more chars. */ |
89 | 92 | #define RUN_DECODE_NOTHING_DECODED 0 |
90 | 181 | #define RUN_DECODE_STOPPER_NOT_REACHED 1 |
91 | 76 | #define RUN_DECODE_STOPPER_REACHED 2 |
92 | 244 | static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec, MVMint32 eof) { |
93 | 244 | MVMDecodeStreamChars *prev_chars_tail = ds->chars_tail; |
94 | 244 | MVMuint32 reached_stopper; |
95 | 244 | switch (ds->encoding) { |
96 | 242 | case MVM_encoding_type_utf8: |
97 | 242 | reached_stopper = MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec); |
98 | 242 | break; |
99 | 1 | case MVM_encoding_type_ascii: |
100 | 1 | reached_stopper = MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec); |
101 | 1 | break; |
102 | 1 | case MVM_encoding_type_latin1: |
103 | 1 | reached_stopper = MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec); |
104 | 1 | break; |
105 | 0 | case MVM_encoding_type_windows1252: |
106 | 0 | reached_stopper = MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec); |
107 | 0 | break; |
108 | 0 | case MVM_encoding_type_utf8_c8: |
109 | 0 | reached_stopper = MVM_string_utf8_c8_decodestream(tc, ds, stopper_chars, sep_spec, eof); |
110 | 0 | break; |
111 | 0 | default: |
112 | 0 | MVM_exception_throw_adhoc(tc, "Streaming decode NYI for encoding %d", |
113 | 0 | (int)ds->encoding); |
114 | 244 | } |
115 | 244 | if (ds->chars_tail == prev_chars_tail) |
116 | 28 | return RUN_DECODE_NOTHING_DECODED; |
117 | 216 | else if (reached_stopper) |
118 | 35 | return RUN_DECODE_STOPPER_REACHED; |
119 | 216 | else |
120 | 181 | return RUN_DECODE_STOPPER_NOT_REACHED; |
121 | 244 | } |
122 | | |
123 | | /* Gets the specified number of characters. If we are not yet able to decode |
124 | | * that many, returns NULL. This may mean more input buffers are needed. The |
125 | | * exclude parameter specifies a number of chars that should be taken from the |
126 | | * input buffer, but not included in the result string (for chomping a line |
127 | | * separator). */ |
128 | 16 | static MVMint32 missing_chars(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 wanted) { |
129 | 16 | MVMint32 got = 0; |
130 | 16 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
131 | 23 | while (cur_chars && got < wanted) { |
132 | 7 | if (cur_chars == ds->chars_head) |
133 | 7 | got += cur_chars->length - ds->chars_head_pos; |
134 | 7 | else |
135 | 0 | got += cur_chars->length; |
136 | 7 | cur_chars = cur_chars->next; |
137 | 7 | } |
138 | 11 | return got >= wanted ? 0 : wanted - got; |
139 | 16 | } |
140 | 25 | static MVMString * take_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint32 exclude) { |
141 | 25 | MVMString *result; |
142 | 25 | MVMint32 found = 0; |
143 | 25 | MVMint32 result_found = 0; |
144 | 25 | |
145 | 25 | MVMint32 result_chars = chars - exclude; |
146 | 25 | if (result_chars < 0) |
147 | 0 | MVM_exception_throw_adhoc(tc, "DecodeStream take_chars: chars - exclude < 0 should never happen"); |
148 | 25 | |
149 | 25 | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
150 | 25 | result->body.storage.blob_32 = MVM_malloc(result_chars * sizeof(MVMGrapheme32)); |
151 | 25 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
152 | 25 | result->body.num_graphs = result_chars; |
153 | 62 | while (found < chars) { |
154 | 37 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
155 | 37 | MVMint32 available = cur_chars->length - ds->chars_head_pos; |
156 | 37 | if (available <= chars - found) { |
157 | 37 | /* We need all that's left in this buffer and likely |
158 | 37 | * more. */ |
159 | 37 | MVMDecodeStreamChars *next_chars = cur_chars->next; |
160 | 37 | if (available <= result_chars - result_found) { |
161 | 26 | memcpy(result->body.storage.blob_32 + result_found, |
162 | 26 | cur_chars->chars + ds->chars_head_pos, |
163 | 26 | available * sizeof(MVMGrapheme32)); |
164 | 26 | result_found += available; |
165 | 26 | } |
166 | 11 | else { |
167 | 11 | MVMint32 to_copy = result_chars - result_found; |
168 | 11 | memcpy(result->body.storage.blob_32 + result_found, |
169 | 11 | cur_chars->chars + ds->chars_head_pos, |
170 | 11 | to_copy * sizeof(MVMGrapheme32)); |
171 | 11 | result_found += to_copy; |
172 | 11 | } |
173 | 37 | found += available; |
174 | 37 | MVM_free(cur_chars->chars); |
175 | 37 | MVM_free(cur_chars); |
176 | 37 | ds->chars_head = next_chars; |
177 | 37 | ds->chars_head_pos = 0; |
178 | 37 | if (ds->chars_head == NULL) |
179 | 25 | ds->chars_tail = NULL; |
180 | 37 | } |
181 | 0 | else { |
182 | 0 | /* There's enough in this buffer to satisfy us, and we'll leave |
183 | 0 | * some behind. */ |
184 | 0 | MVMint32 take = chars - found; |
185 | 0 | MVMint32 to_copy = result_chars - result_found; |
186 | 0 | memcpy(result->body.storage.blob_32 + result_found, |
187 | 0 | cur_chars->chars + ds->chars_head_pos, |
188 | 0 | to_copy * sizeof(MVMGrapheme32)); |
189 | 0 | result_found += to_copy; |
190 | 0 | found += take; |
191 | 0 | ds->chars_head_pos += take; |
192 | 0 | } |
193 | 37 | } |
194 | 25 | return result; |
195 | 25 | } |
196 | 8 | MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars) { |
197 | 8 | MVMint32 missing; |
198 | 8 | |
199 | 8 | /* If we request nothing, give empty string. */ |
200 | 8 | if (chars == 0) |
201 | 0 | return tc->instance->str_consts.empty; |
202 | 8 | |
203 | 8 | /* If we don't already have enough chars, try and decode more. */ |
204 | 8 | missing = missing_chars(tc, ds, chars); |
205 | 8 | if (missing) |
206 | 8 | run_decode(tc, ds, &missing, NULL, DECODE_NOT_EOF); |
207 | 8 | |
208 | 8 | /* If we've got enough, assemble a string. Otherwise, give up. */ |
209 | 8 | if (missing_chars(tc, ds, chars) == 0) |
210 | 5 | return take_chars(tc, ds, chars, 0); |
211 | 8 | else |
212 | 3 | return NULL; |
213 | 8 | } |
214 | | |
215 | | /* Gets characters up until one of the specified separators is encountered. If |
216 | | * we do not encounter it, returns 0. This may mean more input buffers are needed |
217 | | * or that we reached the end of the stream. Note that it assumes the separator |
218 | | * will exist near the end of the buffer, if it occurs at all, due to decode |
219 | | * streams looking for stoppers. */ |
220 | | static MVMint32 have_separator(MVMThreadContext *tc, MVMDecodeStreamChars *start_chars, MVMint32 start_pos, |
221 | 16 | MVMDecodeStreamSeparators *sep_spec, MVMint32 sep_idx, MVMint32 sep_graph_pos) { |
222 | 16 | MVMint32 sep_pos = 1; |
223 | 16 | MVMint32 sep_length = sep_spec->sep_lengths[sep_idx]; |
224 | 16 | MVMDecodeStreamChars *cur_chars = start_chars; |
225 | 32 | while (cur_chars) { |
226 | 16 | MVMint32 start = cur_chars == start_chars ? start_pos : 0; |
227 | 22 | MVMint32 i; |
228 | 22 | for (i = start; i < cur_chars->length; i++) { |
229 | 6 | if (cur_chars->chars[i] != sep_spec->sep_graphemes[sep_graph_pos]) |
230 | 0 | return 0; |
231 | 6 | sep_pos++; |
232 | 6 | if (sep_pos == sep_length) |
233 | 6 | return 1; |
234 | 0 | sep_graph_pos++; |
235 | 0 | } |
236 | 16 | cur_chars = cur_chars->next; |
237 | 16 | } |
238 | 10 | return 0; |
239 | 16 | } |
240 | | static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds, |
241 | 85 | MVMDecodeStreamSeparators *sep_spec, MVMint32 *sep_length) { |
242 | 85 | MVMint32 sep_loc = 0; |
243 | 85 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
244 | 85 | |
245 | 85 | /* First, skip over any buffers we need not consider. */ |
246 | 85 | MVMint32 max_sep_chars = MVM_string_decode_stream_sep_max_chars(tc, sep_spec); |
247 | 102 | while (cur_chars && cur_chars->next) { |
248 | 26 | if (cur_chars->next->length < max_sep_chars) |
249 | 9 | break; |
250 | 17 | sep_loc += cur_chars->length; |
251 | 17 | cur_chars = cur_chars->next; |
252 | 17 | } |
253 | 85 | |
254 | 85 | /* Now scan for the separator. */ |
255 | 109 | while (cur_chars) { |
256 | 24 | MVMint32 start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0; |
257 | 44 | MVMint32 i, j; |
258 | 188 | for (i = start; i < cur_chars->length; i++) { |
259 | 164 | MVMint32 sep_graph_pos = 0; |
260 | 164 | MVMGrapheme32 cur_char = cur_chars->chars[i]; |
261 | 164 | sep_loc++; |
262 | 447 | for (j = 0; j < sep_spec->num_seps; j++) { |
263 | 303 | if (sep_spec->sep_graphemes[sep_graph_pos] == cur_char) { |
264 | 30 | if (sep_spec->sep_lengths[j] == 1) { |
265 | 14 | *sep_length = 1; |
266 | 14 | return sep_loc; |
267 | 14 | } |
268 | 16 | else if (have_separator(tc, cur_chars, i + 1, sep_spec, j, sep_graph_pos + 1)) { |
269 | 6 | *sep_length = sep_spec->sep_lengths[j]; |
270 | 6 | sep_loc += sep_spec->sep_lengths[j] - 1; |
271 | 6 | return sep_loc; |
272 | 6 | } |
273 | 30 | } |
274 | 283 | sep_graph_pos += sep_spec->sep_lengths[j]; |
275 | 283 | } |
276 | 164 | } |
277 | 24 | cur_chars = cur_chars->next; |
278 | 24 | } |
279 | 65 | return 0; |
280 | 85 | } |
281 | | MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, |
282 | 43 | MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { |
283 | 43 | MVMint32 sep_loc, sep_length; |
284 | 43 | |
285 | 43 | /* Look for separator, trying more decoding if it fails. We get the place |
286 | 43 | * just beyond the separator, so can use take_chars to get what's need. |
287 | 43 | * Note that decoders are only responsible for finding the final char of |
288 | 43 | * the separator, so we may need to loop a few times around this. */ |
289 | 43 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length); |
290 | 84 | while (!sep_loc) { |
291 | 64 | MVMuint32 decode_outcome = run_decode(tc, ds, NULL, sep_spec, DECODE_NOT_EOF); |
292 | 64 | if (decode_outcome == RUN_DECODE_NOTHING_DECODED) |
293 | 23 | break; |
294 | 41 | if (decode_outcome == RUN_DECODE_STOPPER_REACHED) |
295 | 30 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length); |
296 | 41 | } |
297 | 43 | if (sep_loc) |
298 | 20 | return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0); |
299 | 43 | else |
300 | 23 | return NULL; |
301 | 43 | } |
302 | | |
303 | | /* In situations where we have hit EOF, we need to decode what's left and flush |
304 | | * the normalization buffer also. */ |
305 | 203 | static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) { |
306 | 203 | /* Decode all the things. */ |
307 | 203 | if (ds->bytes_head) |
308 | 172 | run_decode(tc, ds, NULL, NULL, DECODE_EOF); |
309 | 203 | |
310 | 203 | /* If there's some things left in the normalization buffer, take them. */ |
311 | 203 | MVM_unicode_normalizer_eof(tc, &(ds->norm)); |
312 | 203 | if (MVM_unicode_normalizer_available(tc, &(ds->norm))) { |
313 | 24 | MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm)); |
314 | 24 | MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32)); |
315 | 24 | MVMint32 count = 0; |
316 | 48 | while (ready--) |
317 | 24 | buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm)); |
318 | 24 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
319 | 24 | } |
320 | 203 | } |
321 | | |
322 | | /* Variant of MVM_string_decodestream_get_until_sep that is called when we |
323 | | * reach EOF. Trims the final separator if there is one, or returns the last |
324 | | * line without the EOF marker. */ |
325 | | MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds, |
326 | 12 | MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) { |
327 | 12 | MVMint32 sep_loc, sep_length; |
328 | 12 | |
329 | 12 | /* Decode anything remaining and flush normalization buffer. */ |
330 | 12 | reached_eof(tc, ds); |
331 | 12 | |
332 | 12 | /* Look for separator, which should by now be at the end, and chomp it |
333 | 12 | * off if needed. */ |
334 | 12 | sep_loc = find_separator(tc, ds, sep_spec, &sep_length); |
335 | 12 | if (sep_loc) |
336 | 0 | return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0); |
337 | 12 | |
338 | 12 | /* Otherwise, take all remaining chars. */ |
339 | 12 | return MVM_string_decodestream_get_all(tc, ds); |
340 | 12 | } |
341 | | |
342 | | /* Produces a string consisting of the characters available now in all decdoed |
343 | | * buffers. */ |
344 | 192 | static MVMString * get_all_in_buffer(MVMThreadContext *tc, MVMDecodeStream *ds) { |
345 | 192 | MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
346 | 192 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
347 | 192 | |
348 | 192 | /* If there's no codepoint buffer, then return the empty string. */ |
349 | 192 | if (!ds->chars_head) { |
350 | 8 | result->body.storage.blob_32 = NULL; |
351 | 8 | result->body.num_graphs = 0; |
352 | 8 | } |
353 | 192 | |
354 | 192 | /* If there's exactly one resulting codepoint buffer and we swallowed none |
355 | 192 | * of it, just use it. */ |
356 | 184 | else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) { |
357 | 164 | /* Set up result string. */ |
358 | 164 | result->body.storage.blob_32 = ds->chars_head->chars; |
359 | 164 | result->body.num_graphs = ds->chars_head->length; |
360 | 164 | |
361 | 164 | /* Don't free the buffer's memory itself, just the holder, as we |
362 | 164 | * stole that for the buffer into the string above. */ |
363 | 164 | MVM_free(ds->chars_head); |
364 | 164 | ds->chars_head = ds->chars_tail = NULL; |
365 | 164 | } |
366 | 184 | |
367 | 184 | /* Otherwise, need to assemble all the things. */ |
368 | 20 | else { |
369 | 20 | /* Calculate length. */ |
370 | 20 | MVMint32 length = 0, pos = 0; |
371 | 20 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
372 | 61 | while (cur_chars) { |
373 | 41 | if (cur_chars == ds->chars_head) |
374 | 20 | length += cur_chars->length - ds->chars_head_pos; |
375 | 41 | else |
376 | 21 | length += cur_chars->length; |
377 | 41 | cur_chars = cur_chars->next; |
378 | 41 | } |
379 | 20 | |
380 | 20 | /* Allocate a result buffer of the right size. */ |
381 | 20 | result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32)); |
382 | 20 | result->body.num_graphs = length; |
383 | 20 | |
384 | 20 | /* Copy all the things into the target, freeing as we go. */ |
385 | 20 | cur_chars = ds->chars_head; |
386 | 61 | while (cur_chars) { |
387 | 41 | MVMDecodeStreamChars *next_chars = cur_chars->next; |
388 | 41 | if (cur_chars == ds->chars_head) { |
389 | 20 | MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos; |
390 | 20 | memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos, |
391 | 20 | cur_chars->length * sizeof(MVMGrapheme32)); |
392 | 20 | pos += to_copy; |
393 | 20 | } |
394 | 21 | else { |
395 | 21 | memcpy(result->body.storage.blob_32 + pos, cur_chars->chars, |
396 | 21 | cur_chars->length * sizeof(MVMGrapheme32)); |
397 | 21 | pos += cur_chars->length; |
398 | 21 | } |
399 | 41 | MVM_free(cur_chars->chars); |
400 | 41 | MVM_free(cur_chars); |
401 | 41 | cur_chars = next_chars; |
402 | 41 | } |
403 | 20 | ds->chars_head = ds->chars_tail = NULL; |
404 | 20 | } |
405 | 192 | |
406 | 192 | return result; |
407 | 192 | } |
408 | | |
409 | | /* Decodes all the buffers, signals EOF to flush any normalization buffers, and |
410 | | * returns a string of all decoded chars. */ |
411 | 191 | MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) { |
412 | 191 | reached_eof(tc, ds); |
413 | 191 | return get_all_in_buffer(tc, ds); |
414 | 191 | } |
415 | | |
416 | | /* Decodes all the buffers we have, and returns a string of all decoded chars. |
417 | | * There may still be more to read after this, due to incomplete multi-byte |
418 | | * or multi-codepoint sequences that are not yet completely processed. */ |
419 | 1 | MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds) { |
420 | 1 | if (ds->bytes_head) |
421 | 0 | run_decode(tc, ds, NULL, NULL, DECODE_NOT_EOF); |
422 | 1 | return get_all_in_buffer(tc, ds); |
423 | 1 | } |
424 | | |
425 | | /* Checks if we have the number of bytes requested. */ |
426 | 8 | MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes) { |
427 | 8 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
428 | 8 | MVMint32 found = 0; |
429 | 8 | while (cur_bytes) { |
430 | 4 | found += cur_bytes == ds->bytes_head |
431 | 4 | ? cur_bytes->length - ds->bytes_head_pos |
432 | 0 | : cur_bytes->length; |
433 | 4 | if (found >= bytes) |
434 | 4 | return 1; |
435 | 0 | cur_bytes = cur_bytes->next; |
436 | 0 | } |
437 | 4 | return 0; |
438 | 8 | } |
439 | | |
440 | | /* Gets the number of bytes available. */ |
441 | 4 | MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds) { |
442 | 4 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
443 | 4 | MVMint32 available = 0; |
444 | 8 | while (cur_bytes) { |
445 | 4 | available += cur_bytes == ds->bytes_head |
446 | 3 | ? cur_bytes->length - ds->bytes_head_pos |
447 | 1 | : cur_bytes->length; |
448 | 4 | cur_bytes = cur_bytes->next; |
449 | 4 | } |
450 | 4 | return available; |
451 | 4 | } |
452 | | |
453 | | /* Copies up to the requested number of bytes into the supplied buffer, and |
454 | | * returns the number of bytes we actually copied. Takes from from the start |
455 | | * of the stream. */ |
456 | 5 | MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes) { |
457 | 5 | MVMint32 taken = 0; |
458 | 5 | *buf = NULL; |
459 | 11 | while (taken < bytes && ds->bytes_head) { |
460 | 6 | /* Take what we can. */ |
461 | 6 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
462 | 6 | MVMint32 required = bytes - taken; |
463 | 6 | MVMint32 available = cur_bytes->length - ds->bytes_head_pos; |
464 | 6 | if (available <= required) { |
465 | 6 | /* Take everything in this buffer and remove it. */ |
466 | 6 | if (!*buf) |
467 | 5 | *buf = MVM_malloc(cur_bytes->next ? bytes : available); |
468 | 6 | memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, available); |
469 | 6 | taken += available; |
470 | 6 | ds->bytes_head = cur_bytes->next; |
471 | 6 | ds->bytes_head_pos = 0; |
472 | 6 | MVM_free(cur_bytes->bytes); |
473 | 6 | MVM_free(cur_bytes); |
474 | 6 | } |
475 | 0 | else { |
476 | 0 | /* Just take what we need. */ |
477 | 0 | if (!*buf) |
478 | 0 | *buf = MVM_malloc(required); |
479 | 0 | memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, required); |
480 | 0 | taken += required; |
481 | 0 | ds->bytes_head_pos += required; |
482 | 0 | } |
483 | 6 | } |
484 | 5 | if (ds->bytes_head == NULL) |
485 | 5 | ds->bytes_tail = NULL; |
486 | 5 | ds->abs_byte_pos += taken; |
487 | 5 | return taken; |
488 | 5 | } |
489 | | |
490 | | /* Gets the absolute byte offset (the amount we started with plus what we've |
491 | | * chewed and handed back in decoded characters). */ |
492 | 8 | MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds) { |
493 | 8 | return ds->abs_byte_pos; |
494 | 8 | } |
495 | | |
496 | | /* Checks if the decode stream is empty. */ |
497 | 9 | MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) { |
498 | 6 | return !ds->bytes_head && !ds->chars_head && MVM_unicode_normalizer_empty(tc, &(ds->norm)); |
499 | 9 | } |
500 | | |
501 | | /* Destroys a decoding stream, freeing all associated memory (including the |
502 | | * buffers). */ |
503 | 185 | void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds) { |
504 | 185 | MVMDecodeStreamBytes *cur_bytes = ds->bytes_head; |
505 | 185 | MVMDecodeStreamChars *cur_chars = ds->chars_head; |
506 | 185 | while (cur_bytes) { |
507 | 0 | MVMDecodeStreamBytes *next_bytes = cur_bytes->next; |
508 | 0 | MVM_free(cur_bytes->bytes); |
509 | 0 | MVM_free(cur_bytes); |
510 | 0 | cur_bytes = next_bytes; |
511 | 0 | } |
512 | 185 | while (cur_chars) { |
513 | 0 | MVMDecodeStreamChars *next_chars = cur_chars->next; |
514 | 0 | MVM_free(cur_chars->chars); |
515 | 0 | MVM_free(cur_chars); |
516 | 0 | cur_chars = next_chars; |
517 | 0 | } |
518 | 185 | MVM_unicode_normalizer_cleanup(tc, &(ds->norm)); |
519 | 185 | MVM_free(ds->decoder_state); |
520 | 185 | MVM_free(ds); |
521 | 185 | } |
522 | | |
523 | | /* Sets a decode stream separator to its default value. */ |
524 | 598 | void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
525 | 598 | sep_spec->num_seps = 2; |
526 | 598 | sep_spec->sep_lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32)); |
527 | 598 | sep_spec->sep_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32)); |
528 | 598 | |
529 | 598 | sep_spec->sep_lengths[0] = 1; |
530 | 598 | sep_spec->sep_graphemes[0] = '\n'; |
531 | 598 | |
532 | 598 | sep_spec->sep_lengths[1] = 1; |
533 | 598 | sep_spec->sep_graphemes[1] = MVM_nfg_crlf_grapheme(tc); |
534 | 598 | } |
535 | | |
536 | | /* Takes a string and sets it up as a decode stream separator. */ |
537 | | void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, |
538 | 14 | MVMString **seps, MVMint32 num_seps) { |
539 | 14 | MVMGraphemeIter gi; |
540 | 14 | MVMint32 i, graph_length, graph_pos; |
541 | 14 | |
542 | 14 | if (num_seps > 0xFFF) |
543 | 0 | MVM_exception_throw_adhoc(tc, "Too many line separators"); |
544 | 14 | |
545 | 14 | MVM_free(sep_spec->sep_lengths); |
546 | 14 | MVM_free(sep_spec->sep_graphemes); |
547 | 14 | |
548 | 14 | sep_spec->num_seps = num_seps; |
549 | 14 | sep_spec->sep_lengths = MVM_malloc(num_seps * sizeof(MVMint32)); |
550 | 14 | graph_length = 0; |
551 | 35 | for (i = 0; i < num_seps; i++) { |
552 | 21 | MVMuint32 num_graphs = MVM_string_graphs(tc, seps[i]); |
553 | 21 | if (num_graphs > 0xFFFF) |
554 | 0 | MVM_exception_throw_adhoc(tc, "Line separator too long"); |
555 | 21 | sep_spec->sep_lengths[i] = num_graphs; |
556 | 21 | graph_length += num_graphs; |
557 | 21 | } |
558 | 14 | |
559 | 14 | sep_spec->sep_graphemes = MVM_malloc(graph_length * sizeof(MVMGrapheme32)); |
560 | 14 | graph_pos = 0; |
561 | 35 | for (i = 0; i < num_seps; i++) { |
562 | 21 | MVM_string_gi_init(tc, &gi, seps[i]); |
563 | 48 | while (MVM_string_gi_has_more(tc, &gi)) |
564 | 27 | sep_spec->sep_graphemes[graph_pos++] = MVM_string_gi_get_grapheme(tc, &gi); |
565 | 21 | } |
566 | 14 | } |
567 | | |
568 | | /* Returns the maximum length of any separator, in graphemes. */ |
569 | 85 | MVMint32 MVM_string_decode_stream_sep_max_chars(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
570 | 85 | MVMint32 i; |
571 | 85 | MVMint32 max_length = 1; |
572 | 250 | for (i = 0; i < sep_spec->num_seps; i++) |
573 | 165 | if (sep_spec->sep_lengths[i] > max_length) |
574 | 31 | max_length = sep_spec->sep_lengths[i]; |
575 | 85 | return max_length; |
576 | 85 | } |
577 | | |
578 | | /* Cleans up memory associated with a stream separator set. */ |
579 | 52 | void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) { |
580 | 52 | MVM_free(sep_spec->sep_lengths); |
581 | 52 | MVM_free(sep_spec->sep_graphemes); |
582 | 52 | } |