Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/decode_stream.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* A decode stream represents an on-going decoding process, from bytes into
4
 * characters. Bytes can be contributed to the decode stream, and chars can be
5
 * obtained. Byte buffers and decoded char buffers are kept in linked lists.
6
 * Note that characters may start at the end of one byte buffer and finish in
7
 * the next, which is taken care of by the logic in here and the decoders
8
 * themselves. Additionally, normalization may be applied using the normalizer
9
 * in the decode stream, at the discretion of the encoding in question (some,
10
 * such as ASCII and Latin-1, are normalized by definition).
11
 */
12
13
85
#define DECODE_NOT_EOF  0
14
188
#define DECODE_EOF      1
15
16
/* Creates a new decoding stream. */
17
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding,
18
661
        MVMint64 abs_byte_pos, MVMint32 translate_newlines) {
19
661
    MVMDecodeStream *ds = MVM_calloc(1, sizeof(MVMDecodeStream));
20
661
    ds->encoding        = encoding;
21
661
    ds->abs_byte_pos    = abs_byte_pos;
22
661
    MVM_unicode_normalizer_init(tc, &(ds->norm), MVM_NORMALIZE_NFG);
23
661
    if (translate_newlines)
24
647
        MVM_unicode_normalizer_translate_newlines(tc, &(ds->norm));
25
661
    ds->result_size_guess = 64;
26
661
    return ds;
27
661
}
28
29
/* Adds another byte buffer into the decoding stream. */
30
213
void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, char *bytes, MVMint32 length) {
31
213
    if (length > 0) {
32
212
        MVMDecodeStreamBytes *new_bytes = MVM_calloc(1, sizeof(MVMDecodeStreamBytes));
33
212
        new_bytes->bytes  = bytes;
34
212
        new_bytes->length = length;
35
212
        if (ds->bytes_tail)
36
7
            ds->bytes_tail->next = new_bytes;
37
212
        ds->bytes_tail = new_bytes;
38
212
        if (!ds->bytes_head)
39
205
            ds->bytes_head = new_bytes;
40
212
    }
41
1
    else {
42
1
        /* It's empty, so free the buffer right away and don't add. */
43
1
        MVM_free(bytes);
44
1
    }
45
213
}
46
47
/* Adds another char result buffer into the decoding stream. */
48
9.64k
void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length) {
49
9.64k
    MVMDecodeStreamChars *new_chars;
50
9.64k
    if (ds->chars_reuse) {
51
28
        new_chars = ds->chars_reuse;
52
28
        ds->chars_reuse = NULL;
53
28
    }
54
9.61k
    else {
55
9.61k
        new_chars = MVM_malloc(sizeof(MVMDecodeStreamChars));
56
9.61k
    }
57
9.64k
    new_chars->chars  = chars;
58
9.64k
    new_chars->length = length;
59
9.64k
    new_chars->next = NULL;
60
9.64k
    if (ds->chars_tail)
61
9.41k
        ds->chars_tail->next = new_chars;
62
9.64k
    ds->chars_tail = new_chars;
63
9.64k
    if (!ds->chars_head)
64
230
        ds->chars_head = new_chars;
65
9.64k
}
66
67
/* Internal function to free a chars result structure, putting it into the
68
 * re-use slot if it's empty. */
69
9.64k
static void free_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamChars *chars) {
70
9.64k
    if (ds->chars_reuse)
71
9.41k
        MVM_free(chars);
72
9.64k
    else
73
230
        ds->chars_reuse = chars;
74
9.64k
}
75
76
/* Throws away byte buffers no longer needed. */
77
245
void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos) {
78
251
    while (ds->bytes_head != bytes) {
79
6
        MVMDecodeStreamBytes *discard = ds->bytes_head;
80
6
        ds->abs_byte_pos += discard->length - ds->bytes_head_pos;
81
6
        ds->bytes_head = discard->next;
82
6
        ds->bytes_head_pos = 0;
83
6
        MVM_free(discard->bytes);
84
6
        MVM_free(discard);
85
6
    }
86
245
    if (!ds->bytes_head) {
87
0
        if (MVM_LIKELY(pos == 0))
88
0
            return;
89
0
        /* Guard against null pointer dereference below. */
90
0
        else
91
0
            MVM_exception_throw_adhoc(tc,
92
0
                "Unknown error encountered in MVM_string_decodestream_discard_to");
93
0
    }
94
245
    if (ds->bytes_head->length == pos) {
95
203
        /* We ate all of the new head buffer too; also free it. */
96
203
        MVMDecodeStreamBytes *discard = ds->bytes_head;
97
203
        ds->abs_byte_pos += discard->length - ds->bytes_head_pos;
98
203
        ds->bytes_head = discard->next;
99
203
        ds->bytes_head_pos = 0;
100
203
        MVM_free(discard->bytes);
101
203
        MVM_free(discard);
102
203
        if (ds->bytes_head == NULL)
103
203
            ds->bytes_tail = NULL;
104
203
    }
105
42
    else {
106
42
        ds->abs_byte_pos += pos - ds->bytes_head_pos;
107
42
        ds->bytes_head_pos = pos;
108
42
    }
109
245
}
110
111
/* Does a decode run, selected by encoding. Returns non-zero if we actually
112
 * decoded more chars. */
113
109
#define RUN_DECODE_NOTHING_DECODED          0
114
199
#define RUN_DECODE_STOPPER_NOT_REACHED      1
115
92
#define RUN_DECODE_STOPPER_REACHED          2
116
273
static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec, MVMint32 eof) {
117
273
    MVMDecodeStreamChars *prev_chars_tail = ds->chars_tail;
118
273
    MVMuint32 reached_stopper;
119
273
    switch (ds->encoding) {
120
271
    case MVM_encoding_type_utf8:
121
271
        reached_stopper = MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec);
122
271
        break;
123
1
    case MVM_encoding_type_ascii:
124
1
        reached_stopper = MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec);
125
1
        break;
126
1
    case MVM_encoding_type_latin1:
127
1
        reached_stopper = MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec);
128
1
        break;
129
0
    case MVM_encoding_type_windows1252:
130
0
        reached_stopper = MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec);
131
0
        break;
132
0
    case MVM_encoding_type_windows1251:
133
0
        reached_stopper = MVM_string_windows1251_decodestream(tc, ds, stopper_chars, sep_spec);
134
0
        break;
135
0
    case MVM_encoding_type_shiftjis:
136
0
        reached_stopper = MVM_string_shiftjis_decodestream(tc, ds, stopper_chars, sep_spec);
137
0
        break;
138
0
    case MVM_encoding_type_utf8_c8:
139
0
        reached_stopper = MVM_string_utf8_c8_decodestream(tc, ds, stopper_chars, sep_spec, eof);
140
0
        break;
141
0
    default:
142
0
        MVM_exception_throw_adhoc(tc, "Streaming decode NYI for encoding %d",
143
0
            (int)ds->encoding);
144
273
    }
145
273
    if (ds->chars_tail == prev_chars_tail)
146
32
        return RUN_DECODE_NOTHING_DECODED;
147
241
    else if (reached_stopper)
148
42
        return RUN_DECODE_STOPPER_REACHED;
149
241
    else
150
199
        return RUN_DECODE_STOPPER_NOT_REACHED;
151
273
}
152
153
/* In situations where we have hit EOF, we need to decode what's left and flush
154
 * the normalization buffer also. */
155
217
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
156
217
    /* Decode all the things. */
157
217
    if (ds->bytes_head)
158
188
        run_decode(tc, ds, NULL, NULL, DECODE_EOF);
159
217
160
217
    /* If there's some things left in the normalization buffer, take them. */
161
217
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
162
217
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
163
27
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
164
27
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
165
27
        MVMint32 count = 0;
166
54
        while (ready--)
167
27
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
168
27
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
169
27
    }
170
217
}
171
172
/* Gets the specified number of characters. If we are not yet able to decode
173
 * that many, returns NULL. This may mean more input buffers are needed. The
174
 * exclude parameter specifies a number of chars that should be taken from the
175
 * input buffer, but not included in the result string (for chomping a line
176
 * separator). */
177
16
/* Works out how many of the `wanted` chars are not yet present in the
 * stream's decoded buffers, taking the already-consumed part of the head
 * buffer into account. Returns 0 when enough chars are available. */
static MVMint32 missing_chars(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 wanted) {
    const MVMDecodeStreamChars *node;
    MVMint32 have = 0;

    /* Stop walking as soon as the request is known to be satisfiable. */
    for (node = ds->chars_head; node && have < wanted; node = node->next) {
        have += node->length;
        if (node == ds->chars_head)
            have -= ds->chars_head_pos;
    }

    return have < wanted ? wanted - have : 0;
}
189
28
/* Removes `chars` graphemes from the front of the stream's decoded buffers
 * and returns a string made of the first `chars - exclude` of them; the
 * trailing `exclude` graphemes are consumed but left out of the string.
 * Throws if exclude is greater than chars. The buffers are walked without
 * NULL checks, so the requested chars must already be present. */
static MVMString * take_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint32 exclude) {
    MVMString     *str;
    MVMGrapheme32 *dest;
    MVMint32       consumed = 0;                /* Graphemes removed from the stream. */
    MVMint32       copied   = 0;                /* Graphemes written to the result. */
    MVMint32       keep     = chars - exclude;  /* Length of the result string. */

    if (keep < 0)
        MVM_exception_throw_adhoc(tc, "DecodeStream take_chars: chars - exclude < 0 should never happen");

    str                    = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    str->body.storage_type = MVM_STRING_GRAPHEME_32;
    str->body.num_graphs   = keep;

    /* Fast path: the untouched head buffer is exactly the amount requested,
     * so its grapheme array becomes the string's storage without copying. */
    if (ds->chars_head->length == chars && ds->chars_head_pos == 0) {
        MVMDecodeStreamChars *whole = ds->chars_head;
        str->body.storage.blob_32 = whole->chars;
        ds->chars_head = whole->next;
        if (!ds->chars_head)
            ds->chars_tail = NULL;
        free_chars(tc, ds, whole);
        return str;
    }

    /* Slow path: gather the graphemes from one or more buffers. */
    dest = MVM_malloc(keep * sizeof(MVMGrapheme32));
    str->body.storage.blob_32 = dest;
    while (consumed < chars) {
        MVMDecodeStreamChars *head        = ds->chars_head;
        MVMGrapheme32        *src         = head->chars + ds->chars_head_pos;
        MVMint32              in_head     = head->length - ds->chars_head_pos;
        MVMint32              outstanding = chars - consumed;
        MVMint32              room        = keep - copied;

        if (in_head > outstanding) {
            /* This buffer covers the rest of the request with some to
             * spare; copy what still belongs in the result and step the
             * head position past everything that was taken. */
            memcpy(dest + copied, src, room * sizeof(MVMGrapheme32));
            copied             += room;
            consumed           += outstanding;
            ds->chars_head_pos += outstanding;
        }
        else {
            /* The remainder of this buffer is consumed entirely. Only as
             * much as still fits in the result is copied; anything beyond
             * that falls in the excluded part. */
            MVMDecodeStreamChars *following = head->next;
            MVMint32              to_copy   = in_head <= room ? in_head : room;
            memcpy(dest + copied, src, to_copy * sizeof(MVMGrapheme32));
            copied   += to_copy;
            consumed += in_head;
            MVM_free(head->chars);
            free_chars(tc, ds, head);
            ds->chars_head     = following;
            ds->chars_head_pos = 0;
            if (!following)
                ds->chars_tail = NULL;
        }
    }
    return str;
}
260
MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds,
261
8
                                              MVMint32 chars, MVMint64 eof) {
262
8
    MVMint32 missing;
263
8
264
8
    /* If we request nothing, give empty string. */
265
8
    if (chars == 0)
266
0
        return tc->instance->str_consts.empty;
267
8
268
8
    /* If we don't already have enough chars, try and decode more. */
269
8
    missing = missing_chars(tc, ds, chars);
270
8
    ds->result_size_guess = missing;
271
8
    if (missing)
272
8
        run_decode(tc, ds, &missing, NULL, DECODE_NOT_EOF);
273
8
274
8
    /* If we've got enough, assemble a string. Otherwise, flag EOF and retry,
275
8
     * falling back to returning what's available. */
276
8
    if (missing_chars(tc, ds, chars) == 0) {
277
5
        return take_chars(tc, ds, chars, 0);
278
5
    }
279
3
    else if (eof) {
280
0
        reached_eof(tc, ds);
281
0
        return missing_chars(tc, ds, chars) == 0
282
0
            ? take_chars(tc, ds, chars, 0)
283
0
            : MVM_string_decodestream_get_all(tc, ds);
284
0
    }
285
3
    else {
286
3
        return NULL;
287
3
    }
288
8
}
289
290
/* Gets characters up until one of the specified separators is encountered. If
291
 * we do not encounter it, returns 0. This may mean more input buffers are needed
292
 * or that we reached the end of the stream. Note that it assumes the separator
293
 * will exist near the end of the buffer, if it occurs at all, due to decode
294
 * streams looking for stoppers. */
295
static MVMint32 have_separator(MVMThreadContext *tc, MVMDecodeStreamChars *start_chars, MVMint32 start_pos,
296
18
                               MVMDecodeStreamSeparators *sep_spec, MVMint32 sep_idx, MVMint32 sep_graph_pos) {
297
18
    MVMint32 sep_pos = 1;
298
18
    MVMint32 sep_length = sep_spec->sep_lengths[sep_idx];
299
18
    MVMDecodeStreamChars *cur_chars = start_chars;
300
34
    while (cur_chars) {
301
18
        MVMint32 start = cur_chars == start_chars ? start_pos : 0;
302
24
        MVMint32 i;
303
24
        for (i = start; i < cur_chars->length; i++) {
304
8
            if (cur_chars->chars[i] != sep_spec->sep_graphemes[sep_graph_pos])
305
0
                return 0;
306
8
            sep_pos++;
307
8
            if (sep_pos == sep_length)
308
8
                return 1;
309
0
            sep_graph_pos++;
310
0
        }
311
16
        cur_chars = cur_chars->next;
312
16
    }
313
10
    return 0;
314
18
}
315
static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds,
316
                               MVMDecodeStreamSeparators *sep_spec, MVMint32 *sep_length,
317
98
                               int eof) {
318
98
    MVMint32 sep_loc = 0;
319
98
    MVMDecodeStreamChars *cur_chars = ds->chars_head;
320
98
321
98
    /* First, skip over any buffers we need not consider. */
322
98
    MVMint32 max_sep_length = sep_spec->max_sep_length;
323
126
    while (cur_chars && cur_chars->next) {
324
37
        if (cur_chars->next->length < max_sep_length)
325
9
            break;
326
28
        sep_loc += cur_chars->length;
327
28
        cur_chars = cur_chars->next;
328
28
    }
329
98
330
98
    /* Now scan for the separator. */
331
128
    while (cur_chars) {
332
53
        MVMint32 i, j;
333
53
        MVMint32 start;
334
53
        if (eof) {
335
10
            start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0;
336
14
        }
337
39
        else {
338
39
            start = cur_chars->length - max_sep_length;
339
39
            if (cur_chars == ds->chars_head) {
340
23
                if (start >= ds->chars_head_pos)
341
22
                    sep_loc += start - ds->chars_head_pos;
342
23
                else
343
1
                    start = ds->chars_head_pos;
344
23
            }
345
16
            else {
346
16
                if (start >= 0)
347
16
                    sep_loc += start;
348
16
                else
349
0
                    start = 0;
350
16
            }
351
39
        }
352
119
        for (i = start; i < cur_chars->length; i++) {
353
89
            MVMint32 sep_graph_pos = 0;
354
89
            MVMGrapheme32 cur_char = cur_chars->chars[i];
355
89
            sep_loc++;
356
207
            for (j = 0; j < sep_spec->num_seps; j++) {
357
141
                if (sep_spec->sep_graphemes[sep_graph_pos] == cur_char) {
358
33
                    if (sep_spec->sep_lengths[j] == 1) {
359
15
                        *sep_length = 1;
360
15
                        return sep_loc;
361
15
                    }
362
18
                    else if (have_separator(tc, cur_chars, i + 1, sep_spec, j, sep_graph_pos + 1)) {
363
8
                        *sep_length = sep_spec->sep_lengths[j];
364
8
                        sep_loc += sep_spec->sep_lengths[j] - 1;
365
8
                        return sep_loc;
366
8
                    }
367
33
                }
368
118
                sep_graph_pos += sep_spec->sep_lengths[j];
369
118
            }
370
89
        }
371
30
        cur_chars = cur_chars->next;
372
30
    }
373
75
    return 0;
374
98
}
375
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds,
376
50
                                                  MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) {
377
50
    MVMint32 sep_loc, sep_length;
378
50
379
50
    /* Look for separator, trying more decoding if it fails. We get the place
380
50
     * just beyond the separator, so can use take_chars to get what's need.
381
50
     * Note that decoders are only responsible for finding the final char of
382
50
     * the separator, so we may need to loop a few times around this. */
383
50
    sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0);
384
100
    while (!sep_loc) {
385
77
        MVMuint32 decode_outcome = run_decode(tc, ds, NULL, sep_spec, DECODE_NOT_EOF);
386
77
        if (decode_outcome == RUN_DECODE_NOTHING_DECODED)
387
27
            break;
388
50
        if (decode_outcome == RUN_DECODE_STOPPER_REACHED)
389
37
            sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 0);
390
50
    }
391
50
    if (sep_loc) {
392
23
        /* Use this line length as a guesstimate of the next, unless it's tiny
393
23
         * in which case we treat it as an outlier (probably an empty line or
394
23
         * some such). Also round up and to a nice power of 2. */
395
23
        if (sep_loc > 32)
396
2
            ds->result_size_guess = (sep_loc << 1) & ~0xF;
397
13
        return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0);
398
23
    }
399
27
    else {
400
27
        return NULL;
401
27
    }
402
50
}
403
404
/* Variant of MVM_string_decodestream_get_until_sep that is called when we
405
 * reach EOF. Trims the final separator if there is one, or returns the last
406
 * line without the EOF marker. */
407
MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds,
408
11
                                                      MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) {
409
11
    MVMint32 sep_loc, sep_length;
410
11
411
11
    /* Decode anything remaining and flush normalization buffer. */
412
11
    reached_eof(tc, ds);
413
11
414
11
    /* Look for separator, which should by now be at the end, and chomp it
415
11
     * off if needed. */
416
11
    sep_loc = find_separator(tc, ds, sep_spec, &sep_length, 1);
417
11
    if (sep_loc)
418
0
        return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0);
419
11
420
11
    /* Otherwise, take all remaining chars. */
421
11
    return MVM_string_decodestream_get_all(tc, ds);
422
11
}
423
424
/* Produces a string consisting of the characters available now in all decdoed
425
 * buffers. */
426
207
static MVMString * get_all_in_buffer(MVMThreadContext *tc, MVMDecodeStream *ds) {
427
207
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
428
207
    result->body.storage_type = MVM_STRING_GRAPHEME_32;
429
207
430
207
    /* If there's no codepoint buffer, then return the empty string. */
431
207
    if (!ds->chars_head) {
432
5
        result->body.storage.blob_32 = NULL;
433
5
        result->body.num_graphs      = 0;
434
5
    }
435
207
436
207
    /* If there's exactly one resulting codepoint buffer and we swallowed none
437
207
     * of it, just use it. */
438
202
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
439
19
        /* Set up result string. */
440
19
        result->body.storage.blob_32 = ds->chars_head->chars;
441
19
        result->body.num_graphs      = ds->chars_head->length;
442
19
443
19
        /* Don't free the buffer's memory itself, just the holder, as we
444
19
         * stole that for the buffer into the string above. */
445
19
        free_chars(tc, ds, ds->chars_head);
446
19
        ds->chars_head = ds->chars_tail = NULL;
447
19
    }
448
202
449
202
    /* Otherwise, need to assemble all the things. */
450
183
    else {
451
183
        /* Calculate length. */
452
183
        MVMint32 length = 0, pos = 0;
453
183
        MVMDecodeStreamChars *cur_chars = ds->chars_head;
454
9.75k
        while (cur_chars) {
455
9.57k
            if (cur_chars == ds->chars_head)
456
183
                length += cur_chars->length - ds->chars_head_pos;
457
9.57k
            else
458
9.39k
                length += cur_chars->length;
459
9.57k
            cur_chars = cur_chars->next;
460
9.57k
        }
461
183
462
183
        /* Allocate a result buffer of the right size. */
463
183
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
464
183
        result->body.num_graphs      = length;
465
183
466
183
        /* Copy all the things into the target, freeing as we go. */
467
183
        cur_chars = ds->chars_head;
468
9.75k
        while (cur_chars) {
469
9.57k
            MVMDecodeStreamChars *next_chars = cur_chars->next;
470
9.57k
            if (cur_chars == ds->chars_head) {
471
183
                MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos;
472
183
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos,
473
183
                    to_copy * sizeof(MVMGrapheme32));
474
183
                pos += to_copy;
475
183
            }
476
9.39k
            else {
477
9.39k
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars,
478
9.39k
                    cur_chars->length * sizeof(MVMGrapheme32));
479
9.39k
                pos += cur_chars->length;
480
9.39k
            }
481
9.57k
            MVM_free(cur_chars->chars);
482
9.57k
            free_chars(tc, ds, cur_chars);
483
9.57k
            cur_chars = next_chars;
484
9.57k
        }
485
183
        ds->chars_head = ds->chars_tail = NULL;
486
183
    }
487
207
488
207
    return result;
489
207
}
490
491
/* Decodes all the buffers, signals EOF to flush any normalization buffers, and
492
 * returns a string of all decoded chars. */
493
206
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
494
206
    reached_eof(tc, ds);
495
206
    return get_all_in_buffer(tc, ds);
496
206
}
497
498
/* Decodes all the buffers we have, and returns a string of all decoded chars.
499
 * There may still be more to read after this, due to incomplete multi-byte
500
 * or multi-codepoint sequences that are not yet completely processed. */
501
1
MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds) {
502
1
    if (ds->bytes_head) {
503
0
        ds->result_size_guess = ds->bytes_head->length;
504
0
        run_decode(tc, ds, NULL, NULL, DECODE_NOT_EOF);
505
0
    }
506
1
    return get_all_in_buffer(tc, ds);
507
1
}
508
509
/* Checks if we have the number of bytes requested. */
510
0
MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes) {
511
0
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
512
0
    MVMint32 found = 0;
513
0
    while (cur_bytes) {
514
0
        found += cur_bytes == ds->bytes_head
515
0
            ? cur_bytes->length - ds->bytes_head_pos
516
0
            : cur_bytes->length;
517
0
        if (found >= bytes)
518
0
            return 1;
519
0
        cur_bytes = cur_bytes->next;
520
0
    }
521
0
    return 0;
522
0
}
523
524
/* Gets the number of bytes available. */
525
16
MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds) {
526
16
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
527
16
    MVMint32 available = 0;
528
25
    while (cur_bytes) {
529
9
        available += cur_bytes == ds->bytes_head
530
6
            ? cur_bytes->length - ds->bytes_head_pos
531
3
            : cur_bytes->length;
532
9
        cur_bytes = cur_bytes->next;
533
9
    }
534
16
    return available;
535
16
}
536
537
/* Copies up to the requested number of bytes into the supplied buffer, and
538
 * returns the number of bytes we actually copied. Takes from from the start
539
 * of the stream. */
540
1
MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes) {
541
1
    MVMint32 taken = 0;
542
1
    *buf = NULL;
543
3
    while (taken < bytes && ds->bytes_head) {
544
2
        /* Take what we can. */
545
2
        MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
546
2
        MVMint32 required  = bytes - taken;
547
2
        MVMint32 available = cur_bytes->length - ds->bytes_head_pos;
548
2
        if (available <= required) {
549
2
            /* Take everything in this buffer and remove it. */
550
2
            if (!*buf)
551
1
                *buf = MVM_malloc(cur_bytes->next ? bytes : available);
552
2
            memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, available);
553
2
            taken += available;
554
2
            ds->bytes_head = cur_bytes->next;
555
2
            ds->bytes_head_pos = 0;
556
2
            MVM_free(cur_bytes->bytes);
557
2
            MVM_free(cur_bytes);
558
2
        }
559
0
        else {
560
0
            /* Just take what we need. */
561
0
            if (!*buf)
562
0
                *buf = MVM_malloc(required);
563
0
            memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, required);
564
0
            taken += required;
565
0
            ds->bytes_head_pos += required;
566
0
        }
567
2
    }
568
1
    if (ds->bytes_head == NULL)
569
1
        ds->bytes_tail = NULL;
570
1
    ds->abs_byte_pos += taken;
571
1
    return taken;
572
1
}
573
574
/* Gets the absolute byte offset (the amount we started with plus what we've
575
 * chewed and handed back in decoded characters). */
576
0
MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds) {
577
0
    return ds->abs_byte_pos;
578
0
}
579
580
/* Checks if the decode stream is empty. */
581
24
MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) {
582
21
    return !ds->bytes_head && !ds->chars_head && MVM_unicode_normalizer_empty(tc, &(ds->norm));
583
24
}
584
585
/* Destroys a decoding stream, freeing all associated memory (including the
586
 * buffers). */
587
91
void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds) {
588
91
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
589
91
    MVMDecodeStreamChars *cur_chars = ds->chars_head;
590
91
    while (cur_bytes) {
591
0
        MVMDecodeStreamBytes *next_bytes = cur_bytes->next;
592
0
        MVM_free(cur_bytes->bytes);
593
0
        MVM_free(cur_bytes);
594
0
        cur_bytes = next_bytes;
595
0
    }
596
91
    while (cur_chars) {
597
0
        MVMDecodeStreamChars *next_chars = cur_chars->next;
598
0
        MVM_free(cur_chars->chars);
599
0
        MVM_free(cur_chars);
600
0
        cur_chars = next_chars;
601
0
    }
602
91
    MVM_unicode_normalizer_cleanup(tc, &(ds->norm));
603
91
    MVM_free(ds->decoder_state);
604
91
    MVM_free(ds->chars_reuse);
605
91
    MVM_free(ds);
606
91
}
607
608
/* Calculates and caches various bits of information about separators, for
609
 * faster line reading. */
610
1.31k
static void cache_sep_info(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
611
1.31k
    MVMGrapheme32 *final_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
612
1.31k
    MVMint32 max_final_grapheme = -1;
613
1.31k
    MVMint32 max_sep_length = 1;
614
1.31k
    MVMint32 cur_sep_pos = 0;
615
1.31k
    MVMint32 i;
616
3.94k
    for (i = 0; i < sep_spec->num_seps; i++) {
617
2.63k
        MVMint32 length = sep_spec->sep_lengths[i];
618
2.63k
        if (length > max_sep_length)
619
4
            max_sep_length = length;
620
2.63k
        cur_sep_pos += length;
621
2.63k
        final_graphemes[i] = sep_spec->sep_graphemes[cur_sep_pos - 1];
622
2.63k
        if (final_graphemes[i] > max_final_grapheme)
623
1.31k
            max_final_grapheme = final_graphemes[i];
624
2.63k
    }
625
1.31k
    sep_spec->max_sep_length = max_sep_length;
626
1.31k
    sep_spec->final_graphemes = final_graphemes;
627
1.31k
    sep_spec->max_final_grapheme = max_final_grapheme;
628
1.31k
}
629
630
/* Sets a decode stream separator to its default value. */
631
661
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
632
661
    sep_spec->num_seps = 2;
633
661
    sep_spec->sep_lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32));
634
661
    sep_spec->sep_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
635
661
636
661
    sep_spec->sep_lengths[0] = 1;
637
661
    sep_spec->sep_graphemes[0] = '\n';
638
661
639
661
    sep_spec->sep_lengths[1] = 1;
640
661
    sep_spec->sep_graphemes[1] = MVM_nfg_crlf_grapheme(tc);
641
661
642
661
    cache_sep_info(tc, sep_spec);
643
661
}
644
645
/* Takes a string and sets it up as a decode stream separator. */
646
void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec,
647
655
                                               MVMString **seps, MVMint32 num_seps) {
648
655
    MVMGraphemeIter gi;
649
655
    MVMint32 i, graph_length, graph_pos;
650
655
651
655
    if (num_seps > 0xFFF)
652
0
        MVM_exception_throw_adhoc(tc, "Too many line separators");
653
655
654
655
    MVM_free(sep_spec->sep_lengths);
655
655
    MVM_free(sep_spec->sep_graphemes);
656
655
    MVM_free(sep_spec->final_graphemes);
657
655
658
655
    sep_spec->num_seps = num_seps;
659
655
    sep_spec->sep_lengths = MVM_malloc(num_seps * sizeof(MVMint32));
660
655
    graph_length = 0;
661
1.96k
    for (i = 0; i < num_seps; i++) {
662
1.30k
        MVMuint32 num_graphs = MVM_string_graphs(tc, seps[i]);
663
1.30k
        if (num_graphs > 0xFFFF)
664
0
            MVM_exception_throw_adhoc(tc, "Line separator too long");
665
1.30k
        sep_spec->sep_lengths[i] = num_graphs;
666
1.30k
        graph_length += num_graphs;
667
1.30k
    }
668
655
669
655
    sep_spec->sep_graphemes = MVM_malloc(graph_length * sizeof(MVMGrapheme32));
670
655
    graph_pos = 0;
671
1.96k
    for (i = 0; i < num_seps; i++) {
672
1.30k
        MVM_string_gi_init(tc, &gi, seps[i]);
673
2.62k
        while (MVM_string_gi_has_more(tc, &gi))
674
1.31k
            sep_spec->sep_graphemes[graph_pos++] = MVM_string_gi_get_grapheme(tc, &gi);
675
1.30k
    }
676
655
677
655
    cache_sep_info(tc, sep_spec);
678
655
}
679
680
/* Cleans up memory associated with a stream separator set. */
681
91
void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
682
91
    MVM_free(sep_spec->sep_lengths);
683
91
    MVM_free(sep_spec->sep_graphemes);
684
91
    MVM_free(sep_spec->final_graphemes);
685
91
}