Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/decode_stream.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* A decode stream represents an on-going decoding process, from bytes into
4
 * characters. Bytes can be contributed to the decode stream, and chars can be
5
 * obtained. Byte buffers and decoded char buffers are kept in linked lists.
6
 * Note that characters may start at the end of one byte buffer and finish in
7
 * the next, which is taken care of by the logic in here and the decoders
8
 * themselves. Additionally, normalization may be applied using the normalizer
9
 * in the decode stream, at the discretion of the encoding in question (some,
10
 * such as ASCII and Latin-1, are normalized by definition).
11
 */
12
13
72
#define DECODE_NOT_EOF  0
14
172
#define DECODE_EOF      1
15
16
/* Creates a new decoding stream. */
17
MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding,
18
193
        MVMint64 abs_byte_pos, MVMint32 translate_newlines) {
19
193
    MVMDecodeStream *ds = MVM_calloc(1, sizeof(MVMDecodeStream));
20
193
    ds->encoding        = encoding;
21
193
    ds->abs_byte_pos    = abs_byte_pos;
22
193
    MVM_unicode_normalizer_init(tc, &(ds->norm), MVM_NORMALIZE_NFG);
23
193
    if (translate_newlines)
24
187
        MVM_unicode_normalizer_translate_newlines(tc, &(ds->norm));
25
193
    return ds;
26
193
}
27
28
/* Adds another byte buffer into the decoding stream. */
29
375
void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, char *bytes, MVMint32 length) {
30
375
    if (length > 0) {
31
194
        MVMDecodeStreamBytes *new_bytes = MVM_calloc(1, sizeof(MVMDecodeStreamBytes));
32
194
        new_bytes->bytes  = bytes;
33
194
        new_bytes->length = length;
34
194
        if (ds->bytes_tail)
35
3
            ds->bytes_tail->next = new_bytes;
36
194
        ds->bytes_tail = new_bytes;
37
194
        if (!ds->bytes_head)
38
191
            ds->bytes_head = new_bytes;
39
194
    }
40
181
    else {
41
181
        /* It's empty, so free the buffer right away and don't add. */
42
181
        MVM_free(bytes);
43
181
    }
44
375
}
45
46
/* Adds another char result buffer into the decoding stream. */
47
242
void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length) {
48
242
    MVMDecodeStreamChars *new_chars = MVM_calloc(1, sizeof(MVMDecodeStreamChars));
49
242
    new_chars->chars  = chars;
50
242
    new_chars->length = length;
51
242
    if (ds->chars_tail)
52
33
        ds->chars_tail->next = new_chars;
53
242
    ds->chars_tail = new_chars;
54
242
    if (!ds->chars_head)
55
209
        ds->chars_head = new_chars;
56
242
}
57
58
/* Throws away byte buffers no longer needed. */
59
220
void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos) {
60
222
    while (ds->bytes_head != bytes) {
61
2
        MVMDecodeStreamBytes *discard = ds->bytes_head;
62
2
        ds->abs_byte_pos += discard->length - ds->bytes_head_pos;
63
2
        ds->bytes_head = discard->next;
64
2
        ds->bytes_head_pos = 0;
65
2
        MVM_free(discard->bytes);
66
2
        MVM_free(discard);
67
2
    }
68
220
    if (!ds->bytes_head && pos == 0)
69
0
        return;
70
220
    if (ds->bytes_head->length == pos) {
71
185
        /* We ate all of the new head buffer too; also free it. */
72
185
        MVMDecodeStreamBytes *discard = ds->bytes_head;
73
185
        ds->abs_byte_pos += discard->length - ds->bytes_head_pos;
74
185
        ds->bytes_head = discard->next;
75
185
        ds->bytes_head_pos = 0;
76
185
        MVM_free(discard->bytes);
77
185
        MVM_free(discard);
78
185
        if (ds->bytes_head == NULL)
79
185
            ds->bytes_tail = NULL;
80
185
    }
81
35
    else {
82
35
        ds->abs_byte_pos += pos - ds->bytes_head_pos;
83
35
        ds->bytes_head_pos = pos;
84
35
    }
85
220
}
86
87
/* Does a decode run, selected by encoding. Returns non-zero if we actually
88
 * decoded more chars. */
89
92
#define RUN_DECODE_NOTHING_DECODED          0
90
181
#define RUN_DECODE_STOPPER_NOT_REACHED      1
91
76
#define RUN_DECODE_STOPPER_REACHED          2
92
244
static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec, MVMint32 eof) {
93
244
    MVMDecodeStreamChars *prev_chars_tail = ds->chars_tail;
94
244
    MVMuint32 reached_stopper;
95
244
    switch (ds->encoding) {
96
242
    case MVM_encoding_type_utf8:
97
242
        reached_stopper = MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec);
98
242
        break;
99
1
    case MVM_encoding_type_ascii:
100
1
        reached_stopper = MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec);
101
1
        break;
102
1
    case MVM_encoding_type_latin1:
103
1
        reached_stopper = MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec);
104
1
        break;
105
0
    case MVM_encoding_type_windows1252:
106
0
        reached_stopper = MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec);
107
0
        break;
108
0
    case MVM_encoding_type_utf8_c8:
109
0
        reached_stopper = MVM_string_utf8_c8_decodestream(tc, ds, stopper_chars, sep_spec, eof);
110
0
        break;
111
0
    default:
112
0
        MVM_exception_throw_adhoc(tc, "Streaming decode NYI for encoding %d",
113
0
            (int)ds->encoding);
114
244
    }
115
244
    if (ds->chars_tail == prev_chars_tail)
116
28
        return RUN_DECODE_NOTHING_DECODED;
117
216
    else if (reached_stopper)
118
35
        return RUN_DECODE_STOPPER_REACHED;
119
216
    else
120
181
        return RUN_DECODE_STOPPER_NOT_REACHED;
121
244
}
122
123
/* Gets the specified number of characters. If we are not yet able to decode
124
 * that many, returns NULL. This may mean more input buffers are needed. The
125
 * exclude parameter specifies a number of chars that should be taken from the
126
 * input buffer, but not included in the result string (for chomping a line
127
 * separator). */
128
16
static MVMint32 missing_chars(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 wanted) {
129
16
    MVMint32 got = 0;
130
16
    MVMDecodeStreamChars *cur_chars = ds->chars_head;
131
23
    while (cur_chars && got < wanted) {
132
7
        if (cur_chars == ds->chars_head)
133
7
            got += cur_chars->length - ds->chars_head_pos;
134
7
        else
135
0
            got += cur_chars->length;
136
7
        cur_chars = cur_chars->next;
137
7
    }
138
11
    return got >= wanted ? 0 : wanted - got;
139
16
}
140
25
static MVMString * take_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint32 exclude) {
141
25
    MVMString *result;
142
25
    MVMint32   found = 0;
143
25
    MVMint32   result_found = 0;
144
25
145
25
    MVMint32   result_chars = chars - exclude;
146
25
    if (result_chars < 0)
147
0
        MVM_exception_throw_adhoc(tc, "DecodeStream take_chars: chars - exclude < 0 should never happen");
148
25
149
25
    result                       = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
150
25
    result->body.storage.blob_32 = MVM_malloc(result_chars * sizeof(MVMGrapheme32));
151
25
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
152
25
    result->body.num_graphs      = result_chars;
153
62
    while (found < chars) {
154
37
        MVMDecodeStreamChars *cur_chars = ds->chars_head;
155
37
        MVMint32 available = cur_chars->length - ds->chars_head_pos;
156
37
        if (available <= chars - found) {
157
37
            /* We need all that's left in this buffer and likely
158
37
             * more. */
159
37
            MVMDecodeStreamChars *next_chars = cur_chars->next;
160
37
            if (available <= result_chars - result_found) {
161
26
                memcpy(result->body.storage.blob_32 + result_found,
162
26
                    cur_chars->chars + ds->chars_head_pos,
163
26
                    available * sizeof(MVMGrapheme32));
164
26
                result_found += available;
165
26
            }
166
11
            else {
167
11
                MVMint32 to_copy = result_chars - result_found;
168
11
                memcpy(result->body.storage.blob_32 + result_found,
169
11
                    cur_chars->chars + ds->chars_head_pos,
170
11
                    to_copy * sizeof(MVMGrapheme32));
171
11
                result_found += to_copy;
172
11
            }
173
37
            found += available;
174
37
            MVM_free(cur_chars->chars);
175
37
            MVM_free(cur_chars);
176
37
            ds->chars_head = next_chars;
177
37
            ds->chars_head_pos = 0;
178
37
            if (ds->chars_head == NULL)
179
25
                ds->chars_tail = NULL;
180
37
        }
181
0
        else {
182
0
            /* There's enough in this buffer to satisfy us, and we'll leave
183
0
             * some behind. */
184
0
            MVMint32 take = chars - found;
185
0
            MVMint32 to_copy = result_chars - result_found;
186
0
            memcpy(result->body.storage.blob_32 + result_found,
187
0
                cur_chars->chars + ds->chars_head_pos,
188
0
                to_copy * sizeof(MVMGrapheme32));
189
0
            result_found += to_copy;
190
0
            found += take;
191
0
            ds->chars_head_pos += take;
192
0
        }
193
37
    }
194
25
    return result;
195
25
}
196
8
MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars) {
197
8
    MVMint32 missing;
198
8
199
8
    /* If we request nothing, give empty string. */
200
8
    if (chars == 0)
201
0
        return tc->instance->str_consts.empty;
202
8
203
8
    /* If we don't already have enough chars, try and decode more. */
204
8
    missing = missing_chars(tc, ds, chars);
205
8
    if (missing)
206
8
        run_decode(tc, ds, &missing, NULL, DECODE_NOT_EOF);
207
8
208
8
    /* If we've got enough, assemble a string. Otherwise, give up. */
209
8
    if (missing_chars(tc, ds, chars) == 0)
210
5
        return take_chars(tc, ds, chars, 0);
211
8
    else
212
3
        return NULL;
213
8
}
214
215
/* Gets characters up until one of the specified separators is encountered. If
216
 * we do not encounter it, returns 0. This may mean more input buffers are needed
217
 * or that we reached the end of the stream. Note that it assumes the separator
218
 * will exist near the end of the buffer, if it occurs at all, due to decode
219
 * streams looking for stoppers. */
220
static MVMint32 have_separator(MVMThreadContext *tc, MVMDecodeStreamChars *start_chars, MVMint32 start_pos,
221
16
                               MVMDecodeStreamSeparators *sep_spec, MVMint32 sep_idx, MVMint32 sep_graph_pos) {
222
16
    MVMint32 sep_pos = 1;
223
16
    MVMint32 sep_length = sep_spec->sep_lengths[sep_idx];
224
16
    MVMDecodeStreamChars *cur_chars = start_chars;
225
32
    while (cur_chars) {
226
16
        MVMint32 start = cur_chars == start_chars ? start_pos : 0;
227
22
        MVMint32 i;
228
22
        for (i = start; i < cur_chars->length; i++) {
229
6
            if (cur_chars->chars[i] != sep_spec->sep_graphemes[sep_graph_pos])
230
0
                return 0;
231
6
            sep_pos++;
232
6
            if (sep_pos == sep_length)
233
6
                return 1;
234
0
            sep_graph_pos++;
235
0
        }
236
16
        cur_chars = cur_chars->next;
237
16
    }
238
10
    return 0;
239
16
}
240
static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds,
241
85
                               MVMDecodeStreamSeparators *sep_spec, MVMint32 *sep_length) {
242
85
    MVMint32 sep_loc = 0;
243
85
    MVMDecodeStreamChars *cur_chars = ds->chars_head;
244
85
245
85
    /* First, skip over any buffers we need not consider. */
246
85
    MVMint32 max_sep_chars = MVM_string_decode_stream_sep_max_chars(tc, sep_spec);
247
102
    while (cur_chars && cur_chars->next) {
248
26
        if (cur_chars->next->length < max_sep_chars)
249
9
            break;
250
17
        sep_loc += cur_chars->length;
251
17
        cur_chars = cur_chars->next;
252
17
    }
253
85
254
85
    /* Now scan for the separator. */
255
109
    while (cur_chars) {
256
24
        MVMint32 start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0;
257
44
        MVMint32 i, j;
258
188
        for (i = start; i < cur_chars->length; i++) {
259
164
            MVMint32 sep_graph_pos = 0;
260
164
            MVMGrapheme32 cur_char = cur_chars->chars[i];
261
164
            sep_loc++;
262
447
            for (j = 0; j < sep_spec->num_seps; j++) {
263
303
                if (sep_spec->sep_graphemes[sep_graph_pos] == cur_char) {
264
30
                    if (sep_spec->sep_lengths[j] == 1) {
265
14
                        *sep_length = 1;
266
14
                        return sep_loc;
267
14
                    }
268
16
                    else if (have_separator(tc, cur_chars, i + 1, sep_spec, j, sep_graph_pos + 1)) {
269
6
                        *sep_length = sep_spec->sep_lengths[j];
270
6
                        sep_loc += sep_spec->sep_lengths[j] - 1;
271
6
                        return sep_loc;
272
6
                    }
273
30
                }
274
283
                sep_graph_pos += sep_spec->sep_lengths[j];
275
283
            }
276
164
        }
277
24
        cur_chars = cur_chars->next;
278
24
    }
279
65
    return 0;
280
85
}
281
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds,
282
43
                                                  MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) {
283
43
    MVMint32 sep_loc, sep_length;
284
43
285
43
    /* Look for separator, trying more decoding if it fails. We get the place
286
43
     * just beyond the separator, so can use take_chars to get what's need.
287
43
     * Note that decoders are only responsible for finding the final char of
288
43
     * the separator, so we may need to loop a few times around this. */
289
43
    sep_loc = find_separator(tc, ds, sep_spec, &sep_length);
290
84
    while (!sep_loc) {
291
64
        MVMuint32 decode_outcome = run_decode(tc, ds, NULL, sep_spec, DECODE_NOT_EOF);
292
64
        if (decode_outcome == RUN_DECODE_NOTHING_DECODED)
293
23
            break;
294
41
        if (decode_outcome == RUN_DECODE_STOPPER_REACHED)
295
30
            sep_loc = find_separator(tc, ds, sep_spec, &sep_length);
296
41
    }
297
43
    if (sep_loc)
298
20
        return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0);
299
43
    else
300
23
        return NULL;
301
43
}
302
303
/* In situations where we have hit EOF, we need to decode what's left and flush
304
 * the normalization buffer also. */
305
203
static void reached_eof(MVMThreadContext *tc, MVMDecodeStream *ds) {
306
203
    /* Decode all the things. */
307
203
    if (ds->bytes_head)
308
172
        run_decode(tc, ds, NULL, NULL, DECODE_EOF);
309
203
310
203
    /* If there's some things left in the normalization buffer, take them. */
311
203
    MVM_unicode_normalizer_eof(tc, &(ds->norm));
312
203
    if (MVM_unicode_normalizer_available(tc, &(ds->norm))) {
313
24
        MVMint32 ready = MVM_unicode_normalizer_available(tc, &(ds->norm));
314
24
        MVMGrapheme32 *buffer = MVM_malloc(ready * sizeof(MVMGrapheme32));
315
24
        MVMint32 count = 0;
316
48
        while (ready--)
317
24
            buffer[count++] = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
318
24
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
319
24
    }
320
203
}
321
322
/* Variant of MVM_string_decodestream_get_until_sep that is called when we
323
 * reach EOF. Trims the final separator if there is one, or returns the last
324
 * line without the EOF marker. */
325
MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds,
326
12
                                                      MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp) {
327
12
    MVMint32 sep_loc, sep_length;
328
12
329
12
    /* Decode anything remaining and flush normalization buffer. */
330
12
    reached_eof(tc, ds);
331
12
332
12
    /* Look for separator, which should by now be at the end, and chomp it
333
12
     * off if needed. */
334
12
    sep_loc = find_separator(tc, ds, sep_spec, &sep_length);
335
12
    if (sep_loc)
336
0
        return take_chars(tc, ds, sep_loc, chomp ? sep_length : 0);
337
12
338
12
    /* Otherwise, take all remaining chars. */
339
12
    return MVM_string_decodestream_get_all(tc, ds);
340
12
}
341
342
/* Produces a string consisting of the characters available now in all decdoed
343
 * buffers. */
344
192
static MVMString * get_all_in_buffer(MVMThreadContext *tc, MVMDecodeStream *ds) {
345
192
    MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
346
192
    result->body.storage_type = MVM_STRING_GRAPHEME_32;
347
192
348
192
    /* If there's no codepoint buffer, then return the empty string. */
349
192
    if (!ds->chars_head) {
350
8
        result->body.storage.blob_32 = NULL;
351
8
        result->body.num_graphs      = 0;
352
8
    }
353
192
354
192
    /* If there's exactly one resulting codepoint buffer and we swallowed none
355
192
     * of it, just use it. */
356
184
    else if (ds->chars_head == ds->chars_tail && ds->chars_head_pos == 0) {
357
164
        /* Set up result string. */
358
164
        result->body.storage.blob_32 = ds->chars_head->chars;
359
164
        result->body.num_graphs      = ds->chars_head->length;
360
164
361
164
        /* Don't free the buffer's memory itself, just the holder, as we
362
164
         * stole that for the buffer into the string above. */
363
164
        MVM_free(ds->chars_head);
364
164
        ds->chars_head = ds->chars_tail = NULL;
365
164
    }
366
184
367
184
    /* Otherwise, need to assemble all the things. */
368
20
    else {
369
20
        /* Calculate length. */
370
20
        MVMint32 length = 0, pos = 0;
371
20
        MVMDecodeStreamChars *cur_chars = ds->chars_head;
372
61
        while (cur_chars) {
373
41
            if (cur_chars == ds->chars_head)
374
20
                length += cur_chars->length - ds->chars_head_pos;
375
41
            else
376
21
                length += cur_chars->length;
377
41
            cur_chars = cur_chars->next;
378
41
        }
379
20
380
20
        /* Allocate a result buffer of the right size. */
381
20
        result->body.storage.blob_32 = MVM_malloc(length * sizeof(MVMGrapheme32));
382
20
        result->body.num_graphs      = length;
383
20
384
20
        /* Copy all the things into the target, freeing as we go. */
385
20
        cur_chars = ds->chars_head;
386
61
        while (cur_chars) {
387
41
            MVMDecodeStreamChars *next_chars = cur_chars->next;
388
41
            if (cur_chars == ds->chars_head) {
389
20
                MVMint32 to_copy = ds->chars_head->length - ds->chars_head_pos;
390
20
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars + ds->chars_head_pos,
391
20
                    cur_chars->length * sizeof(MVMGrapheme32));
392
20
                pos += to_copy;
393
20
            }
394
21
            else {
395
21
                memcpy(result->body.storage.blob_32 + pos, cur_chars->chars,
396
21
                    cur_chars->length * sizeof(MVMGrapheme32));
397
21
                pos += cur_chars->length;
398
21
            }
399
41
            MVM_free(cur_chars->chars);
400
41
            MVM_free(cur_chars);
401
41
            cur_chars = next_chars;
402
41
        }
403
20
        ds->chars_head = ds->chars_tail = NULL;
404
20
    }
405
192
406
192
    return result;
407
192
}
408
409
/* Decodes all the buffers, signals EOF to flush any normalization buffers, and
410
 * returns a string of all decoded chars. */
411
191
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds) {
412
191
    reached_eof(tc, ds);
413
191
    return get_all_in_buffer(tc, ds);
414
191
}
415
416
/* Decodes all the buffers we have, and returns a string of all decoded chars.
417
 * There may still be more to read after this, due to incomplete multi-byte
418
 * or multi-codepoint sequences that are not yet completely processed. */
419
1
MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds) {
420
1
    if (ds->bytes_head)
421
0
        run_decode(tc, ds, NULL, NULL, DECODE_NOT_EOF);
422
1
    return get_all_in_buffer(tc, ds);
423
1
}
424
425
/* Checks if we have the number of bytes requested. */
426
8
MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes) {
427
8
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
428
8
    MVMint32 found = 0;
429
8
    while (cur_bytes) {
430
4
        found += cur_bytes == ds->bytes_head
431
4
            ? cur_bytes->length - ds->bytes_head_pos
432
0
            : cur_bytes->length;
433
4
        if (found >= bytes)
434
4
            return 1;
435
0
        cur_bytes = cur_bytes->next;
436
0
    }
437
4
    return 0;
438
8
}
439
440
/* Gets the number of bytes available. */
441
4
MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds) {
442
4
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
443
4
    MVMint32 available = 0;
444
8
    while (cur_bytes) {
445
4
        available += cur_bytes == ds->bytes_head
446
3
            ? cur_bytes->length - ds->bytes_head_pos
447
1
            : cur_bytes->length;
448
4
        cur_bytes = cur_bytes->next;
449
4
    }
450
4
    return available;
451
4
}
452
453
/* Copies up to the requested number of bytes into the supplied buffer, and
454
 * returns the number of bytes we actually copied. Takes from from the start
455
 * of the stream. */
456
5
MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes) {
457
5
    MVMint32 taken = 0;
458
5
    *buf = NULL;
459
11
    while (taken < bytes && ds->bytes_head) {
460
6
        /* Take what we can. */
461
6
        MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
462
6
        MVMint32 required  = bytes - taken;
463
6
        MVMint32 available = cur_bytes->length - ds->bytes_head_pos;
464
6
        if (available <= required) {
465
6
            /* Take everything in this buffer and remove it. */
466
6
            if (!*buf)
467
5
                *buf = MVM_malloc(cur_bytes->next ? bytes : available);
468
6
            memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, available);
469
6
            taken += available;
470
6
            ds->bytes_head = cur_bytes->next;
471
6
            ds->bytes_head_pos = 0;
472
6
            MVM_free(cur_bytes->bytes);
473
6
            MVM_free(cur_bytes);
474
6
        }
475
0
        else {
476
0
            /* Just take what we need. */
477
0
            if (!*buf)
478
0
                *buf = MVM_malloc(required);
479
0
            memcpy(*buf + taken, cur_bytes->bytes + ds->bytes_head_pos, required);
480
0
            taken += required;
481
0
            ds->bytes_head_pos += required;
482
0
        }
483
6
    }
484
5
    if (ds->bytes_head == NULL)
485
5
        ds->bytes_tail = NULL;
486
5
    ds->abs_byte_pos += taken;
487
5
    return taken;
488
5
}
489
490
/* Gets the absolute byte offset (the amount we started with plus what we've
491
 * chewed and handed back in decoded characters). */
492
8
MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds) {
493
8
    return ds->abs_byte_pos;
494
8
}
495
496
/* Checks if the decode stream is empty. */
497
9
MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds) {
498
6
    return !ds->bytes_head && !ds->chars_head && MVM_unicode_normalizer_empty(tc, &(ds->norm));
499
9
}
500
501
/* Destroys a decoding stream, freeing all associated memory (including the
502
 * buffers). */
503
185
void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds) {
504
185
    MVMDecodeStreamBytes *cur_bytes = ds->bytes_head;
505
185
    MVMDecodeStreamChars *cur_chars = ds->chars_head;
506
185
    while (cur_bytes) {
507
0
        MVMDecodeStreamBytes *next_bytes = cur_bytes->next;
508
0
        MVM_free(cur_bytes->bytes);
509
0
        MVM_free(cur_bytes);
510
0
        cur_bytes = next_bytes;
511
0
    }
512
185
    while (cur_chars) {
513
0
        MVMDecodeStreamChars *next_chars = cur_chars->next;
514
0
        MVM_free(cur_chars->chars);
515
0
        MVM_free(cur_chars);
516
0
        cur_chars = next_chars;
517
0
    }
518
185
    MVM_unicode_normalizer_cleanup(tc, &(ds->norm));
519
185
    MVM_free(ds->decoder_state);
520
185
    MVM_free(ds);
521
185
}
522
523
/* Sets a decode stream separator to its default value. */
524
598
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
525
598
    sep_spec->num_seps = 2;
526
598
    sep_spec->sep_lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32));
527
598
    sep_spec->sep_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));
528
598
529
598
    sep_spec->sep_lengths[0] = 1;
530
598
    sep_spec->sep_graphemes[0] = '\n';
531
598
532
598
    sep_spec->sep_lengths[1] = 1;
533
598
    sep_spec->sep_graphemes[1] = MVM_nfg_crlf_grapheme(tc);
534
598
}
535
536
/* Takes a string and sets it up as a decode stream separator. */
537
void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec,
538
14
                                                     MVMString **seps, MVMint32 num_seps) {
539
14
    MVMGraphemeIter gi;
540
14
    MVMint32 i, graph_length, graph_pos;
541
14
542
14
    if (num_seps > 0xFFF)
543
0
        MVM_exception_throw_adhoc(tc, "Too many line separators");
544
14
545
14
    MVM_free(sep_spec->sep_lengths);
546
14
    MVM_free(sep_spec->sep_graphemes);
547
14
548
14
    sep_spec->num_seps = num_seps;
549
14
    sep_spec->sep_lengths = MVM_malloc(num_seps * sizeof(MVMint32));
550
14
    graph_length = 0;
551
35
    for (i = 0; i < num_seps; i++) {
552
21
        MVMuint32 num_graphs = MVM_string_graphs(tc, seps[i]);
553
21
        if (num_graphs > 0xFFFF)
554
0
            MVM_exception_throw_adhoc(tc, "Line separator too long");
555
21
        sep_spec->sep_lengths[i] = num_graphs;
556
21
        graph_length += num_graphs;
557
21
    }
558
14
559
14
    sep_spec->sep_graphemes = MVM_malloc(graph_length * sizeof(MVMGrapheme32));
560
14
    graph_pos = 0;
561
35
    for (i = 0; i < num_seps; i++) {
562
21
        MVM_string_gi_init(tc, &gi, seps[i]);
563
48
        while (MVM_string_gi_has_more(tc, &gi))
564
27
            sep_spec->sep_graphemes[graph_pos++] = MVM_string_gi_get_grapheme(tc, &gi);
565
21
    }
566
14
}
567
568
/* Returns the maximum length of any separator, in graphemes. */
569
85
MVMint32 MVM_string_decode_stream_sep_max_chars(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
570
85
    MVMint32 i;
571
85
    MVMint32 max_length = 1;
572
250
    for (i = 0; i < sep_spec->num_seps; i++)
573
165
        if (sep_spec->sep_lengths[i] > max_length)
574
31
            max_length = sep_spec->sep_lengths[i];
575
85
    return max_length;
576
85
}
577
578
/* Cleans up memory associated with a stream separator set. */
579
52
void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
580
52
    MVM_free(sep_spec->sep_lengths);
581
52
    MVM_free(sep_spec->sep_graphemes);
582
52
}