/home/travis/build/MoarVM/MoarVM/src/strings/decode_stream.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* Represents a bytes => chars decoding stream: input is queued as byte buffers and decoded output as char buffers. */ |
2 | | struct MVMDecodeStream { |
3 | | /* Head and tail of the linked list of input byte buffers. */ |
4 | | MVMDecodeStreamBytes *bytes_head; |
5 | | MVMDecodeStreamBytes *bytes_tail; |
6 | | |
7 | | /* Head and tail of the linked list of output char buffers. */ |
8 | | MVMDecodeStreamChars *chars_head; |
9 | | MVMDecodeStreamChars *chars_tail; |
10 | | |
11 | | /* Often, when reading lines or chunks, we'll fill up one char buffer |
12 | | * and then immediately take it. That results in a lot of allocating |
13 | | * and freeing of MVMDecodeStreamChars structures. Keeping a free one |
14 | | * avoids this. (There's not really a common steady state where we |
15 | | * have multiple free ones, so keeping more than one isn't worth the extra work.) */ |
16 | | MVMDecodeStreamChars *chars_reuse; |
17 | | |
18 | | /* The byte position (for tell). */ |
19 | | MVMint64 abs_byte_pos; |
20 | | |
21 | | /* How far we've eaten into the current head bytes buffer. */ |
22 | | MVMint32 bytes_head_pos; |
23 | | |
24 | | /* How far we've eaten into the current head char buffer. */ |
25 | | MVMint32 chars_head_pos; |
26 | | |
27 | | /* The encoding we're using. */ |
28 | | MVMint32 encoding; |
29 | | |
30 | | /* Suggestion for decoders of how many bytes to guess at when allocating |
31 | | * decoded result buffers. */ |
32 | | MVMint32 result_size_guess; |
33 | | |
34 | | /* Normalizer state, embedded by value rather than referenced by pointer. */ |
35 | | MVMNormalizer norm; |
36 | | |
37 | | /* Optional place for the decoder to keep any extra state it needs between |
38 | | * decode calls. Will be freed when the decode stream is destroyed. */ |
39 | | void *decoder_state; |
40 | | |
41 | | /* Stores a replacement which is used upon encountering undecodable characters. |
42 | | * Set to NULL if a replacement is not desired. */ |
43 | | MVMString *replacement; |
44 | | |
45 | | /* Currently stores only whether or not the decoder should decode strictly or |
46 | | * permissively. Set to 1 for permissive decoding; the default is strict. */ |
47 | | MVMuint32 config; |
48 | | }; |
49 | | |
50 | | /* A single bunch of bytes added to a decode stream, with a link to the next |
51 | | * one, if any. */ |
52 | | struct MVMDecodeStreamBytes { |
53 | | char *bytes; /* The bytes themselves. */ |
54 | | MVMint32 length; /* Presumably the number of bytes in `bytes` -- TODO confirm. */ |
55 | | MVMDecodeStreamBytes *next; /* The next bunch of bytes, if any. */ |
56 | | }; |
57 | | |
58 | | /* A bunch of characters already decoded, with a link to the next bunch. */ |
59 | | struct MVMDecodeStreamChars { |
60 | | MVMGrapheme32 *chars; /* The decoded characters, stored as MVMGrapheme32 values. */ |
61 | | MVMint32 length; /* Presumably the number of entries in `chars` -- TODO confirm. */ |
62 | | MVMDecodeStreamChars *next; /* The next bunch of decoded characters. */ |
63 | | }; |
64 | | |
65 | | /* For situations where we need to decode up to some separators, this data |
66 | | * structure holds the information about them. */ |
67 | | struct MVMDecodeStreamSeparators { |
68 | | /* The lengths of the separators, in graphemes. */ |
69 | | MVMint32 *sep_lengths; |
70 | | |
71 | | /* The graphemes themselves, in a single array (use sep_lengths to find |
72 | | * out how many there are in each separator). */ |
73 | | MVMGrapheme32 *sep_graphemes; |
74 | | |
75 | | /* The number of separators we have. */ |
76 | | MVMint32 num_seps; |
77 | | |
78 | | /* Cached maximum separator length, to save regular recalculation. */ |
79 | | MVMint32 max_sep_length; |
80 | | |
81 | | /* Cached final graphemes; MVM_string_decode_stream_maybe_sep scans entries 0..num_seps-1. */ |
82 | | MVMGrapheme32 *final_graphemes; |
83 | | |
84 | | /* Since separators are most often control chars, we can quickly filter |
85 | | * out many graphemes without a separator search by keeping around the |
86 | | * maximum codepoint/synthetic index of any final grapheme and doing a |
87 | | * quick comparison. */ |
88 | | MVMGrapheme32 max_final_grapheme; |
89 | | }; |
90 | | |
91 | | /* Checks if we may have encountered one of the separators. This just looks to |
92 | | * see if we hit the final grapheme of any of the separators, which is all we |
93 | | * demand the actual encodings themselves work out (multi-grapheme separators |
94 | | * are handled in the decode stream logic itself). */ |
95 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_decode_stream_maybe_sep(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, MVMGrapheme32 g) { |
96 | 0 | if (sep_spec && g <= sep_spec->max_final_grapheme) { |
97 | 0 | MVMint32 i; |
98 | 0 | for (i = 0; i < sep_spec->num_seps; i++) |
99 | 0 | if (sep_spec->final_graphemes[i] == g) |
100 | 0 | return 1; |
101 | 0 | } |
102 | 0 | return 0; |
103 | 0 | } |
104 | | /* Decode stream and separator API. Only prototypes are given here; the function bodies are defined elsewhere. Note that two name prefixes are in use: MVM_string_decodestream_* and MVM_string_decode_stream_*. */ |
105 | | MVMDecodeStream * MVM_string_decodestream_create(MVMThreadContext *tc, MVMint32 encoding, MVMint64 abs_byte_pos, MVMint32 translate_newlines); |
106 | | void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds, char *bytes, MVMint32 length); |
107 | | void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length); |
108 | | void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos); |
109 | | MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars, MVMint64 eof); |
110 | | MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *seps, MVMint32 chomp); |
111 | | MVMString * MVM_string_decodestream_get_until_sep_eof(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec, MVMint32 chomp); |
112 | | MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds); |
113 | | MVMString * MVM_string_decodestream_get_available(MVMThreadContext *tc, MVMDecodeStream *ds); |
114 | | MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes); |
115 | | MVMint64 MVM_string_decodestream_bytes_available(MVMThreadContext *tc, const MVMDecodeStream *ds); |
116 | | MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes); |
117 | | MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds); |
118 | | MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds); |
119 | | void MVM_string_decodestream_destroy(MVMThreadContext *tc, MVMDecodeStream *ds); |
120 | | void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec); |
121 | | void MVM_string_decode_stream_sep_from_strings(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, MVMString **seps, MVMint32 num_seps); |
122 | | void MVM_string_decode_stream_sep_destroy(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec); |