/home/travis/build/MoarVM/MoarVM/src/strings/ascii.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* Decodes the specified number of bytes of ASCII into an NFG string, creating |
4 | | * a result of the specified type. The type must have the MVMString REPR. */ |
5 | 183k | MVMString * MVM_string_ascii_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii, size_t bytes) { |
6 | 183k | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
7 | 183k | size_t i, result_graphs; |
8 | 183k | |
9 | 183k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
10 | 183k | result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes); |
11 | 183k | |
12 | 183k | result_graphs = 0; |
13 | 2.48M | for (i = 0; i < bytes; i++) { |
14 | 2.30M | if (ascii[i] == '\r' && i + 1 < bytes && ascii[i + 1] == '\n') { |
15 | 0 | result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc); |
16 | 0 | i++; |
17 | 0 | } |
18 | 2.30M | else if (ascii[i] >= 0) { |
19 | 2.30M | result->body.storage.blob_32[result_graphs++] = ascii[i]; |
20 | 2.30M | } |
21 | 0 | else { |
22 | 0 | MVM_exception_throw_adhoc(tc, |
23 | 0 | "Will not decode invalid ASCII (code point > 127 found)"); |
24 | 0 | } |
25 | 2.30M | } |
26 | 183k | result->body.num_graphs = result_graphs; |
27 | 183k | |
28 | 183k | return result; |
29 | 183k | } |
30 | | |
31 | | /* Decodes a NULL-terminated ASCII string into an NFG string, creating |
32 | | * a result of the specified type. The type must have the MVMString REPR. */ |
33 | 66.2k | MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii) { |
34 | 66.2k | return MVM_string_ascii_decode(tc, result_type, ascii, strlen(ascii)); |
35 | 66.2k | } |
36 | | |
37 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
38 | | * buffers, or until a stopper is reached. */ |
39 | | MVMuint32 MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
40 | | const MVMint32 *stopper_chars, |
41 | 1 | MVMDecodeStreamSeparators *seps) { |
42 | 1 | MVMint32 count = 0, total = 0; |
43 | 1 | MVMint32 bufsize; |
44 | 1 | MVMGrapheme32 *buffer; |
45 | 1 | MVMDecodeStreamBytes *cur_bytes; |
46 | 1 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
47 | 1 | MVMint32 last_accept_pos, last_was_cr; |
48 | 1 | MVMuint32 reached_stopper; |
49 | 1 | |
50 | 1 | /* If there's no buffers, we're done. */ |
51 | 1 | if (!ds->bytes_head) |
52 | 0 | return 0; |
53 | 1 | last_accept_pos = ds->bytes_head_pos; |
54 | 1 | |
55 | 1 | /* If we're asked for zero chars, also done. */ |
56 | 1 | if (stopper_chars && *stopper_chars == 0) |
57 | 0 | return 1; |
58 | 1 | |
59 | 1 | bufsize = ds->result_size_guess; |
60 | 1 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
61 | 1 | |
62 | 1 | /* Decode each of the buffers. */ |
63 | 1 | cur_bytes = ds->bytes_head; |
64 | 1 | last_was_cr = 0; |
65 | 1 | reached_stopper = 0; |
66 | 3 | while (cur_bytes) { |
67 | 2 | /* Process this buffer. */ |
68 | 1 | MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
69 | 2 | MVMuint8 *bytes = (MVMuint8*)cur_bytes->bytes; |
70 | 11 | while (pos < cur_bytes->length) { |
71 | 9 | MVMCodepoint codepoint = bytes[pos++]; |
72 | 9 | MVMGrapheme32 graph; |
73 | 9 | if (codepoint > 127) |
74 | 0 | MVM_exception_throw_adhoc(tc, |
75 | 0 | "Will not decode invalid ASCII (code point > 127 found)"); |
76 | 9 | if (last_was_cr) { |
77 | 0 | if (codepoint == '\n') { |
78 | 0 | graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm)); |
79 | 0 | } |
80 | 0 | else { |
81 | 0 | graph = '\r'; |
82 | 0 | pos--; |
83 | 0 | } |
84 | 0 | last_was_cr = 0; |
85 | 0 | } |
86 | 9 | else if (codepoint == '\r') { |
87 | 0 | last_was_cr = 1; |
88 | 0 | continue; |
89 | 0 | } |
90 | 9 | else { |
91 | 9 | graph = codepoint; |
92 | 9 | } |
93 | 9 | if (count == bufsize) { |
94 | 0 | /* We filled the buffer. Attach this one to the buffers |
95 | 0 | * linked list, and continue with a new one. */ |
96 | 0 | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
97 | 0 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
98 | 0 | count = 0; |
99 | 0 | } |
100 | 9 | buffer[count++] = graph; |
101 | 9 | last_accept_bytes = cur_bytes; |
102 | 9 | last_accept_pos = pos; |
103 | 9 | total++; |
104 | 9 | if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) { |
105 | 0 | reached_stopper = 1; |
106 | 0 | goto done; |
107 | 0 | } |
108 | 9 | else if (stopper_chars && *stopper_chars == total) { |
109 | 0 | reached_stopper = 1; |
110 | 0 | goto done; |
111 | 0 | } |
112 | 9 | } |
113 | 2 | cur_bytes = cur_bytes->next; |
114 | 2 | } |
115 | 1 | done: |
116 | 1 | |
117 | 1 | /* Attach what we successfully parsed as a result buffer, and trim away |
118 | 1 | * what we chewed through. */ |
119 | 1 | if (count) { |
120 | 1 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
121 | 1 | } |
122 | 0 | else { |
123 | 0 | MVM_free(buffer); |
124 | 0 | } |
125 | 1 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
126 | 1 | |
127 | 1 | return reached_stopper; |
128 | 1 | } |
129 | | |
130 | | /* Encodes the specified substring to ASCII. Anything outside of ASCII range |
131 | | * will become replaced with the supplied replacement, or an exception will be |
132 | | * thrown if there isn't one. The result string is NULL terminated, but the |
133 | | * specified size is the non-null part. */ |
134 | 1.96k | char * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) { |
135 | 1.96k | /* ASCII is a single byte encoding, but \r\n is a 2-byte grapheme, so we |
136 | 1.96k | * may have to resize as we go. */ |
137 | 1.96k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
138 | 1.95k | MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length); |
139 | 1.96k | MVMuint8 *result; |
140 | 1.96k | size_t result_alloc; |
141 | 1.96k | MVMuint8 *repl_bytes = NULL; |
142 | 1.96k | MVMuint64 repl_length; |
143 | 1.96k | |
144 | 1.96k | /* must check start first since it's used in the length check */ |
145 | 1.96k | if (start < 0 || start > strgraphs) |
146 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
147 | 1.96k | if (length < -1 || start + lengthu > strgraphs) |
148 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
149 | 1.96k | |
150 | 1.96k | if (replacement) |
151 | 2 | repl_bytes = (MVMuint8 *) MVM_string_ascii_encode_substr(tc, replacement, |
152 | 2 | &repl_length, 0, -1, NULL, translate_newlines); |
153 | 1.96k | |
154 | 1.96k | result_alloc = lengthu; |
155 | 1.96k | result = MVM_malloc(result_alloc + 1); |
156 | 1.96k | if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) { |
157 | 0 | /* No encoding needed; directly copy. */ |
158 | 0 | memcpy(result, str->body.storage.blob_ascii, lengthu); |
159 | 0 | result[lengthu] = 0; |
160 | 0 | if (output_size) |
161 | 0 | *output_size = lengthu; |
162 | 0 | } |
163 | 1.96k | else { |
164 | 1.96k | MVMuint32 i = 0; |
165 | 1.96k | MVMCodepointIter ci; |
166 | 1.96k | MVM_string_ci_init(tc, &ci, str, translate_newlines, 0); |
167 | 33.6k | while (MVM_string_ci_has_more(tc, &ci)) { |
168 | 31.6k | MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci); |
169 | 31.6k | if (i == result_alloc) { |
170 | 2 | result_alloc += 8; |
171 | 2 | result = MVM_realloc(result, result_alloc + 1); |
172 | 2 | } |
173 | 31.6k | if (0 <= ord && ord <= 127) { |
174 | 31.6k | result[i++] = (MVMuint8)ord; |
175 | 31.6k | } |
176 | 5 | else if (replacement) { |
177 | 4 | if (repl_length >= result_alloc || i >= result_alloc - repl_length) { |
178 | 2 | result_alloc += repl_length; |
179 | 2 | result = MVM_realloc(result, result_alloc + 1); |
180 | 2 | } |
181 | 4 | memcpy(result + i, repl_bytes, repl_length); |
182 | 4 | i += repl_length; |
183 | 4 | } |
184 | 1 | else { |
185 | 1 | MVM_free(result); |
186 | 1 | MVM_free(repl_bytes); |
187 | 1 | MVM_exception_throw_adhoc(tc, |
188 | 1 | "Error encoding ASCII string: could not encode codepoint %d", |
189 | 1 | ord); |
190 | 1 | } |
191 | 31.6k | } |
192 | 1.96k | result[i] = 0; |
193 | 1.96k | if (output_size) |
194 | 23 | *output_size = i; |
195 | 1.96k | } |
196 | 1.96k | |
197 | 1.96k | if (repl_bytes) MVM_free(repl_bytes); |
198 | 1.96k | return (char *)result; |
199 | 1.96k | } |
200 | | |
201 | | /* Encodes the specified string to ASCII. */ |
202 | 1.95k | char * MVM_string_ascii_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint32 translate_newlines) { |
203 | 1.95k | return MVM_string_ascii_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines); |
204 | 1.95k | } |
205 | | |
206 | | /* Encodes the specified string to ASCII not returning length. */ |
207 | 0 | char * MVM_string_ascii_encode_any(MVMThreadContext *tc, MVMString *str) { |
208 | 0 | return MVM_string_ascii_encode(tc, str, NULL, 0); |
209 | 0 | } |