/home/travis/build/MoarVM/MoarVM/src/strings/latin1.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* Decodes the specified number of bytes of latin1 into an NFG string, |
4 | | * creating a result of the specified type. The type must have the MVMString |
5 | | * REPR. */ |
6 | | MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type, |
7 | 1.17M | char *latin1_c, size_t bytes) { |
8 | 1.17M | MVMuint8 *latin1 = (MVMuint8 *)latin1_c; |
9 | 1.17M | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
10 | 1.17M | size_t i, k, result_graphs; |
11 | 1.17M | |
12 | 1.17M | MVMuint8 writing_32bit = 0; |
13 | 1.17M | |
14 | 1.17M | result->body.storage_type = MVM_STRING_GRAPHEME_8; |
15 | 1.17M | result->body.storage.blob_8 = MVM_malloc(sizeof(MVMint8) * bytes); |
16 | 1.17M | |
17 | 1.17M | result_graphs = 0; |
18 | 11.1M | for (i = 0; i < bytes; i++) { |
19 | 9.95M | if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') { |
20 | 0 | if (writing_32bit) |
21 | 0 | result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc); |
22 | 0 | else |
23 | 0 | result->body.storage.blob_8[result_graphs++] = MVM_nfg_crlf_grapheme(tc); |
24 | 0 | i++; |
25 | 0 | } |
26 | 9.95M | else { |
27 | 9.95M | if (latin1[i] > 127 && !writing_32bit) { |
28 | 2.56k | MVMGrapheme8 *old_storage = result->body.storage.blob_8; |
29 | 2.56k | |
30 | 2.56k | result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes); |
31 | 2.56k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
32 | 2.56k | writing_32bit = 1; |
33 | 2.56k | |
34 | 11.5k | for (k = 0; k < i; k++) |
35 | 9.01k | result->body.storage.blob_32[k] = old_storage[k]; |
36 | 2.56k | MVM_free(old_storage); |
37 | 2.56k | } |
38 | 9.95M | if (writing_32bit) |
39 | 4.51k | result->body.storage.blob_32[result_graphs++] = latin1[i]; |
40 | 9.95M | else |
41 | 9.95M | result->body.storage.blob_8[result_graphs++] = latin1[i]; |
42 | 9.95M | } |
43 | 9.95M | } |
44 | 1.17M | result->body.num_graphs = result_graphs; |
45 | 1.17M | |
46 | 1.17M | return result; |
47 | 1.17M | } |
48 | | |
49 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
50 | | * buffers, or until a stopper is reached. */ |
51 | | MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
52 | | const MVMint32 *stopper_chars, |
53 | 1 | MVMDecodeStreamSeparators *seps) { |
54 | 1 | MVMint32 count = 0, total = 0; |
55 | 1 | MVMint32 bufsize; |
56 | 1 | MVMGrapheme32 *buffer; |
57 | 1 | MVMDecodeStreamBytes *cur_bytes; |
58 | 1 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
59 | 1 | MVMint32 last_accept_pos, last_was_cr; |
60 | 1 | MVMuint32 reached_stopper; |
61 | 1 | |
62 | 1 | /* If there's no buffers, we're done. */ |
63 | 1 | if (!ds->bytes_head) |
64 | 0 | return 0; |
65 | 1 | last_accept_pos = ds->bytes_head_pos; |
66 | 1 | |
67 | 1 | /* If we're asked for zero chars, also done. */ |
68 | 1 | if (stopper_chars && *stopper_chars == 0) |
69 | 0 | return 1; |
70 | 1 | |
71 | 1 | bufsize = ds->result_size_guess; |
72 | 1 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
73 | 1 | |
74 | 1 | /* Decode each of the buffers. */ |
75 | 1 | cur_bytes = ds->bytes_head; |
76 | 1 | last_was_cr = 0; |
77 | 1 | reached_stopper = 0; |
78 | 2 | while (cur_bytes) { |
79 | 1 | /* Process this buffer. */ |
80 | 1 | MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
81 | 1 | unsigned char *bytes = (unsigned char *)cur_bytes->bytes; |
82 | 3 | while (pos < cur_bytes->length) { |
83 | 2 | MVMCodepoint codepoint = bytes[pos++]; |
84 | 2 | MVMGrapheme32 graph; |
85 | 2 | if (last_was_cr) { |
86 | 0 | if (codepoint == '\n') { |
87 | 0 | graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm)); |
88 | 0 | } |
89 | 0 | else { |
90 | 0 | graph = '\r'; |
91 | 0 | pos--; |
92 | 0 | } |
93 | 0 | last_was_cr = 0; |
94 | 0 | } |
95 | 2 | else if (codepoint == '\r') { |
96 | 0 | last_was_cr = 1; |
97 | 0 | continue; |
98 | 0 | } |
99 | 2 | else { |
100 | 2 | graph = codepoint; |
101 | 2 | } |
102 | 2 | if (count == bufsize) { |
103 | 0 | /* We filled the buffer. Attach this one to the buffers |
104 | 0 | * linked list, and continue with a new one. */ |
105 | 0 | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
106 | 0 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
107 | 0 | count = 0; |
108 | 0 | } |
109 | 2 | buffer[count++] = graph; |
110 | 2 | last_accept_bytes = cur_bytes; |
111 | 2 | last_accept_pos = pos; |
112 | 2 | total++; |
113 | 2 | if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) { |
114 | 0 | reached_stopper = 1; |
115 | 0 | goto done; |
116 | 0 | } |
117 | 2 | else if (stopper_chars && *stopper_chars == total) { |
118 | 0 | reached_stopper = 1; |
119 | 0 | goto done; |
120 | 0 | } |
121 | 2 | } |
122 | 1 | cur_bytes = cur_bytes->next; |
123 | 1 | } |
124 | 1 | done: |
125 | 1 | |
126 | 1 | /* Attach what we successfully parsed as a result buffer, and trim away |
127 | 1 | * what we chewed through. */ |
128 | 1 | if (count) { |
129 | 1 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
130 | 1 | } |
131 | 0 | else { |
132 | 0 | MVM_free(buffer); |
133 | 0 | } |
134 | 1 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
135 | 1 | |
136 | 1 | return reached_stopper; |
137 | 1 | } |
138 | | |
139 | | /* Encodes the specified substring to latin-1. Anything outside of latin-1 range |
140 | | * will become a ?. The result string is NULL terminated, but the specified |
141 | | * size is the non-null part. */ |
142 | | char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, |
143 | 33.9k | MVMString *replacement, MVMint32 translate_newlines) { |
144 | 33.9k | /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we |
145 | 33.9k | * may have to resize as we go. */ |
146 | 33.9k | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
147 | 33.9k | MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length); |
148 | 33.9k | MVMuint8 *result; |
149 | 33.9k | size_t result_alloc; |
150 | 33.9k | MVMuint8 *repl_bytes = NULL; |
151 | 33.9k | MVMuint64 repl_length; |
152 | 33.9k | |
153 | 33.9k | /* must check start first since it's used in the length check */ |
154 | 33.9k | if (start < 0 || start > strgraphs) |
155 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
156 | 33.9k | if (length < -1 || start + lengthu > strgraphs) |
157 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
158 | 33.9k | |
159 | 33.9k | if (replacement) |
160 | 2 | repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc, |
161 | 2 | replacement, &repl_length, 0, -1, NULL, translate_newlines); |
162 | 33.9k | |
163 | 33.9k | result_alloc = lengthu; |
164 | 33.9k | result = MVM_malloc(result_alloc + 1); |
165 | 33.9k | if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) { |
166 | 0 | /* No encoding needed; directly copy. */ |
167 | 0 | memcpy(result, str->body.storage.blob_ascii, lengthu); |
168 | 0 | result[lengthu] = 0; |
169 | 0 | if (output_size) |
170 | 0 | *output_size = lengthu; |
171 | 0 | } |
172 | 33.9k | else { |
173 | 33.9k | MVMuint32 i = 0; |
174 | 33.9k | MVMCodepointIter ci; |
175 | 33.9k | MVM_string_ci_init(tc, &ci, str, translate_newlines, 0); |
176 | 412k | while (MVM_string_ci_has_more(tc, &ci)) { |
177 | 378k | MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci); |
178 | 378k | if (i == result_alloc) { |
179 | 2 | result_alloc += 8; |
180 | 2 | result = MVM_realloc(result, result_alloc + 1); |
181 | 2 | } |
182 | 378k | if (ord >= 0 && ord <= 255) { |
183 | 378k | result[i] = (MVMuint8)ord; |
184 | 378k | i++; |
185 | 378k | } |
186 | 4 | else if (replacement) { |
187 | 4 | if (repl_length >= result_alloc || i >= result_alloc - repl_length) { |
188 | 2 | result_alloc += repl_length; |
189 | 2 | result = MVM_realloc(result, result_alloc + 1); |
190 | 2 | } |
191 | 4 | memcpy(result + i, repl_bytes, repl_length); |
192 | 4 | i += repl_length; |
193 | 4 | } |
194 | 0 | else { |
195 | 0 | MVM_free(result); |
196 | 0 | MVM_free(repl_bytes); |
197 | 0 | MVM_exception_throw_adhoc(tc, |
198 | 0 | "Error encoding Latin-1 string: could not encode codepoint %d", |
199 | 0 | ord); |
200 | 0 | } |
201 | 378k | } |
202 | 33.9k | result[i] = 0; |
203 | 33.9k | if (output_size) |
204 | 33.9k | *output_size = i; |
205 | 33.9k | } |
206 | 33.9k | MVM_free(repl_bytes); |
207 | 33.9k | return (char *)result; |
208 | 33.9k | } |
209 | | |
210 | | /* Encodes the specified string to latin-1. Anything outside of latin-1 range |
211 | | * will become a ?. The result string is NULL terminated, but the specified |
212 | | * size is the non-null part. */ |
213 | | char * MVM_string_latin1_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, |
214 | 33.9k | MVMint32 translate_newlines) { |
215 | 33.9k | return MVM_string_latin1_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines); |
216 | 33.9k | } |