/home/travis/build/MoarVM/MoarVM/src/strings/ops.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "platform/memmem.h" |
2 | | #include "platform/memmem32.h" |
3 | | #include "moar.h" |
4 | | #define MVM_DEBUG_STRANDS 0 |
5 | 9.29k | #define MVM_string_KMP_max_pattern_length 8192 |
6 | | /* Max value possible for MVMuint32 MVMStringBody.num_graphs */ |
7 | 1.39M | #define MAX_GRAPHEMES 0xFFFFFFFFLL |
8 | | |
9 | | #if MVM_DEBUG_STRANDS |
10 | | static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) { |
11 | | MVMGraphemeIter gi; |
12 | | MVMuint32 len; |
13 | | MVM_string_gi_init(tc, &gi, s); |
14 | | len = 0; |
15 | | while (MVM_string_gi_has_more(tc, &gi)) { |
16 | | MVM_string_gi_get_grapheme(tc, &gi); |
17 | | len++; |
18 | | } |
19 | | if (len != MVM_string_graphs(tc, s)) |
20 | | MVM_exception_throw_adhoc(tc, |
21 | | "Strand sanity check failed (strand length %d != num_graphs %d)", |
22 | | len, MVM_string_graphs(tc, s)); |
23 | | } |
24 | | #define STRAND_CHECK(tc, s) check_strand_sanity(tc, s); |
25 | | #else |
26 | | #define STRAND_CHECK(tc, s) |
27 | | #endif |
28 | | |
29 | | static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in); |
30 | | #if MVM_DEBUG_NFG |
31 | | static char * NFG_check_make_debug_string (MVMThreadContext *tc, MVMGrapheme32 g) { |
32 | | char *result = NULL; |
33 | | char *picked = NULL; |
34 | | if (g == '\r') |
35 | | picked = "\\r"; |
36 | | else if (g == '\n') |
37 | | picked = "\\n"; |
38 | | else if (g == MVM_nfg_crlf_grapheme(tc)) |
39 | | picked = "\\r\\n"; |
40 | | else if (0 <= g && !MVM_string_is_control_full(tc, g)) |
41 | | result = MVM_string_utf8_encode_C_string(tc, MVM_string_chr(tc, g)); |
42 | | else if (g < 0) { |
43 | | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
44 | | char *format_str = " with num_codes = "; |
45 | | char *format_str2 = " first, second cp = "; |
46 | | char *synthtype_str = synth->is_utf8_c8 ? "utf8-8 Synthetic" : "Normal Synthetic"; |
47 | | int this_len = strlen(format_str) + strlen(synthtype_str) + 6 + strlen(format_str2) + 11 + 1 + 11 + 1; |
48 | | result = MVM_malloc(this_len); |
49 | | if (2 <= synth->num_codes) |
50 | | sprintf(result, "%s%s%5i%s%.10"PRIi32",%.10"PRIi32"", synthtype_str, format_str, synth->num_codes, format_str2, synth->codes[0], synth->codes[1]); |
51 | | else |
52 | | sprintf(result, "WARNING synth has less than 2 codes"); |
53 | | fprintf(stderr, "synth numcodes %i %"PRIi32"\n", |
54 | | MVM_nfg_get_synthetic_info(tc, synth->codes[1])->num_codes, MVM_nfg_get_synthetic_info(tc, synth->codes[1])->codes[0] |
55 | | ); |
56 | | } |
57 | | else |
58 | | picked = "[Control]"; |
59 | | if (picked) { |
60 | | result = MVM_malloc(sizeof(char) * (strlen(picked) + 1)); |
61 | | strcpy(result, picked); |
62 | | } |
63 | | if (!result) { |
64 | | result = MVM_malloc(sizeof(char) * 1); |
65 | | result[0] = 0; |
66 | | } |
67 | | return result; |
68 | | } |
69 | | static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname); |
70 | | void NFG_check (MVMThreadContext *tc, MVMString *orig, char *varname) { |
71 | | char *out = NFG_checker(tc, orig, varname); |
72 | | char *waste[2] = { out, NULL }; |
73 | | if (!out) |
74 | | return; |
75 | | MVM_exception_throw_adhoc_free(tc, waste, "%s", out); |
76 | | } |
77 | | static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname) { |
78 | | MVMString *renorm = NULL; |
79 | | MVMStringIndex orig_graphs = MVM_string_graphs(tc, orig), |
80 | | renorm_graphs = -1; |
81 | | MVMROOT2(tc, orig, renorm, { |
82 | | renorm = re_nfg(tc, orig); |
83 | | renorm_graphs = MVM_string_graphs(tc, renorm); |
84 | | }); |
85 | | if (MVM_DEBUG_NFG_STRICT || orig_graphs != renorm_graphs) { |
86 | | MVMGraphemeIter orig_gi, renorm_gi; |
87 | | MVMint64 index = 0; |
88 | | MVM_string_gi_init(tc, &orig_gi, orig); |
89 | | MVM_string_gi_init(tc, &renorm_gi, renorm); |
90 | | while (MVM_string_gi_has_more(tc, &orig_gi) && MVM_string_gi_has_more(tc, &renorm_gi)) { |
91 | | MVMGrapheme32 orig_g = MVM_string_gi_get_grapheme(tc, &orig_gi), |
92 | | renorm_g = MVM_string_gi_get_grapheme(tc, &renorm_gi); |
93 | | if (orig_g != renorm_g) { |
94 | | char *orig_render = NFG_check_make_debug_string(tc, orig_g), |
95 | | *renorm_render = NFG_check_make_debug_string(tc, renorm_g); |
96 | | char *format = "NFG failure. Got different grapheme count of %s. " |
97 | | "Got %i but after re_nfg got %i\n" |
98 | | "Differing grapheme at index %"PRIi64"\n" |
99 | | "orig: %"PRIi32" (%s) after re_nfg: %"PRIi32" (%s)\n"; |
100 | | int out_size = strlen(orig_render) + strlen(renorm_render) |
101 | | + strlen(varname) + strlen(format) + (5 * 7) + 1; |
102 | | char *out = MVM_malloc(sizeof(char) * out_size); |
103 | | char *waste[] = {orig_render, renorm_render, NULL}; |
104 | | char **w = waste; |
105 | | snprintf(out, out_size, |
106 | | format, |
107 | | varname, |
108 | | orig_graphs, renorm_graphs, |
109 | | index, |
110 | | orig_g, orig_render, renorm_g, renorm_render); |
111 | | MVM_free(orig_render); |
112 | | MVM_free(renorm_render); |
113 | | return out; |
114 | | } |
115 | | index++; |
116 | | } |
117 | | } |
118 | | return NULL; |
119 | | } |
120 | | void NFG_check_concat (MVMThreadContext *tc, MVMString *result, MVMString *a, MVMString *b, char *varname) { |
121 | | char *a_out = NFG_checker(tc, a, "string ‘a’"); |
122 | | char *b_out = NFG_checker(tc, b, "string ‘b’"); |
123 | | char *out = NFG_checker(tc, result, varname); |
124 | | char *strings[] = { a_out, b_out, out }; |
125 | | char *names[] = { "\nconcat string ‘a’: ", "\nconcat string ‘b’: ", "\nconcat result: " }; |
126 | | int i = 0, elems = 4; |
127 | | int rtrn = 0; |
128 | | char * empty = ""; |
129 | | if (!a_out && !b_out && !out) |
130 | | return; |
131 | | else { |
132 | | MVMGrapheme32 last_a = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1), |
133 | | first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0); |
134 | | char *debug_a = NFG_check_make_debug_string(tc, last_a), |
135 | | *debug_b = NFG_check_make_debug_string(tc, first_b), |
136 | | *escaped_a = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, a)), |
137 | | *escaped_b = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, b)), |
138 | | *escaped_result = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, result)); |
139 | | char *waste[] = { out, debug_a, debug_b, escaped_a, escaped_b, escaped_result, NULL }; |
140 | | MVM_exception_throw_adhoc_free(tc, waste, |
141 | | "In concat: a graphs: %"PRIi32" b graphs: %"PRIi32"\n" |
142 | | "last_a: %"PRIi32" (%s) first_b %"PRIi32" (%s)\n" |
143 | | "a: “%s”\n" |
144 | | "b: “%s”\n" |
145 | | "result: “%s”\n" |
146 | | "%s%s%s%s%s%s", |
147 | | MVM_string_graphs(tc, a), MVM_string_graphs(tc, b), |
148 | | last_a, debug_a, first_b, debug_b, |
149 | | escaped_a, |
150 | | escaped_b, |
151 | | escaped_result, |
152 | | (a_out?names[0]:""), (a_out?a_out:""), |
153 | | (b_out?names[1]:""), (b_out?b_out:""), |
154 | | (out?names[2]:""), (out?out:"")); |
155 | | } |
156 | | |
157 | | |
158 | | } |
159 | | #endif |
160 | | |
161 | | MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic); |
162 | | static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset); |
163 | | |
164 | | /* Allocates strand storage. */ |
165 | 1.70M | static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) { |
166 | 1.70M | return MVM_malloc(num_strands * sizeof(MVMStringStrand)); |
167 | 1.70M | } |
168 | | |
169 | | /* Copies strands from one strand string to another. */ |
170 | | static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset, |
171 | 1.36M | MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) { |
172 | 1.36M | assert(from->body.storage_type == MVM_STRING_STRAND); |
173 | 1.36M | assert(to->body.storage_type == MVM_STRING_STRAND); |
174 | 1.36M | memcpy( |
175 | 1.36M | to->body.storage.strands + to_offset, |
176 | 1.36M | from->body.storage.strands + from_offset, |
177 | 1.36M | num_strands * sizeof(MVMStringStrand)); |
178 | 1.36M | } |
179 | | /* Move strands inside the same strand string. */ |
180 | | static void move_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset, |
181 | 26 | MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) { |
182 | 26 | assert(from->body.storage_type == MVM_STRING_STRAND); |
183 | 26 | assert(to->body.storage_type == MVM_STRING_STRAND); |
184 | 26 | memmove( |
185 | 26 | to->body.storage.strands + to_offset, |
186 | 26 | from->body.storage.strands + from_offset, |
187 | 26 | num_strands * sizeof(MVMStringStrand)); |
188 | 26 | } |
189 | | |
190 | 156k | #define can_fit_into_8bit(g) ((-128 <= (g) && (g) <= 127)) |
191 | | |
192 | 0 | MVM_STATIC_INLINE int can_fit_into_ascii (MVMGrapheme32 g) { |
193 | 0 | return 0 <= g && g <= 127; |
194 | 0 | } |
195 | | /* If a string is currently using 32bit storage, turn it into using |
196 | | * 8 bit storage. Doesn't do any checks at all. */ |
197 | 561 | static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) { |
198 | 561 | MVMGrapheme32 *old_buf = str->body.storage.blob_32; |
199 | 561 | MVMStringIndex i; |
200 | 561 | MVMGrapheme8 *dest_buf = NULL; |
201 | 561 | MVMStringIndex num_graphs = MVM_string_graphs_nocheck(tc, str); |
202 | 561 | str->body.storage_type = MVM_STRING_GRAPHEME_8; |
203 | 561 | dest_buf = str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8)); |
204 | 561 | MVM_VECTORIZE_LOOP |
205 | 6.79k | for (i = 0; i < num_graphs; i++) { |
206 | 6.22k | dest_buf[i] = old_buf[i]; |
207 | 6.22k | } |
208 | 561 | |
209 | 561 | MVM_free(old_buf); |
210 | 561 | } |
211 | | /* Checks if the next num_graphs graphemes in the iterator can fit into 8 bits. |
212 | | * This was written to take advantage of SIMD vectorization, so we use a multiple |
213 | | * bitwise operations to check, and biwise OR it with val. Care must be taken |
214 | | * to not use any variables altered by the loop outside of the loop and to not |
215 | | * have any branching or funcion calls. `i` is not used outside the loop |
216 | | * `val` is allowed as biwise OR works with the vectorization well. |
217 | | * NOTE: GraphemeIter is not modified by this function. */ |
218 | 6.64k | static int string_can_be_8bit(MVMThreadContext *tc, MVMGraphemeIter *gi_orig, MVMStringIndex num_graphs) { |
219 | 6.64k | MVMStringIndex pos = 0; |
220 | 6.64k | MVMGraphemeIter gi; |
221 | 6.64k | memcpy(&gi, gi_orig, sizeof(MVMGraphemeIter)); |
222 | 28.2k | while (1) { |
223 | 28.2k | MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, &gi); |
224 | 28.2k | MVMStringIndex togo = num_graphs - pos < strand_len |
225 | 2.96k | ? num_graphs - pos |
226 | 25.2k | : strand_len; |
227 | 28.2k | if (MVM_string_gi_blob_type(tc, &gi) == MVM_STRING_GRAPHEME_32) { |
228 | 3.61k | if (!MVM_string_buf32_can_fit_into_8bit(MVM_string_gi_active_blob_32_pos(tc, &gi), togo)) |
229 | 5 | return 0; |
230 | 3.61k | } |
231 | 28.2k | pos += togo; |
232 | 28.2k | if (num_graphs == pos || !MVM_string_gi_has_more_strands_rep(tc, &gi)) { |
233 | 6.64k | break; |
234 | 6.64k | } |
235 | 21.5k | MVM_string_gi_next_strand_rep(tc, &gi); |
236 | 21.5k | } |
237 | 6.64k | return 1; |
238 | 6.64k | |
239 | 6.64k | } |
240 | | /* Accepts an allocated string that should have body.num_graphs set but the blob |
241 | | * unallocated. This function will allocate the space for the blob and iterate |
242 | | * the supplied grapheme iterator for the length of body.num_graphs. Very fast |
243 | | * since compilers will convert them to SIMD vector operations. */ |
244 | 6.64k | static void iterate_gi_into_string(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *result, MVMString *orig, MVMStringIndex num) { |
245 | 6.64k | int result_pos = 0; |
246 | 6.64k | MVMGrapheme8 *result8 = NULL; |
247 | 6.64k | MVMGrapheme32 *result32 = NULL; |
248 | 6.64k | MVMStringIndex result_graphs = MVM_string_graphs_nocheck(tc, result); |
249 | 6.64k | if (!result_graphs) |
250 | 0 | return; |
251 | 6.64k | |
252 | 6.64k | if (string_can_be_8bit(tc, gi, result_graphs)) { |
253 | 6.64k | MVMStringIndex result_pos = 0; |
254 | 6.64k | result->body.storage_type = MVM_STRING_GRAPHEME_8; |
255 | 6.64k | result8 = result->body.storage.blob_8 = |
256 | 6.64k | MVM_malloc(result_graphs * sizeof(MVMGrapheme8)); |
257 | 8.21k | while (1) { |
258 | 8.21k | MVMStringIndex strand_len = |
259 | 8.21k | MVM_string_gi_graphs_left_in_strand(tc, gi); |
260 | 8.21k | MVMStringIndex to_copy = result_graphs - result_pos < strand_len |
261 | 2.96k | ? result_graphs - result_pos |
262 | 5.25k | : strand_len; |
263 | 8.21k | MVMGrapheme8 *result_blob8 = result8 + result_pos; |
264 | 8.21k | switch (MVM_string_gi_blob_type(tc, gi)) { |
265 | 3.60k | case MVM_STRING_GRAPHEME_32: { |
266 | 3.60k | MVMStringIndex i; |
267 | 3.60k | MVMGrapheme32 *active_blob = |
268 | 3.60k | MVM_string_gi_active_blob_32_pos(tc, gi); |
269 | 3.60k | MVM_VECTORIZE_LOOP |
270 | 7.92k | for (i = 0; i < to_copy; i++) { |
271 | 4.31k | result_blob8[i] = active_blob[i]; |
272 | 4.31k | } |
273 | 3.60k | break; |
274 | 3.60k | } |
275 | 4.61k | case MVM_STRING_GRAPHEME_8: |
276 | 4.61k | case MVM_STRING_GRAPHEME_ASCII: { |
277 | 4.61k | memcpy( |
278 | 4.61k | result_blob8, |
279 | 4.61k | MVM_string_gi_active_blob_8_pos(tc, gi), |
280 | 4.61k | to_copy * sizeof(MVMGrapheme8) |
281 | 4.61k | ); |
282 | 4.61k | break; |
283 | 4.61k | } |
284 | 0 | default: |
285 | 0 | MVM_exception_throw_adhoc(tc, |
286 | 0 | "Internal error, string corruption in iterate_gi_into_string\n"); |
287 | 8.21k | } |
288 | 8.21k | result_pos += to_copy; |
289 | 8.21k | if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) { |
290 | 6.64k | break; |
291 | 6.64k | } |
292 | 1.57k | MVM_string_gi_next_strand_rep(tc, gi); |
293 | 1.57k | } |
294 | 6.64k | } |
295 | 5 | else { |
296 | 5 | MVMStringIndex result_pos = 0; |
297 | 5 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
298 | 5 | result32 = result->body.storage.blob_32 = |
299 | 5 | result_graphs ? MVM_malloc(result_graphs * sizeof(MVMGrapheme32)) : NULL; |
300 | 20.0k | while (1) { |
301 | 20.0k | MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, gi); |
302 | 20.0k | MVMStringIndex to_copy = result_graphs - result_pos < strand_len |
303 | 0 | ? result_graphs - result_pos |
304 | 20.0k | : strand_len; |
305 | 20.0k | switch (MVM_string_gi_blob_type(tc, gi)) { |
306 | 20.0k | case MVM_STRING_GRAPHEME_8: |
307 | 20.0k | case MVM_STRING_GRAPHEME_ASCII: { |
308 | 20.0k | MVMGrapheme8 *active_blob = |
309 | 20.0k | MVM_string_gi_active_blob_8_pos(tc, gi); |
310 | 20.0k | MVMGrapheme32 *result_blob32 = result32 + result_pos; |
311 | 20.0k | MVMStringIndex i; |
312 | 20.0k | MVM_VECTORIZE_LOOP |
313 | 540k | for (i = 0; i < to_copy; i++) { |
314 | 520k | result_blob32[i] = active_blob[i]; |
315 | 520k | } |
316 | 20.0k | break; |
317 | 20.0k | } |
318 | 5 | case MVM_STRING_GRAPHEME_32: { |
319 | 5 | memcpy( |
320 | 5 | result32 + result_pos, |
321 | 5 | MVM_string_gi_active_blob_32_pos(tc, gi), |
322 | 5 | to_copy * sizeof(MVMGrapheme32) |
323 | 5 | ); |
324 | 5 | break; |
325 | 20.0k | } |
326 | 0 | default: |
327 | 0 | MVM_exception_throw_adhoc(tc, |
328 | 0 | "Internal error, string corruption in iterate_gi_into_string\n"); |
329 | 20.0k | } |
330 | 20.0k | result_pos += to_copy; |
331 | 20.0k | if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) { |
332 | 5 | break; |
333 | 5 | } |
334 | 20.0k | MVM_string_gi_next_strand_rep(tc, gi); |
335 | 20.0k | } |
336 | 5 | } |
337 | 6.64k | } |
338 | | #define copy_strands_memcpy(BLOB_TYPE, SIZEOF_TYPE, STORAGE_TYPE) { \ |
339 | | result->body.storage.BLOB_TYPE = MVM_malloc(sizeof(SIZEOF_TYPE) * MVM_string_graphs_nocheck(tc, orig)); \ |
340 | | for (i = 0; i < orig->body.num_strands; i++) { \ |
341 | | size_t graphs_this_strand = orig->body.storage.strands[i].end - orig->body.storage.strands[i].start; \ |
342 | | /* If it's 8bit format and there's only one grapheme */ \ |
343 | | if ((STORAGE_TYPE == MVM_STRING_GRAPHEME_ASCII || STORAGE_TYPE == MVM_STRING_GRAPHEME_8) && graphs_this_strand == 1) { \ |
344 | | /* If there are not repetitions we can directly set the grapheme */ \ |
345 | | if (!orig->body.storage.strands[i].repetitions) \ |
346 | | result->body.storage.BLOB_TYPE[graphs_so_far] = orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start]; \ |
347 | | /* Otherwise, use memset for the correct number of repetitions */ \ |
348 | | else { \ |
349 | | graphs_this_strand += orig->body.storage.strands[i].repetitions; \ |
350 | | memset(graphs_so_far + result->body.storage.BLOB_TYPE, \ |
351 | | orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start], \ |
352 | | graphs_this_strand \ |
353 | | ); \ |
354 | | } \ |
355 | | graphs_so_far += graphs_this_strand; \ |
356 | | } \ |
357 | | else { \ |
358 | | int j = 0; \ |
359 | | for (; j <= orig->body.storage.strands[i].repetitions; j++) { \ |
360 | | memcpy(graphs_so_far + result->body.storage.BLOB_TYPE, \ |
361 | | orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE + orig->body.storage.strands[i].start, \ |
362 | | sizeof(SIZEOF_TYPE) * graphs_this_strand \ |
363 | | ); \ |
364 | | graphs_so_far += graphs_this_strand; \ |
365 | | } \ |
366 | | } \ |
367 | | } \ |
368 | | } |
369 | | /* Collapses a bunch of strands into a single blob string. */ |
370 | 22.0k | static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) { |
371 | 22.0k | MVMString *result = NULL; |
372 | 22.0k | size_t graphs_so_far = 0; |
373 | 22.0k | |
374 | 22.0k | /* If it's not a strand, just return it */ |
375 | 22.0k | if (orig->body.storage_type != MVM_STRING_STRAND) |
376 | 0 | return orig; |
377 | 22.0k | /* If the original string is a STRAND and all the composite strands are |
378 | 22.0k | * of the same type, then we will collapse it using memcpy instead of |
379 | 22.0k | * using a grapheme iterator. */ |
380 | 22.0k | else { |
381 | 22.0k | size_t i; |
382 | 22.0k | MVMint32 common_storage_type = orig->body.storage.strands[0].blob_string->body.storage_type; |
383 | 22.0k | MVMROOT(tc, orig, { |
384 | 22.0k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
385 | 22.0k | result->body.num_graphs = MVM_string_graphs(tc, orig); |
386 | 22.0k | for (i = 1; i < orig->body.num_strands; i++) { |
387 | 22.0k | if (common_storage_type != orig->body.storage.strands[i].blob_string->body.storage_type) { |
388 | 22.0k | common_storage_type = -1; |
389 | 22.0k | break; |
390 | 22.0k | } |
391 | 22.0k | } |
392 | 22.0k | result->body.storage_type = common_storage_type; |
393 | 22.0k | switch (common_storage_type) { |
394 | 22.0k | case MVM_STRING_GRAPHEME_32: |
395 | 22.0k | copy_strands_memcpy(blob_32, MVMGrapheme32, MVM_STRING_GRAPHEME_32); |
396 | 22.0k | break; |
397 | 22.0k | case MVM_STRING_GRAPHEME_ASCII: |
398 | 22.0k | case MVM_STRING_GRAPHEME_8: |
399 | 22.0k | copy_strands_memcpy(blob_8, MVMGrapheme8, MVM_STRING_GRAPHEME_8); |
400 | 22.0k | break; |
401 | 22.0k | default: { |
402 | 22.0k | MVMGraphemeIter gi; |
403 | 22.0k | MVM_string_gi_init(tc, &gi, orig); |
404 | 22.0k | iterate_gi_into_string(tc, &gi, result, orig, 0); |
405 | 22.0k | } |
406 | 22.0k | } |
407 | 22.0k | }); |
408 | 22.0k | } |
409 | 22.0k | #if (MVM_DEBUG_STRANDS || MVM_DEBUG_NFG) |
410 | | if (!MVM_string_equal(tc, result, orig)) |
411 | | MVM_exception_throw_adhoc(tc, "result and original were not eq in collapse_strands"); |
412 | | #endif |
413 | 22.0k | return result; |
414 | 22.0k | } |
415 | | |
416 | | /* Takes a string that is no longer in NFG form after some concatenation-style |
417 | | * operation, and returns a new string that is in NFG. Note that we could do a |
418 | | * much, much, smarter thing in the future that doesn't involve all of this |
419 | | * copying and allocation and re-doing the whole string, but cases like this |
420 | | * should be fairly rare anyway, so go with simplicity for now. */ |
421 | 13 | static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) { |
422 | 13 | MVMNormalizer norm; |
423 | 13 | MVMCodepointIter ci; |
424 | 13 | MVMint32 ready; |
425 | 13 | MVMString *out = NULL; |
426 | 13 | MVMuint32 bufsize = in->body.num_graphs; |
427 | 13 | |
428 | 13 | /* Create the output buffer. We used to believe it can't ever be bigger |
429 | 13 | * than the initial estimate, but utf8-c8 showed us otherwise. */ |
430 | 13 | MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
431 | 13 | MVMint64 out_pos = 0; |
432 | 13 | /* Iterate codepoints and normalizer. */ |
433 | 13 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
434 | 13 | /* Codepoint iterator that passes back utf8-c8 graphemes unchanged */ |
435 | 13 | MVM_string_ci_init(tc, &ci, in, 0, 1); |
436 | 107 | while (MVM_string_ci_has_more(tc, &ci)) { |
437 | 94 | MVMGrapheme32 g; |
438 | 94 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g); |
439 | 94 | if (ready) { |
440 | 65 | if (out_pos + ready > bufsize) { |
441 | 0 | /* Doubling up the buffer size seems excessive, so just |
442 | 0 | * add a generous amount of storage */ |
443 | 0 | bufsize += ready + 32; |
444 | 0 | out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32)); |
445 | 0 | } |
446 | 65 | out_buffer[out_pos++] = g; |
447 | 77 | while (--ready > 0) { |
448 | 12 | out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
449 | 12 | } |
450 | 65 | } |
451 | 94 | } |
452 | 13 | MVM_unicode_normalizer_eof(tc, &norm); |
453 | 13 | ready = MVM_unicode_normalizer_available(tc, &norm); |
454 | 13 | if (out_pos + ready > bufsize) { |
455 | 0 | bufsize += ready + 1; |
456 | 0 | out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32)); |
457 | 0 | } |
458 | 18 | while (ready--) { |
459 | 5 | out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
460 | 5 | } |
461 | 13 | MVM_unicode_normalizer_cleanup(tc, &norm); |
462 | 13 | |
463 | 13 | /* Build result string. */ |
464 | 13 | out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
465 | 13 | out->body.storage.blob_32 = out_buffer; |
466 | 13 | out->body.storage_type = MVM_STRING_GRAPHEME_32; |
467 | 13 | out->body.num_graphs = out_pos; |
468 | 13 | return out; |
469 | 13 | } |
470 | | |
471 | | /* Returns nonzero if two substrings are equal, doesn't check bounds */ |
472 | | MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a, |
473 | 16.9M | MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) { |
474 | 16.9M | MVMint64 i; |
475 | 16.9M | |
476 | 16.9M | /* Fast paths when storage types are identical. */ |
477 | 16.9M | switch (a->body.storage_type) { |
478 | 1.68M | case MVM_STRING_GRAPHEME_32: |
479 | 1.68M | if (b->body.storage_type == MVM_STRING_GRAPHEME_32) |
480 | 34.9k | return 0 == memcmp( |
481 | 34.9k | a->body.storage.blob_32 + starta, |
482 | 34.9k | b->body.storage.blob_32 + startb, |
483 | 34.9k | length * sizeof(MVMGrapheme32)); |
484 | 1.65M | break; |
485 | 15.0M | case MVM_STRING_GRAPHEME_ASCII: |
486 | 15.0M | case MVM_STRING_GRAPHEME_8: |
487 | 15.0M | if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII || |
488 | 15.0M | b->body.storage_type == MVM_STRING_GRAPHEME_8) |
489 | 14.8M | return 0 == memcmp( |
490 | 14.8M | a->body.storage.blob_8 + starta, |
491 | 14.8M | b->body.storage.blob_8 + startb, |
492 | 14.8M | length); |
493 | 179k | break; |
494 | 16.9M | } |
495 | 16.9M | |
496 | 16.9M | /* If both are flat, use MVM_string_get_grapheme_at_nocheck on both for speed */ |
497 | 2.07M | if (a->body.storage_type != MVM_STRING_STRAND && b->body.storage_type != MVM_STRING_STRAND) { |
498 | 1.39M | for (i = 0; i < length; i++) |
499 | 1.09M | if (MVM_string_get_grapheme_at_nocheck(tc, a, starta + i) |
500 | 1.09M | != MVM_string_get_grapheme_at_nocheck(tc, b, startb + i)) |
501 | 74.7k | return 0; |
502 | 302k | return 1; |
503 | 377k | } |
504 | 1.69M | else if (a->body.storage_type == MVM_STRING_STRAND && b->body.storage_type == MVM_STRING_STRAND) { |
505 | 77.5k | MVMGraphemeIter gia, gib; |
506 | 77.5k | /* Normal path, for the rest of the time. */ |
507 | 77.5k | MVM_string_gi_init(tc, &gia, a); |
508 | 77.5k | MVM_string_gi_init(tc, &gib, b); |
509 | 77.5k | /* Move the grapheme iterator if start is not 0 */ |
510 | 77.5k | if (starta) MVM_string_gi_move_to(tc, &gia, starta); |
511 | 77.5k | if (startb) MVM_string_gi_move_to(tc, &gib, startb); |
512 | 3.55M | for (i = 0; i < length; i++) |
513 | 3.47M | if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib)) |
514 | 121 | return 0; |
515 | 77.4k | return 1; |
516 | 77.5k | } |
517 | 1.61M | else { |
518 | 1.61M | MVMGraphemeIter gi_y; |
519 | 1.61M | MVMString *y = NULL, *z = NULL; |
520 | 1.61M | MVMint64 starty, startz; |
521 | 1.61M | if (a->body.storage_type == MVM_STRING_STRAND) { |
522 | 165k | y = a; z = b; |
523 | 165k | starty = starta; startz = startb; |
524 | 165k | } |
525 | 1.45M | else { |
526 | 1.45M | y = b; z = a; |
527 | 1.45M | starty = startb; startz = starta; |
528 | 1.45M | } |
529 | 1.61M | MVM_string_gi_init(tc, &gi_y, y); |
530 | 1.61M | if (starty) MVM_string_gi_move_to(tc, &gi_y, starty); |
531 | 5.72M | for (i = 0; i < length; i++) |
532 | 4.47M | if (MVM_string_gi_get_grapheme(tc, &gi_y) != MVM_string_get_grapheme_at_nocheck(tc, z, startz + i)) |
533 | 372k | return 0; |
534 | 1.24M | return 1; |
535 | 1.61M | } |
536 | 2.07M | } |
537 | 183k | static MVMint64 MVM_string_memmem_grapheme32 (MVMThreadContext *tc, MVMGrapheme32 *H_blob32, MVMGrapheme32 *n_blob32, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) { |
538 | 183k | MVMGrapheme32 * rtrn = memmem_uint32(H_blob32 + H_start, H_graphs - H_start, n_blob32, n_graphs); |
539 | 130k | return rtrn == NULL ? -1 : rtrn - H_blob32; |
540 | 183k | } |
541 | 22.3k | static MVMint64 MVM_string_memmem_grapheme32str (MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) { |
542 | 22.3k | MVMGrapheme32 *needle_buf = NULL; |
543 | 22.3k | MVMint64 rtrn; |
544 | 22.3k | if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) { |
545 | 4.09k | MVMStringIndex i; |
546 | 4.09k | MVMGraphemeIter n_gi; |
547 | 4.09k | needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme32)); |
548 | 4.09k | if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) MVM_string_gi_init(tc, &n_gi, needle); |
549 | 10.7k | for (i = 0; i < needle->body.num_graphs; i++) { |
550 | 6.60k | needle_buf[i] = needle->body.storage_type == MVM_STRING_GRAPHEME_8 ? needle->body.storage.blob_8[i] : MVM_string_gi_get_grapheme(tc, &n_gi); |
551 | 6.60k | } |
552 | 4.09k | } |
553 | 18.2k | rtrn = MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, needle_buf ? needle_buf : needle->body.storage.blob_32, H_start, H_graphs, n_graphs); |
554 | 22.3k | if (needle_buf) MVM_free(needle_buf); |
555 | 22.3k | return rtrn; |
556 | 22.3k | } |
557 | | /* Returns the location of one string in another or -1 */ |
558 | 500k | MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) { |
559 | 500k | size_t index = (size_t)start; |
560 | 500k | MVMStringIndex H_graphs, n_graphs; |
561 | 500k | MVM_string_check_arg(tc, Haystack, "index search target"); |
562 | 500k | MVM_string_check_arg(tc, needle, "index search term"); |
563 | 500k | H_graphs = MVM_string_graphs_nocheck(tc, Haystack); |
564 | 500k | n_graphs = MVM_string_graphs_nocheck(tc, needle); |
565 | 500k | |
566 | 500k | if (!n_graphs) |
567 | 52 | return start <= H_graphs ? start : -1; /* the empty string is in any other string */ |
568 | 500k | |
569 | 500k | if (!H_graphs) |
570 | 41 | return -1; |
571 | 500k | |
572 | 500k | if (start < 0 || H_graphs <= start) |
573 | 148 | return -1; |
574 | 500k | |
575 | 499k | if (H_graphs < n_graphs || n_graphs < 1) |
576 | 3 | return -1; |
577 | 499k | |
578 | 499k | /* Fast paths when storage types are identical. Uses memmem function, which |
579 | 499k | * uses Knuth-Morris-Pratt algorithm on Linux and on others |
580 | 499k | * Crochemore+Perrin two-way string matching */ |
581 | 499k | switch (Haystack->body.storage_type) { |
582 | 22.3k | case MVM_STRING_GRAPHEME_32: |
583 | 22.3k | if (needle->body.storage_type == MVM_STRING_GRAPHEME_32 || needle->body.num_graphs < 100) { |
584 | 22.3k | return MVM_string_memmem_grapheme32str(tc, Haystack, needle, start, H_graphs, n_graphs); |
585 | 22.3k | } |
586 | 0 | break; |
587 | 438k | case MVM_STRING_GRAPHEME_8: |
588 | 438k | if (needle->body.storage_type == MVM_STRING_GRAPHEME_8 || needle->body.num_graphs < 100) { |
589 | 438k | void *mm_return_8 = NULL; |
590 | 438k | MVMGrapheme8 *needle_buf = NULL; |
591 | 438k | if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) { |
592 | 9.01k | MVMStringIndex i; |
593 | 9.01k | MVMGraphemeIter n_gi; |
594 | 9.01k | needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme8)); |
595 | 9.01k | if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) MVM_string_gi_init(tc, &n_gi, needle); |
596 | 18.0k | for (i = 0; i < needle->body.num_graphs; i++) { |
597 | 9.01k | MVMGrapheme32 g = needle->body.storage_type == MVM_STRING_GRAPHEME_32 |
598 | 0 | ? needle->body.storage.blob_32[i] |
599 | 9.01k | : MVM_string_gi_get_grapheme(tc, &n_gi); |
600 | 9.01k | /* Haystack is 8 bit, needle is 32 bit. if we encounter a non8bit grapheme |
601 | 9.01k | * it's impossible to match */ |
602 | 9.01k | if (!can_fit_into_8bit(g)) { |
603 | 0 | MVM_free(needle_buf); |
604 | 0 | return -1; |
605 | 0 | } |
606 | 9.01k | needle_buf[i] = g; |
607 | 9.01k | } |
608 | 9.01k | } |
609 | 438k | mm_return_8 = MVM_memmem( |
610 | 438k | Haystack->body.storage.blob_8 + start, /* start position */ |
611 | 438k | (H_graphs - start) * sizeof(MVMGrapheme8), /* length of Haystack from start position to end */ |
612 | 429k | needle_buf ? needle_buf : needle->body.storage.blob_8, /* needle start */ |
613 | 438k | n_graphs * sizeof(MVMGrapheme8) /* needle length */ |
614 | 438k | ); |
615 | 438k | if (needle_buf) MVM_free(needle_buf); |
616 | 438k | if (mm_return_8 == NULL) |
617 | 366k | return -1; |
618 | 438k | else |
619 | 71.9k | return (MVMGrapheme8*)mm_return_8 - Haystack->body.storage.blob_8; |
620 | 438k | } |
621 | 0 | break; |
622 | 499k | } |
623 | 499k | /* Minimal code version for needles of size 1 */ |
624 | 39.0k | if (n_graphs == 1) { |
625 | 29.7k | MVMGraphemeIter H_gi; |
626 | 29.7k | MVMGrapheme32 n_g = MVM_string_get_grapheme_at_nocheck(tc, needle, 0); |
627 | 29.7k | MVM_string_gi_init(tc, &H_gi, Haystack); |
628 | 29.7k | if (index) MVM_string_gi_move_to(tc, &H_gi, index); |
629 | 3.47M | while (index < H_graphs) { |
630 | 3.46M | if (n_g == MVM_string_gi_get_grapheme(tc, &H_gi)) |
631 | 11.9k | return (MVMint64)index; |
632 | 3.44M | index++; |
633 | 3.44M | } |
634 | 29.7k | } |
635 | 9.29k | else if (n_graphs <= MVM_string_KMP_max_pattern_length) |
636 | 9.29k | return knuth_morris_pratt_string_index(tc, needle, Haystack, start); |
637 | 0 | else { |
638 | 0 | int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0; |
639 | 0 | void *Hs_or_gic = Haystack; |
640 | 0 | /* If Haystack is a strand allocate space for a MVMGraphemeIter_cached |
641 | 0 | * and initialize it */ |
642 | 0 | if (is_gic) { |
643 | 0 | Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached)); |
644 | 0 | MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start); |
645 | 0 | } |
646 | 0 | /* For needles > MVM_string_KMP_max_pattern_length we must revert to brute force for now. |
647 | 0 | * Eventually we can implement brute force after it matches the whole needle OR |
648 | 0 | * allocate more space for the pattern on reaching the end of the pattern */ |
649 | 0 | while (index <= H_graphs - n_graphs) { |
650 | 0 | if (string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle, index, H_graphs, n_graphs, 0, 0, is_gic) != -1) |
651 | 0 | return (MVMint64)index; |
652 | 0 | index++; |
653 | 0 | } |
654 | 0 | } |
655 | 17.7k | return -1; |
656 | 39.0k | } |
657 | | |
658 | | /* Returns the location of one string in another or -1 */ |
659 | 16 | MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) { |
660 | 16 | MVMint64 result = -1; |
661 | 16 | size_t index; |
662 | 16 | MVMStringIndex H_graphs, n_graphs; |
663 | 16 | |
664 | 16 | MVM_string_check_arg(tc, Haystack, "rindex search target"); |
665 | 16 | MVM_string_check_arg(tc, needle, "rindex search term"); |
666 | 16 | H_graphs = MVM_string_graphs_nocheck(tc, Haystack); |
667 | 16 | n_graphs = MVM_string_graphs_nocheck(tc, needle); |
668 | 16 | |
669 | 16 | if (!n_graphs) { |
670 | 8 | if (0 <= start) |
671 | 7 | return start <= H_graphs ? start : -1; /* the empty string is in any other string */ |
672 | 8 | else |
673 | 1 | return H_graphs; /* no start, so return end */ |
674 | 8 | } |
675 | 16 | |
676 | 8 | if (!H_graphs) |
677 | 0 | return -1; |
678 | 8 | |
679 | 8 | if (H_graphs < n_graphs || n_graphs < 1) |
680 | 0 | return -1; |
681 | 8 | |
682 | 8 | if (start == -1) |
683 | 2 | start = H_graphs - n_graphs; |
684 | 8 | |
685 | 8 | if (start < 0 || H_graphs <= start) |
686 | 8 | /* maybe return -1 instead? */ |
687 | 0 | MVM_exception_throw_adhoc(tc, "index start offset out of range"); |
688 | 8 | |
689 | 8 | index = start; |
690 | 8 | |
691 | 8 | if (H_graphs < index + n_graphs) { |
692 | 1 | index = H_graphs - n_graphs; |
693 | 1 | } |
694 | 8 | |
695 | 8 | /* brute force for now. horrible, yes. halp. */ |
696 | 19 | do { |
697 | 19 | if (MVM_string_substrings_equal_nocheck(tc, needle, 0, n_graphs, Haystack, index)) { |
698 | 6 | result = (MVMint64)index; |
699 | 6 | break; |
700 | 6 | } |
701 | 13 | } while (0 < index--); |
702 | 8 | return result; |
703 | 8 | } |
704 | | |
705 | | /* Returns a substring of the given string */ |
706 | 343k | MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) { |
707 | 343k | MVMString *result; |
708 | 343k | MVMint64 start_pos, end_pos; |
709 | 343k | |
710 | 343k | MVMint64 agraphs; |
711 | 343k | |
712 | 343k | MVM_string_check_arg(tc, a, "substring"); |
713 | 343k | /* convert to signed to avoid implicit arithmetic conversions */ |
714 | 343k | agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a); |
715 | 343k | |
716 | 343k | /* -1 signifies go to the end of the string; anything less is a bug */ |
717 | 343k | if (length < -1) |
718 | 0 | MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length); |
719 | 343k | |
720 | 343k | /* negative offsets count from the end */ |
721 | 343k | start_pos = offset < 0 ? offset + agraphs : offset; |
722 | 303k | end_pos = length == -1 ? agraphs : start_pos + length; |
723 | 343k | |
724 | 343k | /* return an empty string if start_pos is out of bounds but positive */ |
725 | 343k | if (agraphs < start_pos) { |
726 | 0 | start_pos = 0; |
727 | 0 | end_pos = 0; |
728 | 0 | } |
729 | 343k | |
730 | 343k | if (end_pos < 0) |
731 | 0 | MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos); |
732 | 343k | |
733 | 343k | /* Ensure we're within bounds. */ |
734 | 343k | if (start_pos < 0) |
735 | 0 | start_pos = 0; |
736 | 343k | if (agraphs < end_pos) |
737 | 841 | end_pos = agraphs; |
738 | 343k | |
739 | 343k | /* Check trivial cases: empty string and whole string. */ |
740 | 343k | if (start_pos == end_pos) |
741 | 11.6k | return tc->instance->str_consts.empty; |
742 | 331k | if (start_pos == 0 && end_pos == agraphs) |
743 | 19.6k | return a; |
744 | 331k | |
745 | 331k | /* Construct a result; how we efficiently do so will vary based on the |
746 | 331k | * input string. */ |
747 | 312k | MVMROOT(tc, a, { |
748 | 312k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
749 | 312k | result->body.num_graphs = end_pos - start_pos; |
750 | 312k | if (a->body.storage_type != MVM_STRING_STRAND) { |
751 | 312k | /* It's some kind of buffer. Construct a strand view into it. */ |
752 | 312k | result->body.storage_type = MVM_STRING_STRAND; |
753 | 312k | result->body.storage.strands = allocate_strands(tc, 1); |
754 | 312k | result->body.num_strands = 1; |
755 | 312k | result->body.storage.strands[0].blob_string = a; |
756 | 312k | result->body.storage.strands[0].start = start_pos; |
757 | 312k | result->body.storage.strands[0].end = end_pos; |
758 | 312k | result->body.storage.strands[0].repetitions = 0; |
759 | 312k | } |
760 | 312k | else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) { |
761 | 312k | /* Single strand string; quite possibly already a substring. We'll |
762 | 312k | * just produce an updated view. */ |
763 | 312k | MVMStringStrand *orig_strand = &(a->body.storage.strands[0]); |
764 | 312k | result->body.storage_type = MVM_STRING_STRAND; |
765 | 312k | result->body.storage.strands = allocate_strands(tc, 1); |
766 | 312k | result->body.num_strands = 1; |
767 | 312k | result->body.storage.strands[0].blob_string = orig_strand->blob_string; |
768 | 312k | result->body.storage.strands[0].start = orig_strand->start + start_pos; |
769 | 312k | result->body.storage.strands[0].end = orig_strand->start + end_pos; |
770 | 312k | result->body.storage.strands[0].repetitions = 0; |
771 | 312k | } |
772 | 312k | else { |
773 | 312k | /* Produce a new blob string, collapsing the strands. */ |
774 | 312k | MVMGraphemeIter gi; |
775 | 312k | MVM_string_gi_init(tc, &gi, a); |
776 | 312k | MVM_string_gi_move_to(tc, &gi, start_pos); |
777 | 312k | iterate_gi_into_string(tc, &gi, result, a, start_pos); |
778 | 312k | } |
779 | 312k | }); |
780 | 312k | |
781 | 312k | STRAND_CHECK(tc, result); |
782 | 312k | return result; |
783 | 331k | } |
784 | | |
785 | 1 | MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) { |
786 | 1 | /* XXX this could probably be done more efficiently directly. */ |
787 | 1 | MVMString *first_part = NULL; |
788 | 1 | MVMString *rest_part = NULL; |
789 | 1 | MVMString *result = NULL; |
790 | 1 | |
791 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement); |
792 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&original); |
793 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part); |
794 | 1 | first_part = MVM_string_substring(tc, original, 0, start); |
795 | 1 | |
796 | 1 | rest_part = MVM_string_substring(tc, original, start + count, -1); |
797 | 1 | rest_part = MVM_string_concatenate(tc, replacement, rest_part); |
798 | 1 | result = MVM_string_concatenate(tc, first_part, rest_part); |
799 | 1 | |
800 | 1 | |
801 | 1 | STRAND_CHECK(tc, result); |
802 | 1 | NFG_CHECK(tc, result, "MVM_string_replace"); |
803 | 1 | MVM_gc_root_temp_pop_n(tc, 3); |
804 | 1 | return result; |
805 | 1 | } |
806 | | |
807 | 144 | static MVMString * string_from_strand_at_index(MVMThreadContext *tc, MVMString *a, MVMuint16 index) { |
808 | 144 | MVMStringStrand *ss = &(a->body.storage.strands[index]); |
809 | 144 | return MVM_string_substring(tc, ss->blob_string, ss->start, ss->end - ss->start); |
810 | 144 | } |
811 | | |
812 | 1.39M | static MVMuint16 final_strand_match_with_repetition_count(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
813 | 1.39M | if (a->body.storage_type == MVM_STRING_STRAND) { |
814 | 1.29M | MVMStringStrand *sa = &(a->body.storage.strands[a->body.num_strands - 1]); |
815 | 1.29M | /* If the final strand of a eq b, we'll just increment the final strand of a's repetitions. */ |
816 | 1.29M | if (sa->end - sa->start == MVM_string_graphs_nocheck(tc, b)) { |
817 | 1.23M | if (MVM_string_equal_at(tc, sa->blob_string, b, sa->start)) |
818 | 1.20M | return 1; |
819 | 1.23M | } |
820 | 1.29M | /* If the final strand of a eq the first (and only) strand of b, we'll just add b's repetitions |
821 | 1.29M | * (plus 1 for the strand itself) to the final strand of a's repetitions. */ |
822 | 62.4k | else if (b->body.storage_type == MVM_STRING_STRAND && b->body.num_strands == 1) { |
823 | 8.38k | MVMStringStrand *sb = &(b->body.storage.strands[0]); |
824 | 8.38k | if (sa->end - sa->start == sb->end - sb->start) |
825 | 72 | if (MVM_string_equal(tc, string_from_strand_at_index(tc, a, a->body.num_strands - 1), string_from_strand_at_index(tc, b, 0))) |
826 | 0 | return b->body.storage.strands[0].repetitions + 1; |
827 | 8.38k | } |
828 | 1.29M | } |
829 | 196k | return 0; |
830 | 1.39M | } |
831 | | |
832 | | /* Append one string to another. */ |
833 | 1.40M | MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
834 | 1.40M | MVMString *result = NULL, *renormalized_section = NULL; |
835 | 1.40M | int renormalized_section_graphs = 0, consumed_a = 0, consumed_b = 0; |
836 | 1.40M | MVMuint32 agraphs, bgraphs; |
837 | 1.40M | MVMuint64 total_graphs; |
838 | 1.40M | int lost_strands = 0; |
839 | 1.40M | int is_concat_stable = 0; |
840 | 1.40M | int index_ss_b; |
841 | 1.40M | MVMuint16 matching_repetition_count; |
842 | 1.40M | MVM_string_check_arg(tc, a, "concatenate"); |
843 | 1.40M | MVM_string_check_arg(tc, b, "concatenate"); |
844 | 1.40M | |
845 | 1.40M | /* Simple empty-string cases. */ |
846 | 1.40M | agraphs = MVM_string_graphs_nocheck(tc, a); |
847 | 1.40M | if (agraphs == 0) |
848 | 11.0k | return b; |
849 | 1.39M | bgraphs = MVM_string_graphs_nocheck(tc, b); |
850 | 1.39M | if (bgraphs == 0) |
851 | 356 | return a; |
852 | 1.39M | |
853 | 1.39M | is_concat_stable = MVM_nfg_is_concat_stable(tc, a, b); |
854 | 1.39M | |
855 | 1.39M | /* If is_concat_stable equals 0 and a and b are not repetitions. */ |
856 | 1.39M | if (is_concat_stable == 0 && !(a->body.storage_type == MVM_STRING_STRAND && a->body.storage.strands[a->body.num_strands - 1].repetitions) |
857 | 46 | && !(b->body.storage_type == MVM_STRING_STRAND && b->body.storage.strands[0].repetitions)) { |
858 | 46 | MVMCodepoint last_a_first_b[2] = { |
859 | 46 | MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1), |
860 | 46 | MVM_string_get_grapheme_at_nocheck(tc, b, 0) |
861 | 46 | }; |
862 | 46 | MVMROOT2(tc, a, b, { |
863 | 46 | /* If both are not synthetics, we can can pass those values unchanged |
864 | 46 | * instead of iterating by codepoint */ |
865 | 46 | if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) { |
866 | 46 | renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2); |
867 | 46 | consumed_a = 1; consumed_b = 1; |
868 | 46 | } |
869 | 46 | else { |
870 | 46 | MVMCodepointIter last_a_ci; |
871 | 46 | MVMCodepointIter first_b_ci; |
872 | 46 | MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci, last_a_first_b[0], 1); |
873 | 46 | MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1], 1); |
874 | 46 | /* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */ |
875 | 46 | MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint)); |
876 | 46 | MVMuint32 i = 0; |
877 | 46 | for (; MVM_string_grapheme_ci_has_more(tc, &last_a_ci); i++) { |
878 | 46 | last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &last_a_ci); |
879 | 46 | } |
880 | 46 | for (; MVM_string_grapheme_ci_has_more(tc, &first_b_ci); i++) { |
881 | 46 | last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &first_b_ci); |
882 | 46 | } |
883 | 46 | renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b_codes, a_codes + b_codes); |
884 | 46 | consumed_a = 1; consumed_b = 1; |
885 | 46 | } |
886 | 46 | }); |
887 | 46 | if (renormalized_section) { |
888 | 46 | if (agraphs == consumed_a && bgraphs == consumed_b) { |
889 | 17 | NFG_CHECK_CONCAT(tc, renormalized_section, a, b, "renormalized_section"); |
890 | 17 | return renormalized_section; |
891 | 17 | } |
892 | 29 | renormalized_section_graphs = MVM_string_graphs_nocheck(tc, renormalized_section); |
893 | 29 | } |
894 | 46 | } |
895 | 1.39M | /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */ |
896 | 1.39M | total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs; |
897 | 1.39M | if (MAX_GRAPHEMES < total_graphs) |
898 | 0 | MVM_exception_throw_adhoc(tc, |
899 | 0 | "Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %lld", |
900 | 0 | total_graphs, MAX_GRAPHEMES); |
901 | 1.39M | |
902 | 1.39M | /* Otherwise, we'll assemble a result string. */ |
903 | 1.39M | MVMROOT4(tc, a, b, renormalized_section, result, { |
904 | 1.39M | |
905 | 1.39M | /* Allocate it. */ |
906 | 1.39M | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
907 | 1.39M | |
908 | 1.39M | /* Total graphemes is trivial; just total up inputs. */ |
909 | 1.39M | result->body.num_graphs = (MVMuint32)total_graphs; |
910 | 1.39M | |
911 | 1.39M | /* Result string will be made of strands. */ |
912 | 1.39M | result->body.storage_type = MVM_STRING_STRAND; |
913 | 1.39M | |
914 | 1.39M | /* Detect the wonderful case where we're repeatedly concating the same |
915 | 1.39M | * string again and again, and thus can just bump a repetition. */ |
916 | 1.39M | if (is_concat_stable == 1 && (matching_repetition_count = final_strand_match_with_repetition_count(tc, a, b))) { |
917 | 1.39M | /* We have it; just copy the strands to a new string and bump the |
918 | 1.39M | * repetitions count of the last one. */ |
919 | 1.39M | result->body.storage.strands = allocate_strands(tc, a->body.num_strands); |
920 | 1.39M | copy_strands(tc, a, 0, result, 0, a->body.num_strands); |
921 | 1.39M | result->body.storage.strands[a->body.num_strands - 1].repetitions += matching_repetition_count; |
922 | 1.39M | result->body.num_strands = a->body.num_strands; |
923 | 1.39M | } |
924 | 1.39M | |
925 | 1.39M | /* Otherwise, construct a new strand string. */ |
926 | 1.39M | else { |
927 | 1.39M | /* See if we have too many strands between the two. If so, we will |
928 | 1.39M | * collapse the biggest side. */ |
929 | 1.39M | MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND |
930 | 1.39M | ? a->body.num_strands |
931 | 1.39M | : 1; |
932 | 1.39M | MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND |
933 | 1.39M | ? b->body.num_strands |
934 | 1.39M | : 1; |
935 | 1.39M | MVMString *effective_a = a; |
936 | 1.39M | MVMString *effective_b = b; |
937 | 1.39M | if (MVM_STRING_MAX_STRANDS < strands_a + strands_b) { |
938 | 1.39M | MVMROOT(tc, result, { |
939 | 1.39M | if (strands_b <= strands_a) { |
940 | 1.39M | effective_a = collapse_strands(tc, effective_a); |
941 | 1.39M | strands_a = 1; |
942 | 1.39M | } |
943 | 1.39M | else { |
944 | 1.39M | effective_b = collapse_strands(tc, effective_b); |
945 | 1.39M | strands_b = 1; |
946 | 1.39M | } |
947 | 1.39M | }); |
948 | 1.39M | } |
949 | 1.39M | /* Assemble the result. */ |
950 | 1.39M | result->body.num_strands = strands_a + strands_b + (renormalized_section_graphs ? 1 : 0); |
951 | 1.39M | result->body.storage.strands = allocate_strands(tc, result->body.num_strands); |
952 | 1.39M | /* START 1 */ |
953 | 1.39M | if (effective_a->body.storage_type == MVM_STRING_STRAND) { |
954 | 1.39M | copy_strands(tc, effective_a, 0, result, 0, strands_a); |
955 | 1.39M | } |
956 | 1.39M | else { |
957 | 1.39M | int index_ss_a = 0; |
958 | 1.39M | MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]); |
959 | 1.39M | ss_a->blob_string = effective_a; |
960 | 1.39M | ss_a->start = 0; |
961 | 1.39M | ss_a->end = effective_a->body.num_graphs; |
962 | 1.39M | ss_a->repetitions = 0; |
963 | 1.39M | } |
964 | 1.39M | if (renormalized_section) { |
965 | 1.39M | int index_ss_re; |
966 | 1.39M | int index_ss_a = strands_a - 1; |
967 | 1.39M | /* Tweak the end of the last strand of string a. Since if a is made up of multiple strands, we can't just refer to index 0 and instead erfer to strands_a - 1 */ |
968 | 1.39M | MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]); |
969 | 1.39M | MVMStringStrand *ss_re = NULL; |
970 | 1.39M | ss_a->end -= consumed_a; |
971 | 1.39M | /* If the strands ends up to be zero length we need to reduce the number of strand_index and also incease lost_strands so the next operation writes over it */ |
972 | 1.39M | if (ss_a->start == ss_a->end) |
973 | 1.39M | lost_strands++; |
974 | 1.39M | /* END 1 */ |
975 | 1.39M | /* START 1.5 (only triggered in some cases) */ |
976 | 1.39M | index_ss_re = strands_a - lost_strands; |
977 | 1.39M | ss_re = &(result->body.storage.strands[index_ss_re]); |
978 | 1.39M | |
979 | 1.39M | /* Add the renormalized section in as a strand */ |
980 | 1.39M | ss_re->blob_string = renormalized_section; |
981 | 1.39M | ss_re->start = 0; |
982 | 1.39M | ss_re->end = renormalized_section->body.num_graphs; |
983 | 1.39M | ss_re->repetitions = 0; |
984 | 1.39M | if (ss_re->start == ss_re->end) { |
985 | 1.39M | MVM_exception_throw_adhoc(tc, "Unexpected error in concatenation: renormalized_section is 0 graphemes.\n"); |
986 | 1.39M | /* renormalized_section should always be at least one grapheme |
987 | 1.39M | * in length so throw if it does not (zero length is an error |
988 | 1.39M | * we shouldn't lost_strands++ unlike the other strands */ |
989 | 1.39M | } |
990 | 1.39M | /* END 1.5 */ |
991 | 1.39M | } |
992 | 1.39M | /* START 2 */ |
993 | 1.39M | index_ss_b = strands_a - lost_strands + (renormalized_section_graphs ? 1 : 0 ); |
994 | 1.39M | if (effective_b->body.storage_type == MVM_STRING_STRAND) { |
995 | 1.39M | copy_strands(tc, effective_b, 0, result, index_ss_b, strands_b); |
996 | 1.39M | } |
997 | 1.39M | else { |
998 | 1.39M | MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]); |
999 | 1.39M | ss_b->blob_string = effective_b; |
1000 | 1.39M | ss_b->start = 0; |
1001 | 1.39M | ss_b->end = effective_b->body.num_graphs; |
1002 | 1.39M | ss_b->repetitions = 0; |
1003 | 1.39M | } |
1004 | 1.39M | if (renormalized_section_graphs) { |
1005 | 1.39M | /* Tweak the beginning of the first strand of string b */ |
1006 | 1.39M | MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]); |
1007 | 1.39M | ss_b->start += consumed_b; |
1008 | 1.39M | if (ss_b->start == ss_b->end) { |
1009 | 1.39M | lost_strands++; |
1010 | 1.39M | move_strands(tc, result, index_ss_b + 1, result, index_ss_b, strands_b - 1); |
1011 | 1.39M | } |
1012 | 1.39M | /* END 2 */ |
1013 | 1.39M | /* Adjust result->num_strands */ |
1014 | 1.39M | if (lost_strands) |
1015 | 1.39M | result->body.num_strands -= lost_strands; |
1016 | 1.39M | /* Adjust result->num_graphs */ |
1017 | 1.39M | result->body.num_graphs += renormalized_section_graphs - consumed_b - consumed_a; |
1018 | 1.39M | } |
1019 | 1.39M | } |
1020 | 1.39M | STRAND_CHECK(tc, result); |
1021 | 1.39M | if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section)) |
1022 | 1.39M | NFG_CHECK_CONCAT(tc, result, a, b, "'result'"); |
1023 | 1.39M | }); |
1024 | 1.39M | if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section)) |
1025 | 1.39M | return result; |
1026 | 1.39M | /* If it's regional indicator (is_concat_stable == 2) */ |
1027 | 8 | return re_nfg(tc, result); |
1028 | 1.39M | } |
1029 | | |
1030 | 445 | MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) { |
1031 | 445 | MVMString *result = NULL; |
1032 | 445 | MVMuint32 agraphs; |
1033 | 445 | MVMuint64 total_graphs; |
1034 | 445 | |
1035 | 445 | MVM_string_check_arg(tc, a, "repeat"); |
1036 | 445 | |
1037 | 445 | /* Validate count; handle common cases. */ |
1038 | 445 | if (count == 0) |
1039 | 9 | return tc->instance->str_consts.empty; |
1040 | 436 | if (count == 1) |
1041 | 22 | return a; |
1042 | 414 | if (count < 0) |
1043 | 0 | MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be negative", count); |
1044 | 414 | if (MAX_GRAPHEMES < count) |
1045 | 0 | MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be greater than max allowed number of graphemes %lld", count, MAX_GRAPHEMES); |
1046 | 414 | |
1047 | 414 | /* If input string is empty, repeating it is empty. */ |
1048 | 414 | agraphs = MVM_string_graphs_nocheck(tc, a); |
1049 | 414 | if (agraphs == 0) |
1050 | 0 | return tc->instance->str_consts.empty; |
1051 | 414 | |
1052 | 414 | /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */ |
1053 | 414 | total_graphs = (MVMuint64)agraphs * (MVMuint64)count; |
1054 | 414 | if (MAX_GRAPHEMES < total_graphs) |
1055 | 0 | MVM_exception_throw_adhoc(tc, |
1056 | 0 | "Can't repeat string, required number of graphemes (%"PRIu32" * %"PRIu64") greater than max allowed of %lld", |
1057 | 0 | agraphs, count, MAX_GRAPHEMES); |
1058 | 414 | |
1059 | 414 | /* Now build a result string with the repetition set. */ |
1060 | 414 | MVMROOT(tc, a, { |
1061 | 414 | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1062 | 414 | result->body.num_graphs = agraphs * count; |
1063 | 414 | result->body.storage_type = MVM_STRING_STRAND; |
1064 | 414 | result->body.storage.strands = allocate_strands(tc, 1); |
1065 | 414 | if (a->body.storage_type == MVM_STRING_STRAND) { |
1066 | 414 | if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) { |
1067 | 414 | copy_strands(tc, a, 0, result, 0, 1); |
1068 | 414 | } |
1069 | 414 | else { |
1070 | 414 | MVMROOT(tc, result, { |
1071 | 414 | a = collapse_strands(tc, a); |
1072 | 414 | }); |
1073 | 414 | result->body.storage.strands[0].blob_string = a; |
1074 | 414 | result->body.storage.strands[0].start = 0; |
1075 | 414 | result->body.storage.strands[0].end = agraphs; |
1076 | 414 | } |
1077 | 414 | } |
1078 | 414 | else { |
1079 | 414 | result->body.storage.strands[0].blob_string = a; |
1080 | 414 | result->body.storage.strands[0].start = 0; |
1081 | 414 | result->body.storage.strands[0].end = agraphs; |
1082 | 414 | } |
1083 | 414 | result->body.storage.strands[0].repetitions = count - 1; |
1084 | 414 | result->body.num_strands = 1; |
1085 | 414 | }); |
1086 | 414 | /* If string a is not stable under concatenation, we need to create a flat |
1087 | 414 | * string and ensure it is normalized */ |
1088 | 414 | if (!MVM_nfg_is_concat_stable(tc, a, a)) |
1089 | 0 | result = re_nfg(tc, result); |
1090 | 414 | STRAND_CHECK(tc, result); |
1091 | 414 | return result; |
1092 | 414 | } |
1093 | | |
1094 | 315 | void MVM_string_say(MVMThreadContext *tc, MVMString *a) { |
1095 | 315 | MVM_string_check_arg(tc, a, "say"); |
1096 | 315 | MVM_string_print(tc, MVM_string_concatenate(tc, a, |
1097 | 315 | tc->instance->str_consts.platform_newline)); |
1098 | 315 | } |
1099 | | |
1100 | 316 | void MVM_string_print(MVMThreadContext *tc, MVMString *a) { |
1101 | 316 | MVMOSHandle *handle = (MVMOSHandle *)tc->instance->stdout_handle; |
1102 | 316 | MVMuint64 encoded_size; |
1103 | 316 | char *encoded; |
1104 | 316 | MVM_string_check_arg(tc, a, "print"); |
1105 | 316 | encoded = MVM_string_utf8_encode(tc, a, &encoded_size, MVM_TRANSLATE_NEWLINE_OUTPUT); |
1106 | 316 | MVM_io_write_bytes_c(tc, tc->instance->stdout_handle, encoded, encoded_size); |
1107 | 316 | MVM_free(encoded); |
1108 | 316 | } |
1109 | | /* Meant to be pased in a MVMNormalizer of type MVM_NORMALIZE_NFD */ |
1110 | 2.77k | static MVMGrapheme32 ord_getbasechar (MVMThreadContext *tc, MVMGrapheme32 g) { |
1111 | 2.77k | /* If we get a synthetic, extract the base codepoint and call ord_getbasechar again */ |
1112 | 2.77k | if (g < 0) { |
1113 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
1114 | 0 | return ord_getbasechar(tc, synth->codes[synth->base_index]); |
1115 | 0 | } |
1116 | 2.77k | else { |
1117 | 2.77k | MVMGrapheme32 return_g; |
1118 | 2.77k | MVMint32 ready; |
1119 | 2.77k | MVMNormalizer norm; |
1120 | 2.77k | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
1121 | 2.77k | |
1122 | 2.77k | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &return_g); |
1123 | 2.77k | MVM_unicode_normalizer_eof(tc, &norm); |
1124 | 2.77k | if (!ready) |
1125 | 91 | return_g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
1126 | 2.77k | MVM_unicode_normalizer_cleanup(tc, &norm); |
1127 | 2.77k | return return_g; |
1128 | 2.77k | } |
1129 | 2.77k | } |
1130 | | /* Tests whether one string a has the other string b as a substring at that index */ |
1131 | 1.93M | MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) { |
1132 | 1.93M | |
1133 | 1.93M | MVMStringIndex agraphs, bgraphs; |
1134 | 1.93M | |
1135 | 1.93M | MVM_string_check_arg(tc, a, "equal_at"); |
1136 | 1.93M | MVM_string_check_arg(tc, b, "equal_at"); |
1137 | 1.93M | |
1138 | 1.93M | agraphs = MVM_string_graphs_nocheck(tc, a); |
1139 | 1.93M | bgraphs = MVM_string_graphs_nocheck(tc, b); |
1140 | 1.93M | |
1141 | 1.93M | if (offset < 0) { |
1142 | 3 | offset += agraphs; |
1143 | 3 | if (offset < 0) |
1144 | 1 | offset = 0; /* XXX I think this is the right behavior here */ |
1145 | 3 | } |
1146 | 1.93M | if (agraphs - offset < bgraphs) |
1147 | 9.07k | return 0; |
1148 | 1.92M | return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0); |
1149 | 1.93M | } |
1150 | | /* Ensure return value can hold numbers at least 3x higher than MVMStringIndex. |
1151 | | * Theoretically if the string has all ffi ligatures and 1/3 the max size of |
1152 | | * MVMStringIndex in length, we could have some weird results. */ |
1153 | | |
1154 | | /* ignoremark is 0 for normal operation and 1 for ignoring diacritics */ |
1155 | 2.63k | MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic) { |
1156 | 2.63k | MVMuint32 H_fc_cps; |
1157 | 2.63k | /* An additional needle offset which is used only when codepoints expand |
1158 | 2.63k | * when casefolded. The offset is the number of additional codepoints that |
1159 | 2.63k | * have been seen so Haystack and needle stay aligned */ |
1160 | 2.63k | MVMint64 n_offset = 0; |
1161 | 2.63k | MVMint64 i, j; |
1162 | 2.63k | MVMGrapheme32 H_g, n_g; |
1163 | 3.48k | for (i = 0; i + H_start < H_graphs && i + n_offset < n_fc_graphs; i++) { |
1164 | 2.94k | const MVMCodepoint* H_result_cps; |
1165 | 2.35k | H_g = is_gic ? MVM_string_gi_cached_get_grapheme(tc, Hs_or_gic, H_start + i) : MVM_string_get_grapheme_at_nocheck(tc, Hs_or_gic, H_start + i); |
1166 | 2.94k | if (!ignorecase) { |
1167 | 112 | H_fc_cps = 0; |
1168 | 112 | } |
1169 | 2.82k | else if (0 <= H_g) { |
1170 | 2.82k | /* For codeponits we can get the case change directly */ |
1171 | 2.82k | H_fc_cps = MVM_unicode_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, &H_result_cps); |
1172 | 2.82k | } |
1173 | 0 | else { |
1174 | 0 | /* Synthetics must use this function */ |
1175 | 0 | H_fc_cps = MVM_nfg_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &H_result_cps); |
1176 | 0 | } |
1177 | 2.94k | /* If we get 0 for the number that means the cp doesn't change when casefolded */ |
1178 | 2.94k | if (H_fc_cps == 0) { |
1179 | 2.59k | n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset); |
1180 | 2.59k | if (ignoremark) { |
1181 | 1.23k | H_g = ord_getbasechar(tc, H_g); |
1182 | 1.23k | n_g = ord_getbasechar(tc, n_g); |
1183 | 1.23k | } |
1184 | 2.59k | if (H_g != n_g) |
1185 | 2.02k | return -1; |
1186 | 2.59k | } |
1187 | 347 | else if (1 <= H_fc_cps) { |
1188 | 770 | for (j = 0; j < H_fc_cps; j++) { |
1189 | 488 | n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset); |
1190 | 488 | H_g = H_result_cps[j]; |
1191 | 488 | if (ignoremark) { |
1192 | 129 | H_g = ord_getbasechar(tc, H_g); |
1193 | 129 | n_g = ord_getbasechar(tc, n_g); |
1194 | 129 | } |
1195 | 488 | if (H_g != n_g) |
1196 | 65 | return -1; |
1197 | 423 | n_offset++; |
1198 | 423 | } |
1199 | 282 | n_offset--; |
1200 | 282 | } |
1201 | 2.94k | } |
1202 | 543 | return n_offset; |
1203 | 2.63k | /* We return -1 if the strings are not equal and 0 or more if they are equal |
1204 | 2.63k | * The return values from 0, 1 etc designate how many Haystack graphemes |
1205 | 2.63k | * were expanded. |
1206 | 2.63k | * This may seem like an odd arangement, but this extra information is needed |
1207 | 2.63k | * to determine the length of the Haystack which was traversed, as it can |
1208 | 2.63k | * differ from the length of the needle if there are expansions. */ |
1209 | 2.63k | } |
1210 | | /* Checks if needle exists at the offset, but ignores case. |
1211 | | * Sometimes there is a difference in length of a string before and after foldcase, |
1212 | | * because of this we must compare this differently than just foldcasing both |
1213 | | * strings to ensure the offset is correct */ |
1214 | 333 | static MVMint64 string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset, int ignoremark, int ignorecase) { |
1215 | 333 | /* Foldcase version of needle */ |
1216 | 333 | MVMString *needle_fc = NULL; |
1217 | 333 | MVMStringIndex H_graphs = MVM_string_graphs(tc, Haystack); |
1218 | 333 | MVMStringIndex n_graphs = MVM_string_graphs(tc, needle); |
1219 | 333 | MVMStringIndex n_fc_graphs; |
1220 | 333 | /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */ |
1221 | 333 | MVMint64 H_expansion; |
1222 | 333 | |
1223 | 333 | if (H_offset < 0) { |
1224 | 3 | H_offset += H_graphs; |
1225 | 3 | if (H_offset < 0) |
1226 | 1 | H_offset = 0; /* XXX I think this is the right behavior here */ |
1227 | 3 | } |
1228 | 333 | /* If the offset is greater or equal to the number of Haystack graphemes |
1229 | 333 | * return 0. Since size of graphemes could change under casefolding, we |
1230 | 333 | * can't assume too much. If optimizing this be careful */ |
1231 | 333 | if (H_graphs <= H_offset) |
1232 | 1 | return 0; |
1233 | 332 | MVMROOT(tc, Haystack, { |
1234 | 332 | needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle; |
1235 | 332 | }); |
1236 | 332 | n_fc_graphs = MVM_string_graphs(tc, needle_fc); |
1237 | 332 | if (Haystack->body.storage_type == MVM_STRING_STRAND) { |
1238 | 160 | MVMGraphemeIter_cached H_gic; |
1239 | 160 | MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset); |
1240 | 160 | H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, &H_gic, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 1); |
1241 | 160 | } |
1242 | 172 | else { |
1243 | 172 | H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 0); |
1244 | 172 | } |
1245 | 332 | if (0 <= H_expansion) |
1246 | 296 | return n_fc_graphs <= H_graphs + H_expansion - H_offset ? 1 : 0; |
1247 | 36 | return 0; |
1248 | 332 | } |
1249 | | /* Processes the pattern. The pattern must be able to store negative and positive |
1250 | | * numbers. It must be able to store at least 1/2 the length of the needle, |
1251 | | * though possibly more (though I am not sure it's possible for it to be more than |
1252 | | * 1/2). */ |
1253 | 9.29k | static void knuth_morris_pratt_process_pattern (MVMThreadContext *tc, MVMString *pat, MVMint16 *next, MVMStringIndex pat_graphs) { |
1254 | 9.29k | MVMint64 i = 0; |
1255 | 9.29k | MVMint64 j = next[0] = -1; |
1256 | 32.4k | while (i < pat_graphs) { |
1257 | 23.1k | if (j == -1 || MVM_string_get_grapheme_at_nocheck(tc, pat, i) |
1258 | 20.7k | == MVM_string_get_grapheme_at_nocheck(tc, pat, j)) { |
1259 | 20.7k | i++; j++; |
1260 | 20.7k | next[i] = (i < pat_graphs |
1261 | 11.4k | && MVM_string_get_grapheme_at_nocheck(tc, pat, j) |
1262 | 11.4k | == MVM_string_get_grapheme_at_nocheck(tc, pat, i)) |
1263 | 9.04k | ? next[j] |
1264 | 11.6k | : j; |
1265 | 20.7k | } |
1266 | 2.39k | else j = next[j]; |
1267 | 23.1k | } |
1268 | 9.29k | } |
1269 | | |
1270 | 9.29k | static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset) { |
1271 | 9.29k | MVMint64 needle_offset = 0; |
1272 | 9.29k | MVMint64 text_offset = H_offset; |
1273 | 9.29k | MVMStringIndex Haystack_graphs = MVM_string_graphs_nocheck(tc, Haystack); |
1274 | 9.29k | MVMStringIndex needle_graphs = MVM_string_graphs_nocheck(tc, needle); |
1275 | 9.29k | MVMint16 *next = NULL; |
1276 | 9.29k | MVMString *flat_needle = NULL; |
1277 | 9.29k | size_t next_size = (1 + needle_graphs) * sizeof(MVMint16); |
1278 | 9.29k | int next_is_malloced = 0; |
1279 | 9.29k | assert(needle_graphs <= MVM_string_KMP_max_pattern_length); |
1280 | 9.29k | /* Empty string is found at start of string */ |
1281 | 9.29k | if (needle_graphs == 0) |
1282 | 0 | return 0; |
1283 | 9.29k | /* Allocate max 8K onto the stack, otherwise malloc */ |
1284 | 9.29k | if (next_size < 3000) |
1285 | 9.29k | next = alloca(next_size); |
1286 | 0 | else { |
1287 | 0 | next = MVM_malloc(next_size); |
1288 | 0 | next_is_malloced = 1; |
1289 | 0 | } |
1290 | 9.29k | /* If the needle is a strand, flatten it, otherwise use the original string */ |
1291 | 9.29k | flat_needle = needle->body.storage_type == MVM_STRING_STRAND |
1292 | 248 | ? collapse_strands(tc, needle) |
1293 | 9.04k | : needle; |
1294 | 9.29k | /* Process the needle into a jump table put into variable 'next' */ |
1295 | 9.29k | knuth_morris_pratt_process_pattern(tc, flat_needle, next, needle_graphs); |
1296 | 9.29k | /* If the Haystack is a strand, use MVM_string_gi_cached_get_grapheme |
1297 | 9.29k | * since it retains its grapheme iterator over invocations unlike |
1298 | 9.29k | * MVM_string_get_grapheme_at_nocheck and caches the previous grapheme. It |
1299 | 9.29k | * is slower for flat Haystacks though. */ |
1300 | 9.29k | #define MVM_kmp_loop(Haystack_function) {\ |
1301 | 668k | while (text_offset < Haystack_graphs && needle_offset < needle_graphs) {\ |
1302 | 659k | if (needle_offset == -1 || MVM_string_get_grapheme_at_nocheck(tc, flat_needle, needle_offset)\ |
1303 | 581k | == (Haystack_function)) {\ |
1304 | 581k | text_offset++; needle_offset++;\ |
1305 | 581k | if (needle_offset == needle_graphs) {\ |
1306 | 258 | if (next_is_malloced) MVM_free(next);\ |
1307 | 258 | return text_offset - needle_offset;\ |
1308 | 258 | }\ |
1309 | 581k | }\ |
1310 | 77.9k | else needle_offset = next[needle_offset];\ |
1311 | 659k | }\ |
1312 | 9.29k | } |
1313 | 9.29k | if (Haystack->body.storage_type == MVM_STRING_STRAND) { |
1314 | 9.29k | MVMGraphemeIter_cached H_gic; |
1315 | 9.29k | MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset); |
1316 | 9.29k | MVM_kmp_loop(MVM_string_gi_cached_get_grapheme(tc, &H_gic, text_offset)); |
1317 | 9.03k | } |
1318 | 0 | else { |
1319 | 0 | MVM_kmp_loop(MVM_string_get_grapheme_at_nocheck(tc, Haystack, text_offset)); |
1320 | 0 | } |
1321 | 9.03k | if (next_is_malloced) MVM_free(next); |
1322 | 9.03k | return -1; |
1323 | 9.29k | } |
1324 | 270 | static MVMint64 string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start, int ignoremark, int ignorecase) { |
1325 | 270 | /* Foldcase version of needle */ |
1326 | 270 | MVMString *needle_fc = NULL; |
1327 | 270 | MVMStringIndex n_fc_graphs; |
1328 | 270 | |
1329 | 270 | size_t index = (size_t)start; |
1330 | 270 | MVMStringIndex H_graphs, n_graphs; |
1331 | 270 | /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */ |
1332 | 270 | MVMint64 H_expansion; |
1333 | 270 | MVMint64 return_val = -1; |
1334 | 270 | int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0; |
1335 | 270 | void *Hs_or_gic = Haystack; |
1336 | 142 | MVM_string_check_arg(tc, Haystack, ignoremark ? "index ignore case ignore mark search target" : "index ignore case search target"); |
1337 | 142 | MVM_string_check_arg(tc, needle, ignoremark ? "index ignore case ignore mark search term" : "index ignore case search term"); |
1338 | 270 | H_graphs = MVM_string_graphs_nocheck(tc, Haystack); |
1339 | 270 | n_graphs = MVM_string_graphs_nocheck(tc, needle); |
1340 | 270 | if (!n_graphs) |
1341 | 12 | return start <= H_graphs ? start : -1; /* Empty string is in any other string */ |
1342 | 258 | if (!H_graphs) |
1343 | 0 | return -1; |
1344 | 258 | if (start < 0 || H_graphs <= start) |
1345 | 0 | return -1; |
1346 | 258 | /* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check |
1347 | 258 | * checks if it is at all possible for the needle grapheme number to be higher |
1348 | 258 | * than the Haystack */ |
1349 | 258 | if (H_graphs * 3 < n_graphs) |
1350 | 11 | return -1; |
1351 | 258 | |
1352 | 247 | if (n_graphs < 1) |
1353 | 0 | return -1; |
1354 | 247 | |
1355 | 247 | MVMROOT(tc, Haystack, { |
1356 | 247 | needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle; |
1357 | 247 | }); |
1358 | 247 | n_fc_graphs = MVM_string_graphs(tc, needle_fc); |
1359 | 247 | /* brute force for now. horrible, yes. halp. */ |
1360 | 247 | if (is_gic) { |
1361 | 180 | Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached)); |
1362 | 180 | MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start); |
1363 | 180 | } |
1364 | 2.30k | while (index <= H_graphs) { |
1365 | 2.30k | H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle_fc, index, H_graphs, n_fc_graphs, ignoremark, ignorecase, is_gic); |
1366 | 2.30k | if (0 <= H_expansion) |
1367 | 247 | return n_fc_graphs <= H_graphs + H_expansion - index ? (MVMint64)index : -1; |
1368 | 2.05k | index++; |
1369 | 2.05k | } |
1370 | 0 | return -1; |
1371 | 247 | } |
1372 | | |
1373 | 221 | MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) { |
1374 | 221 | return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0, 1); |
1375 | 221 | } |
1376 | 142 | MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) { |
1377 | 142 | return string_index_ignore_case(tc, Haystack, needle, start, 0, 1); |
1378 | 142 | } |
1379 | 110 | MVMint64 MVM_string_equal_at_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) { |
1380 | 110 | return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 0); |
1381 | 110 | } |
1382 | 4 | MVMint64 MVM_string_index_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) { |
1383 | 4 | return string_index_ignore_case(tc, Haystack, needle, start, 1, 0); |
1384 | 4 | } |
1385 | 2 | MVMint64 MVM_string_equal_at_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) { |
1386 | 2 | return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 1); |
1387 | 2 | } |
1388 | 124 | MVMint64 MVM_string_index_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) { |
1389 | 124 | return string_index_ignore_case(tc, Haystack, needle, start, 1, 1); |
1390 | 124 | } |
1391 | | |
1392 | 932k | MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) { |
1393 | 932k | MVMStringIndex agraphs; |
1394 | 932k | MVMGrapheme32 g; |
1395 | 932k | |
1396 | 932k | MVM_string_check_arg(tc, s, "grapheme_at"); |
1397 | 932k | |
1398 | 932k | agraphs = MVM_string_graphs(tc, s); |
1399 | 932k | if (offset < 0 || agraphs <= offset) |
1400 | 0 | return -1; |
1401 | 932k | |
1402 | 932k | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
1403 | 932k | |
1404 | 932k | return 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
1405 | 932k | } |
1406 | | |
1407 | | /* Gets the base character at a grapheme position, ignoring things like diacritics */ |
1408 | 54 | MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) { |
1409 | 54 | MVMStringIndex agraphs; |
1410 | 54 | MVMint32 ready; |
1411 | 54 | |
1412 | 54 | MVM_string_check_arg(tc, s, "ord_basechar_at"); |
1413 | 54 | |
1414 | 54 | agraphs = MVM_string_graphs_nocheck(tc, s); |
1415 | 54 | if (offset < 0 || agraphs <= offset) |
1416 | 0 | return -1; /* fixes RT #126771 */ |
1417 | 54 | |
1418 | 54 | return ord_getbasechar(tc, MVM_string_get_grapheme_at_nocheck(tc, s, offset)); |
1419 | 54 | } |
1420 | | |
1421 | | |
1422 | | /* Compares two strings for equality. */ |
1423 | 55.3M | MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
1424 | 55.3M | MVMStringIndex agraphs, bgraphs; |
1425 | 55.3M | |
1426 | 55.3M | MVM_string_check_arg(tc, a, "equal"); |
1427 | 55.3M | MVM_string_check_arg(tc, b, "equal"); |
1428 | 55.3M | |
1429 | 55.3M | if (a == b) |
1430 | 16.2M | return 1; |
1431 | 55.3M | |
1432 | 39.0M | agraphs = MVM_string_graphs_nocheck(tc, a); |
1433 | 39.0M | bgraphs = MVM_string_graphs_nocheck(tc, b); |
1434 | 39.0M | |
1435 | 39.0M | if (agraphs != bgraphs) |
1436 | 23.8M | return 0; |
1437 | 39.0M | /* If we have cached hash codes from both a and b we can compare if they are identical. |
1438 | 39.0M | * If they don't match then we already know the two strings are not equal. */ |
1439 | 15.1M | if (a->body.cached_hash_code && b->body.cached_hash_code && a->body.cached_hash_code != b->body.cached_hash_code) |
1440 | 115k | return 0; |
1441 | 15.1M | |
1442 | 15.0M | return MVM_string_substrings_equal_nocheck(tc, a, 0, bgraphs, b, 0); |
1443 | 15.1M | } |
1444 | | |
1445 | | /* more general form of has_at; compares two substrings for equality */ |
1446 | | MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a, |
1447 | 0 | MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) { |
1448 | 0 |
|
1449 | 0 | MVM_string_check_arg(tc, a, "have_at"); |
1450 | 0 | MVM_string_check_arg(tc, b, "have_at"); |
1451 | 0 |
|
1452 | 0 | if (starta < 0 || startb < 0) |
1453 | 0 | return 0; |
1454 | 0 | if (length == 0) |
1455 | 0 | return 1; |
1456 | 0 | if (MVM_string_graphs_nocheck(tc, a) < starta + length || MVM_string_graphs_nocheck(tc, b) < startb + length) |
1457 | 0 | return 0; |
1458 | 0 |
|
1459 | 0 | return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb); |
1460 | 0 | } |
1461 | | |
1462 | | /* Returns the grapheme at a given index of the string */ |
1463 | 625 | MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) { |
1464 | 625 | MVMStringIndex agraphs; |
1465 | 625 | |
1466 | 625 | MVM_string_check_arg(tc, a, "grapheme_at"); |
1467 | 625 | |
1468 | 625 | agraphs = MVM_string_graphs_nocheck(tc, a); |
1469 | 625 | |
1470 | 625 | if (index < 0 || agraphs <= index) |
1471 | 0 | MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64, |
1472 | 0 | agraphs - 1, index); |
1473 | 625 | |
1474 | 625 | return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index); |
1475 | 625 | } |
1476 | | |
1477 | | /* Finds the location of a grapheme in a string. Useful for small character class lookup */ |
1478 | 2.15M | MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) { |
1479 | 2.15M | size_t index = -1; |
1480 | 2.15M | MVMGraphemeIter gi; |
1481 | 2.15M | |
1482 | 2.15M | MVM_string_check_arg(tc, a, "string_index_of_grapheme"); |
1483 | 2.15M | |
1484 | 2.15M | MVM_string_gi_init(tc, &gi, a); |
1485 | 12.1M | while (MVM_string_gi_has_more(tc, &gi)) { |
1486 | 10.3M | index++; |
1487 | 10.3M | if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme) |
1488 | 296k | return index; |
1489 | 10.3M | } |
1490 | 1.86M | return -1; |
1491 | 2.15M | } |
1492 | | |
1493 | | /* Case change functions. */ |
1494 | | MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g); |
1495 | 13.2k | static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) { |
1496 | 13.2k | MVMint64 sgraphs; |
1497 | 13.2k | MVM_string_check_arg(tc, s, error); |
1498 | 13.2k | sgraphs = MVM_string_graphs_nocheck(tc, s); |
1499 | 13.2k | if (sgraphs) { |
1500 | 11.7k | MVMString *result; |
1501 | 11.7k | MVMGraphemeIter gi; |
1502 | 11.7k | MVMint64 result_graphs = sgraphs; |
1503 | 11.7k | MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32)); |
1504 | 11.7k | MVMint32 changed = 0; |
1505 | 11.7k | MVMint64 i = 0; |
1506 | 11.7k | MVM_string_gi_init(tc, &gi, s); |
1507 | 69.9k | while (MVM_string_gi_has_more(tc, &gi)) { |
1508 | 58.1k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
1509 | 58.1k | peeked: |
1510 | 58.1k | if (g == 0x03A3) { |
1511 | 0 | /* Greek sigma needs special handling when lowercased. */ |
1512 | 0 | switch (type) { |
1513 | 0 | case MVM_unicode_case_change_type_upper: |
1514 | 0 | case MVM_unicode_case_change_type_title: |
1515 | 0 | result_buf[i++] = g; |
1516 | 0 | break; |
1517 | 0 | case MVM_unicode_case_change_type_lower: |
1518 | 0 | changed = 1; |
1519 | 0 | if (i == 0) { |
1520 | 0 | /* Start of string, so not final. */ |
1521 | 0 | result_buf[i++] = 0x03C3; |
1522 | 0 | } |
1523 | 0 | else if (!MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) { |
1524 | 0 | /* Previous char is not a letter; not final (as has |
1525 | 0 | * to be at end of a word and not only thing in a |
1526 | 0 | * word). */ |
1527 | 0 | result_buf[i++] = 0x03C3; |
1528 | 0 | } |
1529 | 0 | else if (!MVM_string_gi_has_more(tc, &gi)) { |
1530 | 0 | /* End of string. We only reach here if we have a |
1531 | 0 | * letter before us, so it must be final. */ |
1532 | 0 | result_buf[i++] = 0x03C2; |
1533 | 0 | } |
1534 | 0 | else { |
1535 | 0 | /* Letter before us, something ahead of us. Need to |
1536 | 0 | * peek ahead to see if it's a letter, to decide if |
1537 | 0 | * we have final sigma or not. */ |
1538 | 0 | g = MVM_string_gi_get_grapheme(tc, &gi); |
1539 | 0 | if (MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g)) |
1540 | 0 | result_buf[i++] = 0x03C3; |
1541 | 0 | else |
1542 | 0 | result_buf[i++] = 0x03C2; |
1543 | 0 | goto peeked; |
1544 | 0 | } |
1545 | 0 | break; |
1546 | 0 | case MVM_unicode_case_change_type_fold: |
1547 | 0 | result_buf[i++] = 0x03C3; |
1548 | 0 | changed = 1; |
1549 | 0 | break; |
1550 | 0 | } |
1551 | 0 | } |
1552 | 58.1k | else if (0 <= g) { |
1553 | 58.1k | const MVMCodepoint *result_cps; |
1554 | 58.1k | MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc, |
1555 | 58.1k | g, type, &result_cps); |
1556 | 58.1k | if (num_result_cps == 0) { |
1557 | 40.7k | result_buf[i++] = g; |
1558 | 40.7k | } |
1559 | 17.4k | else if (num_result_cps == 1) { |
1560 | 17.3k | result_buf[i++] = *result_cps; |
1561 | 17.3k | changed = 1; |
1562 | 17.3k | } |
1563 | 148 | else { |
1564 | 148 | /* To maintain NFG, we need to re-normalize when we get an |
1565 | 148 | * expansion. */ |
1566 | 148 | MVMNormalizer norm; |
1567 | 148 | MVMint32 num_result_graphs; |
1568 | 148 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
1569 | 148 | MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps); |
1570 | 148 | MVM_unicode_normalizer_eof(tc, &norm); |
1571 | 148 | num_result_graphs = MVM_unicode_normalizer_available(tc, &norm); |
1572 | 148 | |
1573 | 148 | /* Make space for any extra graphemes. */ |
1574 | 148 | if (1 < num_result_graphs) { |
1575 | 148 | result_graphs += num_result_graphs - 1; |
1576 | 148 | result_buf = MVM_realloc(result_buf, |
1577 | 148 | result_graphs * sizeof(MVMGrapheme32)); |
1578 | 148 | } |
1579 | 148 | |
1580 | 148 | /* Copy resulting graphemes. */ |
1581 | 444 | while (0 < num_result_graphs) { |
1582 | 296 | result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
1583 | 296 | num_result_graphs--; |
1584 | 296 | } |
1585 | 148 | changed = 1; |
1586 | 148 | |
1587 | 148 | /* Clean up normalizer (we could init one per transform |
1588 | 148 | * and keep it around in the future, if we find it's a |
1589 | 148 | * worthwhile gain). */ |
1590 | 148 | MVM_unicode_normalizer_cleanup(tc, &norm); |
1591 | 148 | } |
1592 | 58.1k | } |
1593 | 0 | else { |
1594 | 0 | MVMGrapheme32 *transformed; |
1595 | 0 | MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed); |
1596 | 0 | if (num_transformed == 0) { |
1597 | 0 | result_buf[i++] = g; |
1598 | 0 | } |
1599 | 0 | else if (num_transformed == 1) { |
1600 | 0 | result_buf[i++] = *transformed; |
1601 | 0 | changed = 1; |
1602 | 0 | } |
1603 | 0 | else { |
1604 | 0 | MVMuint32 j; |
1605 | 0 | result_graphs += num_transformed - 1; |
1606 | 0 | result_buf = MVM_realloc(result_buf, |
1607 | 0 | result_graphs * sizeof(MVMGrapheme32)); |
1608 | 0 | MVM_VECTORIZE_LOOP |
1609 | 0 | for (j = 0; j < num_transformed; j++) |
1610 | 0 | result_buf[i++] = transformed[j]; |
1611 | 0 | changed = 1; |
1612 | 0 | } |
1613 | 0 | } |
1614 | 58.1k | } |
1615 | 11.7k | if (changed) { |
1616 | 3.62k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1617 | 3.62k | result->body.num_graphs = result_graphs; |
1618 | 3.62k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
1619 | 3.62k | result->body.storage.blob_32 = result_buf; |
1620 | 3.62k | return result; |
1621 | 3.62k | } |
1622 | 8.12k | else { |
1623 | 8.12k | MVM_free(result_buf); |
1624 | 8.12k | } |
1625 | 11.7k | } |
1626 | 9.67k | STRAND_CHECK(tc, s); |
1627 | 9.67k | return s; |
1628 | 13.2k | } |
1629 | 128 | MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) { |
1630 | 128 | return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc"); |
1631 | 128 | } |
1632 | 12.6k | MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) { |
1633 | 12.6k | return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc"); |
1634 | 12.6k | } |
1635 | 12 | MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) { |
1636 | 12 | return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc"); |
1637 | 12 | } |
1638 | 469 | MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) { |
1639 | 469 | return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc"); |
1640 | 469 | } |
1641 | | |
1642 | | /* "Strict"ly (if possible) decodes a C buffer to an MVMString, dependent on the |
1643 | | * encoding type flag. Unlike MVM_string_decode, it will not pass through |
1644 | | * codepoints which have no official mapping. `config` can be set to 1 to indicate |
1645 | | * that you want to decode non-strict ("permissive"), which will try and decode |
1646 | | * as long as it's possible (For example codepoint 129 in windows-1252 is invalid, |
1647 | | * but is technically possible to use Unicode codepoint 129 instead (though it's |
1648 | | * most likely this means the input is actually *not* windows-1252). |
1649 | | * For now windows-1252 and windows-1251 are the only ones this makes a difference |
1650 | | * on. And it is mostly irrelevant for utf8/utf8-c8 encodings since they can |
1651 | | * already represent all codepoints below 0x10FFFF */ |
1652 | | MVMString * MVM_string_decode_config(MVMThreadContext *tc, |
1653 | | const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, |
1654 | 29 | MVMint64 encoding_flag, MVMString *replacement, MVMint64 config) { |
1655 | 29 | switch(encoding_flag) { |
1656 | 6 | case MVM_encoding_type_utf8: |
1657 | 6 | return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length); |
1658 | 0 | case MVM_encoding_type_ascii: |
1659 | 0 | return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length); |
1660 | 2 | case MVM_encoding_type_latin1: |
1661 | 2 | return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length); |
1662 | 1 | case MVM_encoding_type_utf16: |
1663 | 1 | return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length); |
1664 | 7 | case MVM_encoding_type_windows1252: |
1665 | 7 | return MVM_string_windows1252_decode_config(tc, type_object, Cbuf, byte_length, replacement, config); |
1666 | 0 | case MVM_encoding_type_windows1251: |
1667 | 0 | return MVM_string_windows1251_decode_config(tc, type_object, Cbuf, byte_length, replacement, config); |
1668 | 0 | case MVM_encoding_type_shiftjis: |
1669 | 0 | return MVM_string_shiftjis_decode(tc, type_object, Cbuf, byte_length, replacement, config); |
1670 | 13 | case MVM_encoding_type_utf8_c8: |
1671 | 13 | return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length); |
1672 | 0 | default: |
1673 | 0 | MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag); |
1674 | 29 | } |
1675 | 29 | } |
1676 | | /* Strictly decodes a C buffer to an MVMString, dependent on the encoding type flag. |
1677 | | * See the comments above MVM_string_decode_config() above for more details. */ |
1678 | | MVMString * MVM_string_decode(MVMThreadContext *tc, |
1679 | 4 | const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) { |
1680 | 4 | return MVM_string_decode_config(tc, type_object, Cbuf, byte_length, encoding_flag, NULL, MVM_ENCODING_PERMISSIVE); |
1681 | 4 | } |
1682 | | |
1683 | | /* Strictly encodes an MVMString to a C buffer, dependent on the encoding type flag. |
1684 | | * See comments for MVM_string_decode_config() above for more details. */ |
1685 | | char * MVM_string_encode_config(MVMThreadContext *tc, MVMString *s, MVMint64 start, |
1686 | | MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag, |
1687 | 13.0k | MVMString *replacement, MVMint32 translate_newlines, MVMuint8 config) { |
1688 | 13.0k | switch(encoding_flag) { |
1689 | 13.0k | case MVM_encoding_type_utf8: |
1690 | 13.0k | return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1691 | 6 | case MVM_encoding_type_ascii: |
1692 | 6 | return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1693 | 2 | case MVM_encoding_type_latin1: |
1694 | 2 | return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1695 | 1 | case MVM_encoding_type_utf16: |
1696 | 1 | return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1697 | 9 | case MVM_encoding_type_windows1252: |
1698 | 9 | return MVM_string_windows1252_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config); |
1699 | 0 | case MVM_encoding_type_windows1251: |
1700 | 0 | return MVM_string_windows1251_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config); |
1701 | 0 | case MVM_encoding_type_shiftjis: |
1702 | 0 | return MVM_string_shiftjis_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines, config); |
1703 | 8 | case MVM_encoding_type_utf8_c8: |
1704 | 8 | return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement); |
1705 | 0 | default: |
1706 | 0 | MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag); |
1707 | 13.0k | } |
1708 | 13.0k | } |
1709 | | char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start, |
1710 | | MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag, |
1711 | 0 | MVMString *replacement, MVMint32 translate_newlines) { |
1712 | 0 | return MVM_string_encode_config(tc, s, start, length, output_size, encoding_flag, replacement, translate_newlines, MVM_ENCODING_PERMISSIVE); |
1713 | 0 | } |
1714 | | |
1715 | | /* Strictly encodes a string, and writes the encoding string into the supplied Buf |
1716 | | * instance, which should be an integer array with MVMArray REPR. */ |
1717 | | MVMObject * MVM_string_encode_to_buf_config(MVMThreadContext *tc, MVMString *s, MVMString *enc_name, |
1718 | 13.0k | MVMObject *buf, MVMString *replacement, MVMint64 config) { |
1719 | 13.0k | MVMuint64 output_size; |
1720 | 13.0k | MVMuint8 *encoded; |
1721 | 13.0k | MVMArrayREPRData *buf_rd; |
1722 | 13.0k | MVMuint8 elem_size = 0; |
1723 | 13.0k | |
1724 | 13.0k | /* Ensure the target is in the correct form. */ |
1725 | 13.0k | MVM_string_check_arg(tc, s, "encode"); |
1726 | 13.0k | if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray) |
1727 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native array to write into"); |
1728 | 13.0k | buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data; |
1729 | 13.0k | if (buf_rd) { |
1730 | 13.0k | switch (buf_rd->slot_type) { |
1731 | 0 | case MVM_ARRAY_I64: elem_size = 8; break; |
1732 | 0 | case MVM_ARRAY_I32: elem_size = 4; break; |
1733 | 0 | case MVM_ARRAY_I16: elem_size = 2; break; |
1734 | 0 | case MVM_ARRAY_I8: elem_size = 1; break; |
1735 | 0 | case MVM_ARRAY_U64: elem_size = 8; break; |
1736 | 0 | case MVM_ARRAY_U32: elem_size = 4; break; |
1737 | 1 | case MVM_ARRAY_U16: elem_size = 2; break; |
1738 | 13.0k | case MVM_ARRAY_U8: elem_size = 1; break; |
1739 | 13.0k | } |
1740 | 13.0k | } |
1741 | 13.0k | if (!elem_size) |
1742 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native int array"); |
1743 | 13.0k | if (((MVMArray *)buf)->body.slots.any) |
1744 | 0 | MVM_exception_throw_adhoc(tc, "encode requires an empty array"); |
1745 | 13.0k | |
1746 | 13.0k | /* At least find_encoding may allocate on first call, so root just |
1747 | 13.0k | * in case. */ |
1748 | 13.0k | MVMROOT2(tc, buf, s, { |
1749 | 13.0k | const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name); |
1750 | 13.0k | encoded = (MVMuint8 *)MVM_string_encode_config(tc, s, 0, MVM_string_graphs_nocheck(tc, s), &output_size, |
1751 | 13.0k | encoding_flag, replacement, 0, config); |
1752 | 13.0k | }); |
1753 | 13.0k | |
1754 | 13.0k | /* Stash the encoded data in the VMArray. */ |
1755 | 13.0k | ((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded; |
1756 | 13.0k | ((MVMArray *)buf)->body.start = 0; |
1757 | 13.0k | ((MVMArray *)buf)->body.ssize = output_size / elem_size; |
1758 | 13.0k | ((MVMArray *)buf)->body.elems = output_size / elem_size; |
1759 | 13.0k | return buf; |
1760 | 13.0k | } |
1761 | | MVMObject * MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name, |
1762 | 13.0k | MVMObject *buf, MVMString *replacement) { |
1763 | 13.0k | return MVM_string_encode_to_buf_config(tc, s, enc_name, buf, replacement, MVM_ENCODING_PERMISSIVE); |
1764 | 13.0k | } |
1765 | | /* Decodes a string using the data from the specified Buf. Decodes "strict" by |
1766 | | * default, but optionally can be "permissive". */ |
1767 | | MVMString * MVM_string_decode_from_buf_config(MVMThreadContext *tc, MVMObject *buf, |
1768 | 25 | MVMString *enc_name, MVMString *replacement, MVMint64 config) { |
1769 | 25 | MVMArrayREPRData *buf_rd; |
1770 | 25 | MVMuint8 encoding_flag; |
1771 | 25 | MVMuint8 elem_size = 0; |
1772 | 25 | |
1773 | 25 | /* Ensure the source is in the correct form. */ |
1774 | 25 | if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray) |
1775 | 0 | MVM_exception_throw_adhoc(tc, "decode requires a native array to read from"); |
1776 | 25 | buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data; |
1777 | 25 | if (buf_rd) { |
1778 | 25 | switch (buf_rd->slot_type) { |
1779 | 0 | case MVM_ARRAY_I64: elem_size = 8; break; |
1780 | 0 | case MVM_ARRAY_I32: elem_size = 4; break; |
1781 | 0 | case MVM_ARRAY_I16: elem_size = 2; break; |
1782 | 1 | case MVM_ARRAY_I8: elem_size = 1; break; |
1783 | 0 | case MVM_ARRAY_U64: elem_size = 8; break; |
1784 | 0 | case MVM_ARRAY_U32: elem_size = 4; break; |
1785 | 1 | case MVM_ARRAY_U16: elem_size = 2; break; |
1786 | 23 | case MVM_ARRAY_U8: elem_size = 1; break; |
1787 | 25 | } |
1788 | 25 | } |
1789 | 25 | if (!elem_size) |
1790 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native int array"); |
1791 | 25 | |
1792 | 25 | /* Decode. */ |
1793 | 25 | MVMROOT(tc, buf, { |
1794 | 25 | encoding_flag = MVM_string_find_encoding(tc, enc_name); |
1795 | 25 | }); |
1796 | 25 | return MVM_string_decode_config(tc, tc->instance->VMString, |
1797 | 25 | (char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start), |
1798 | 25 | ((MVMArray *)buf)->body.elems * elem_size, |
1799 | 25 | encoding_flag, replacement, config); |
1800 | 25 | } |
1801 | 21 | MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) { |
1802 | 21 | return MVM_string_decode_from_buf_config(tc, buf, enc_name, NULL, MVM_ENCODING_PERMISSIVE); |
1803 | 21 | } |
1804 | | |
1805 | 43.3k | MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) { |
1806 | 43.3k | MVMObject *result = NULL; |
1807 | 43.3k | MVMStringIndex start, end, sep_length; |
1808 | 43.3k | MVMHLLConfig *hll = MVM_hll_current(tc); |
1809 | 43.3k | |
1810 | 43.3k | MVM_string_check_arg(tc, separator, "split separator"); |
1811 | 43.3k | MVM_string_check_arg(tc, input, "split input"); |
1812 | 43.3k | |
1813 | 43.3k | MVMROOT3(tc, input, separator, result, { |
1814 | 43.3k | result = MVM_repr_alloc_init(tc, hll->slurpy_array_type); |
1815 | 43.3k | start = 0; |
1816 | 43.3k | end = MVM_string_graphs_nocheck(tc, input); |
1817 | 43.3k | sep_length = MVM_string_graphs_nocheck(tc, separator); |
1818 | 43.3k | |
1819 | 43.3k | while (start < end) { |
1820 | 43.3k | MVMString *portion; |
1821 | 43.3k | MVMStringIndex index; |
1822 | 43.3k | MVMStringIndex length; |
1823 | 43.3k | |
1824 | 43.3k | /* XXX make this use the dual-traverse iterator, but such that it |
1825 | 43.3k | can reset the index of what it's comparing... <!> */ |
1826 | 43.3k | index = MVM_string_index(tc, input, separator, start); |
1827 | 43.3k | length = sep_length ? (index == -1 ? end : index) - start : 1; |
1828 | 43.3k | if (0 < length || (sep_length && length == 0)) { |
1829 | 43.3k | portion = MVM_string_substring(tc, input, start, length); |
1830 | 43.3k | MVMROOT(tc, portion, { |
1831 | 43.3k | MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type); |
1832 | 43.3k | MVM_repr_set_str(tc, pobj, portion); |
1833 | 43.3k | MVM_repr_push_o(tc, result, pobj); |
1834 | 43.3k | }); |
1835 | 43.3k | } |
1836 | 43.3k | start += length + sep_length; |
1837 | 43.3k | /* Gather an empty string if the delimiter is found at the end. */ |
1838 | 43.3k | if (sep_length && start == end) { |
1839 | 43.3k | MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type); |
1840 | 43.3k | MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty); |
1841 | 43.3k | MVM_repr_push_o(tc, result, pobj); |
1842 | 43.3k | } |
1843 | 43.3k | } |
1844 | 43.3k | }); |
1845 | 43.3k | |
1846 | 43.3k | return result; |
1847 | 43.3k | } |
1848 | | /* Used in the MVM_string_join function. Moved here to simplify the code */ |
1849 | | void copy_to_32bit (MVMThreadContext *tc, MVMString *source, |
1850 | 28.9k | MVMString *dest, MVMint64 *position, MVMGraphemeIter *gi) { |
1851 | 28.9k | /* Add source. */ |
1852 | 28.9k | switch (source->body.storage_type) { |
1853 | 636 | case MVM_STRING_GRAPHEME_32: { |
1854 | 636 | memcpy( |
1855 | 636 | dest->body.storage.blob_32 + *position, |
1856 | 636 | source->body.storage.blob_32, |
1857 | 636 | source->body.num_graphs * sizeof(MVMGrapheme32)); |
1858 | 636 | *position += source->body.num_graphs; |
1859 | 636 | break; |
1860 | 636 | } |
1861 | 2.72k | case MVM_STRING_GRAPHEME_ASCII: |
1862 | 2.72k | case MVM_STRING_GRAPHEME_8: { |
1863 | 2.72k | MVMStringIndex sindex = 0; |
1864 | 8.37k | while (sindex < source->body.num_graphs) |
1865 | 5.65k | dest->body.storage.blob_32[(*position)++] = |
1866 | 5.65k | source->body.storage.blob_8[sindex++]; |
1867 | 2.72k | break; |
1868 | 2.72k | } |
1869 | 25.5k | default: |
1870 | 25.5k | MVM_string_gi_init(tc, gi, source); |
1871 | 787k | while (MVM_string_gi_has_more(tc, gi)) |
1872 | 762k | dest->body.storage.blob_32[(*position)++] = |
1873 | 762k | MVM_string_gi_get_grapheme(tc, gi); |
1874 | 25.5k | break; |
1875 | 28.9k | } |
1876 | 28.9k | } |
1877 | | /* Used in MVM_string_join to check stability of adding the next piece */ |
1878 | | MVM_STATIC_INLINE void join_check_stability(MVMThreadContext *tc, MVMString *piece, |
1879 | 15.3k | MVMString *separator, MVMString **pieces, MVMint32 *concats_stable, MVMint64 num_pieces, MVMint64 sgraphs, MVMint64 piece_index) { |
1880 | 15.3k | if (!sgraphs) { |
1881 | 15.0k | /* If there's no separator and one piece is The Empty String we |
1882 | 15.0k | * have to be extra careful about concat stability */ |
1883 | 15.0k | if (!MVM_string_graphs_nocheck(tc, piece) |
1884 | 18 | && piece_index + 1 < num_pieces |
1885 | 15 | && !MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], pieces[piece_index + 1])) { |
1886 | 0 | *concats_stable = 0; |
1887 | 0 | } |
1888 | 15.0k | /* Separator has no graphemes, so NFG stability check |
1889 | 15.0k | * should consider pieces. */ |
1890 | 15.0k | else if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], piece)) |
1891 | 4 | *concats_stable = 0; |
1892 | 15.0k | } |
1893 | 15.3k | /* If we have a separator, check concat stability */ |
1894 | 306 | else { |
1895 | 306 | if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], separator) /* Before */ |
1896 | 306 | || !MVM_nfg_is_concat_stable(tc, separator, piece)) { /* And after separator */ |
1897 | 0 | *concats_stable = 0; |
1898 | 0 | } |
1899 | 306 | } |
1900 | 15.3k | } |
1901 | 34.4k | MVM_STATIC_INLINE MVMString * join_get_str_from_pos(MVMThreadContext *tc, MVMObject *array, MVMint64 index, MVMint64 is_str_array) { |
1902 | 34.4k | if (is_str_array) { |
1903 | 27.0k | MVMString *piece = MVM_repr_at_pos_s(tc, array, index); |
1904 | 27.0k | if (piece) |
1905 | 27.0k | return piece; |
1906 | 27.0k | } |
1907 | 7.44k | else { |
1908 | 7.44k | MVMObject *item = MVM_repr_at_pos_o(tc, array, index); |
1909 | 7.44k | if (item && IS_CONCRETE(item)) |
1910 | 7.44k | return MVM_repr_get_str(tc, item); |
1911 | 7.44k | } |
1912 | 0 | return (MVMString*)NULL; |
1913 | 34.4k | } |
1914 | 19.2k | MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) { |
1915 | 19.2k | MVMString *result = NULL; |
1916 | 19.2k | MVMString **pieces = NULL; |
1917 | 19.2k | MVMint64 elems, num_pieces, sgraphs, i, is_str_array, total_graphs; |
1918 | 19.2k | MVMuint16 sstrands, total_strands; |
1919 | 19.2k | MVMint32 concats_stable = 1, all_strands; |
1920 | 19.2k | size_t bytes; |
1921 | 19.2k | |
1922 | 19.2k | MVM_string_check_arg(tc, separator, "join separator"); |
1923 | 19.2k | if (!IS_CONCRETE(input)) |
1924 | 0 | MVM_exception_throw_adhoc(tc, "join needs a concrete array to join"); |
1925 | 19.2k | |
1926 | 19.2k | /* See how many things we have to join; if the answer is "none" then we |
1927 | 19.2k | * can make a hasty escape. */ |
1928 | 19.2k | elems = MVM_repr_elems(tc, input); |
1929 | 19.2k | if (elems == 0) |
1930 | 171 | return tc->instance->str_consts.empty; |
1931 | 19.0k | bytes = elems * sizeof(MVMString *); |
1932 | 19.0k | is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc, |
1933 | 19.0k | STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR; |
1934 | 19.0k | |
1935 | 19.0k | /* If there's only one element to join, just return it. */ |
1936 | 19.0k | if (elems == 1) { |
1937 | 5.83k | MVMString *piece = join_get_str_from_pos(tc, input, 0, is_str_array); |
1938 | 5.83k | if (piece) |
1939 | 5.83k | return piece; |
1940 | 5.83k | } |
1941 | 19.0k | |
1942 | 19.0k | /* Allocate result. */ |
1943 | 13.2k | MVMROOT2(tc, separator, input, { |
1944 | 13.2k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1945 | 13.2k | }); |
1946 | 13.2k | |
1947 | 13.2k | /* Take a first pass through the string, counting up length and the total |
1948 | 13.2k | * number of strands we encounter as well as building a flat array of the |
1949 | 13.2k | * strings (so we only have to do the indirect calls once). */ |
1950 | 13.2k | sgraphs = MVM_string_graphs_nocheck(tc, separator); |
1951 | 13.2k | if (sgraphs) |
1952 | 64 | sstrands = separator->body.storage_type == MVM_STRING_STRAND |
1953 | 0 | ? separator->body.num_strands |
1954 | 64 | : 1; |
1955 | 13.2k | else |
1956 | 13.1k | sstrands = 1; |
1957 | 13.2k | pieces = MVM_fixed_size_alloc(tc, tc->instance->fsa, bytes); |
1958 | 13.2k | num_pieces = 0; |
1959 | 13.2k | total_graphs = 0; |
1960 | 13.2k | total_strands = 0; |
1961 | 13.2k | /* Is the separator a strand? */ |
1962 | 13.2k | all_strands = separator->body.storage_type == MVM_STRING_STRAND; |
1963 | 41.8k | for (i = 0; i < elems; i++) { |
1964 | 28.6k | /* Get piece of the string. */ |
1965 | 28.6k | MVMString *piece = join_get_str_from_pos(tc, input, i, is_str_array); |
1966 | 28.6k | MVMint64 piece_graphs; |
1967 | 28.6k | if (!piece) |
1968 | 0 | continue; |
1969 | 28.6k | |
1970 | 28.6k | /* Check that all the pieces are strands. */ |
1971 | 28.6k | if (all_strands) |
1972 | 0 | all_strands = piece->body.storage_type == MVM_STRING_STRAND; |
1973 | 28.6k | |
1974 | 28.6k | /* If it wasn't the first piece, add separator here. */ |
1975 | 28.6k | if (num_pieces) { |
1976 | 15.4k | total_strands += sstrands; |
1977 | 15.4k | total_graphs += sgraphs; |
1978 | 15.4k | } |
1979 | 28.6k | |
1980 | 28.6k | /* Add on the piece's strands and graphs. */ |
1981 | 28.6k | piece_graphs = MVM_string_graphs(tc, piece); |
1982 | 28.6k | if (piece_graphs) { |
1983 | 28.4k | total_strands += piece->body.storage_type == MVM_STRING_STRAND |
1984 | 25.5k | ? piece->body.num_strands |
1985 | 2.91k | : 1; |
1986 | 28.4k | total_graphs += piece_graphs; |
1987 | 28.4k | } |
1988 | 28.6k | |
1989 | 28.6k | /* Store piece. */ |
1990 | 28.6k | pieces[num_pieces++] = piece; |
1991 | 28.6k | } |
1992 | 13.2k | /* This guards the joining by method of multiple concats, and will be faster |
1993 | 13.2k | * if we only end up with one piece after going through each element of the array */ |
1994 | 13.2k | if (num_pieces == 1) |
1995 | 0 | return pieces[0]; |
1996 | 13.2k | /* We now know the total eventual number of graphemes. */ |
1997 | 13.2k | if (total_graphs == 0) { |
1998 | 10 | MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces); |
1999 | 10 | return tc->instance->str_consts.empty; |
2000 | 10 | } |
2001 | 13.2k | result->body.num_graphs = total_graphs; |
2002 | 13.2k | |
2003 | 13.2k | MVMROOT(tc, result, { |
2004 | 13.2k | /* If the separator and pieces are all strands, and there are |
2005 | 13.2k | * on average at least 16 graphemes in each of the strands. */ |
2006 | 13.2k | if (all_strands && total_strands < MVM_STRING_MAX_STRANDS |
2007 | 13.2k | && total_strands * 16 <= total_graphs) { |
2008 | 13.2k | MVMuint16 offset = 0; |
2009 | 13.2k | result->body.storage_type = MVM_STRING_STRAND; |
2010 | 13.2k | result->body.storage.strands = allocate_strands(tc, total_strands); |
2011 | 13.2k | result->body.num_strands = total_strands; |
2012 | 13.2k | for (i = 0; i < num_pieces; i++) { |
2013 | 13.2k | MVMString *piece = pieces[i]; |
2014 | 13.2k | if (0 < i) { |
2015 | 13.2k | /* No more checks unless still stable */ |
2016 | 13.2k | if (concats_stable) |
2017 | 13.2k | join_check_stability(tc, piece, separator, pieces, |
2018 | 13.2k | &concats_stable, num_pieces, sgraphs, i); |
2019 | 13.2k | copy_strands(tc, separator, 0, result, offset, separator->body.num_strands); |
2020 | 13.2k | offset += separator->body.num_strands; |
2021 | 13.2k | } |
2022 | 13.2k | copy_strands(tc, piece, 0, result, offset, piece->body.num_strands); |
2023 | 13.2k | offset += piece->body.num_strands; |
2024 | 13.2k | } |
2025 | 13.2k | } |
2026 | 13.2k | /* Doing multiple concats is only faster if we have about 300 graphemes per |
2027 | 13.2k | piece or if we have less than for pieces and more than 150 graphemes per piece */ |
2028 | 13.2k | else if (total_strands < MVM_STRING_MAX_STRANDS && (300 < num_pieces/total_graphs || (num_pieces < 4 && 150 < num_pieces/total_graphs))) { |
2029 | 13.2k | MVMString *result = NULL; |
2030 | 13.2k | MVMROOT(tc, result, { |
2031 | 13.2k | if (sgraphs) { |
2032 | 13.2k | i = 0; |
2033 | 13.2k | result = MVM_string_concatenate(tc, pieces[i++], separator); |
2034 | 13.2k | result = MVM_string_concatenate(tc, result, pieces[i++]); |
2035 | 13.2k | for (; i < num_pieces;) { |
2036 | 13.2k | result = MVM_string_concatenate(tc, result, separator); |
2037 | 13.2k | result = MVM_string_concatenate(tc, result, pieces[i++]); |
2038 | 13.2k | } |
2039 | 13.2k | |
2040 | 13.2k | } |
2041 | 13.2k | else { |
2042 | 13.2k | result = MVM_string_concatenate(tc, pieces[0], pieces[1]); |
2043 | 13.2k | i = 2; |
2044 | 13.2k | for (; i < num_pieces;) { |
2045 | 13.2k | result = MVM_string_concatenate(tc, result, pieces[i++]); |
2046 | 13.2k | } |
2047 | 13.2k | } |
2048 | 13.2k | }); |
2049 | 13.2k | return result; |
2050 | 13.2k | } |
2051 | 13.2k | else { |
2052 | 13.2k | /* We'll produce a single, flat string. */ |
2053 | 13.2k | MVMint64 position = 0; |
2054 | 13.2k | MVMGraphemeIter gi; |
2055 | 13.2k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
2056 | 13.2k | result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32)); |
2057 | 13.2k | for (i = 0; i < num_pieces; i++) { |
2058 | 13.2k | /* Get piece. */ |
2059 | 13.2k | MVMString *piece = pieces[i]; |
2060 | 13.2k | |
2061 | 13.2k | /* Add separator if needed. */ |
2062 | 13.2k | if (0 < i) { |
2063 | 13.2k | /* No more checks unless still stable */ |
2064 | 13.2k | if (concats_stable) |
2065 | 13.2k | join_check_stability(tc, piece, separator, pieces, |
2066 | 13.2k | &concats_stable, num_pieces, sgraphs, i); |
2067 | 13.2k | /* Add separator */ |
2068 | 13.2k | if (sgraphs) |
2069 | 13.2k | copy_to_32bit(tc, separator, result, &position, &gi); |
2070 | 13.2k | } |
2071 | 13.2k | /* Add piece */ |
2072 | 13.2k | copy_to_32bit(tc, piece, result, &position, &gi); |
2073 | 13.2k | } |
2074 | 13.2k | } |
2075 | 13.2k | |
2076 | 13.2k | MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces); |
2077 | 13.2k | STRAND_CHECK(tc, result); |
2078 | 13.2k | /* if concat is stable and NFG_CHECK on, run a NFG_CHECK on it since it |
2079 | 13.2k | * should be properly constructed now */ |
2080 | 13.2k | if (concats_stable) |
2081 | 13.2k | NFG_CHECK(tc, result, "MVM_string_join"); |
2082 | 13.2k | }); |
2083 | 13.2k | return concats_stable ? result : re_nfg(tc, result); |
2084 | 13.2k | } |
2085 | | |
2086 | | /* Returning nonzero means it found the char at the position specified in 'a' in 'Haystack'. |
2087 | | * For character enumerations in regexes. */ |
2088 | 307k | MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *Haystack) { |
2089 | 307k | MVMuint32 H_graphs; |
2090 | 307k | MVMGrapheme32 search; |
2091 | 307k | |
2092 | 307k | MVM_string_check_arg(tc, a, "char_at_in_string"); |
2093 | 307k | MVM_string_check_arg(tc, Haystack, "char_at_in_string"); |
2094 | 307k | |
2095 | 307k | /* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */ |
2096 | 307k | if (offset < 0 || MVM_string_graphs_nocheck(tc, a) <= offset) |
2097 | 12.8k | return -2; |
2098 | 307k | |
2099 | 294k | search = MVM_string_get_grapheme_at_nocheck(tc, a, offset); |
2100 | 294k | H_graphs = MVM_string_graphs_nocheck(tc, Haystack); |
2101 | 294k | switch (Haystack->body.storage_type) { |
2102 | 161k | case MVM_STRING_GRAPHEME_32: |
2103 | 161k | return MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, &search, 0, H_graphs, 1); |
2104 | 161k | |
2105 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
2106 | 0 | if (can_fit_into_ascii(search)) { |
2107 | 0 | MVMStringIndex i; |
2108 | 0 | for (i = 0; i < H_graphs; i++) |
2109 | 0 | if (Haystack->body.storage.blob_ascii[i] == search) |
2110 | 0 | return i; |
2111 | 0 | } |
2112 | 0 | break; |
2113 | 132k | case MVM_STRING_GRAPHEME_8: |
2114 | 132k | if (can_fit_into_8bit(search)) { |
2115 | 132k | MVMStringIndex i; |
2116 | 301k | for (i = 0; i < H_graphs; i++) |
2117 | 206k | if (Haystack->body.storage.blob_8[i] == search) |
2118 | 37.5k | return i; |
2119 | 132k | } |
2120 | 95.3k | break; |
2121 | 0 | case MVM_STRING_STRAND: { |
2122 | 0 | MVMGraphemeIter gi; |
2123 | 0 | MVMStringIndex i; |
2124 | 0 | MVM_string_gi_init(tc, &gi, Haystack); |
2125 | 0 | for (i = 0; i < H_graphs; i++) |
2126 | 0 | if (MVM_string_gi_get_grapheme(tc, &gi) == search) |
2127 | 0 | return i; |
2128 | 0 | } |
2129 | 294k | } |
2130 | 95.3k | return -1; |
2131 | 294k | } |
2132 | | |
2133 | 8 | MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) { |
2134 | 8 | MVMGrapheme32 g; |
2135 | 8 | MVMCodepoint cp; |
2136 | 8 | |
2137 | 8 | MVM_string_check_arg(tc, s, "uniprop"); |
2138 | 8 | |
2139 | 8 | if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s)) |
2140 | 0 | return 0; |
2141 | 8 | |
2142 | 8 | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
2143 | 8 | if (g >= 0) |
2144 | 8 | cp = (MVMCodepoint)g; |
2145 | 8 | else |
2146 | 0 | cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2147 | 8 | return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code); |
2148 | 8 | } |
2149 | | |
2150 | | /* If the string is made up of strands, then produces a flattend string |
2151 | | * representing the exact same graphemes but without strands. Otherwise, |
2152 | | * returns the input string. Intended for strings that will be indexed |
2153 | | * into heavily (when evaluating regexes, for example). */ |
2154 | 22.8k | MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) { |
2155 | 22.8k | MVM_string_check_arg(tc, s, "indexingoptimized"); |
2156 | 22.8k | if (s->body.storage_type == MVM_STRING_STRAND) |
2157 | 21.8k | return collapse_strands(tc, s); |
2158 | 22.8k | else |
2159 | 1.04k | return s; |
2160 | 22.8k | } |
2161 | | |
2162 | | /* Escapes a string, replacing various chars like \n with \\n. Can no doubt be |
2163 | | * further optimized. */ |
2164 | 561 | MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) { |
2165 | 561 | MVMString *res = NULL; |
2166 | 561 | MVMStringIndex spos = 0; |
2167 | 561 | MVMStringIndex bpos = 0; |
2168 | 561 | MVMStringIndex sgraphs, balloc; |
2169 | 561 | MVMGrapheme32 *buffer = NULL; |
2170 | 561 | MVMGrapheme32 crlf; |
2171 | 561 | MVMint8 string_can_fit_into_8bit = 1; |
2172 | 561 | |
2173 | 561 | MVM_string_check_arg(tc, s, "escape"); |
2174 | 561 | |
2175 | 561 | sgraphs = MVM_string_graphs_nocheck(tc, s); |
2176 | 561 | balloc = sgraphs; |
2177 | 561 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * balloc); |
2178 | 561 | |
2179 | 561 | crlf = MVM_nfg_crlf_grapheme(tc); |
2180 | 561 | |
2181 | 6.75k | for (; spos < sgraphs; spos++) { |
2182 | 6.19k | MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos); |
2183 | 6.19k | MVMGrapheme32 esc = 0; |
2184 | 6.19k | switch (graph) { |
2185 | 11 | case '\\': esc = '\\'; break; |
2186 | 0 | case 7: esc = 'a'; break; |
2187 | 1 | case '\b': esc = 'b'; break; |
2188 | 4 | case '\n': esc = 'n'; break; |
2189 | 3 | case '\r': esc = 'r'; break; |
2190 | 13 | case '\t': esc = 't'; break; |
2191 | 1 | case '\f': esc = 'f'; break; |
2192 | 2 | case '"': esc = '"'; break; |
2193 | 1 | case 27: esc = 'e'; break; |
2194 | 6.19k | } |
2195 | 6.19k | if (esc) { |
2196 | 36 | if (bpos + 2 > balloc) { |
2197 | 13 | balloc += 32; |
2198 | 13 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
2199 | 13 | } |
2200 | 36 | buffer[bpos++] = '\\'; |
2201 | 36 | buffer[bpos++] = esc; |
2202 | 36 | } |
2203 | 6.15k | else if (graph == crlf) { |
2204 | 1 | if (bpos + 4 > balloc) { |
2205 | 1 | balloc += 32; |
2206 | 1 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
2207 | 1 | } |
2208 | 1 | buffer[bpos++] = '\\'; |
2209 | 1 | buffer[bpos++] = 'r'; |
2210 | 1 | buffer[bpos++] = '\\'; |
2211 | 1 | buffer[bpos++] = 'n'; |
2212 | 1 | } |
2213 | 6.15k | else { |
2214 | 6.15k | if (bpos + 1 > balloc) { |
2215 | 11 | balloc += 32; |
2216 | 11 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
2217 | 11 | } |
2218 | 6.15k | if (!can_fit_into_8bit(graph)) |
2219 | 0 | string_can_fit_into_8bit = 0; |
2220 | 6.15k | buffer[bpos++] = graph; |
2221 | 6.15k | } |
2222 | 6.19k | } |
2223 | 561 | |
2224 | 561 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
2225 | 561 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
2226 | 561 | res->body.storage.blob_32 = buffer; |
2227 | 561 | res->body.num_graphs = bpos; |
2228 | 561 | |
2229 | 561 | if (string_can_fit_into_8bit) |
2230 | 561 | turn_32bit_into_8bit_unchecked(tc, res); |
2231 | 561 | |
2232 | 561 | STRAND_CHECK(tc, res); |
2233 | 561 | return res; |
2234 | 561 | } |
2235 | | |
2236 | | /* Takes a string and reverses its characters. */ |
2237 | 1 | MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) { |
2238 | 1 | MVMString *res = NULL; |
2239 | 1 | MVMStringIndex spos = 0; |
2240 | 1 | MVMStringIndex sgraphs; |
2241 | 1 | MVMStringIndex rpos; |
2242 | 1 | |
2243 | 1 | MVM_string_check_arg(tc, s, "flip"); |
2244 | 1 | sgraphs = MVM_string_graphs_nocheck(tc, s); |
2245 | 1 | rpos = sgraphs; |
2246 | 1 | |
2247 | 1 | switch (s->body.storage_type) { |
2248 | 1 | case MVM_STRING_GRAPHEME_ASCII: |
2249 | 1 | case MVM_STRING_GRAPHEME_8: { |
2250 | 1 | MVMGrapheme8 *rbuffer; |
2251 | 1 | /* Copy the variables, so we can use them just for the loop. This way |
2252 | 1 | * the loop will vectorize since we won't refer to their values except |
2253 | 1 | * in the loop. Use size_t to coerce vectorization. */ |
2254 | 1 | size_t spos_l = spos, rpos_l = rpos; |
2255 | 1 | rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs); |
2256 | 1 | MVM_VECTORIZE_LOOP |
2257 | 4 | while (spos_l < s->body.num_graphs) |
2258 | 3 | rbuffer[--rpos_l] = s->body.storage.blob_8[spos_l++]; |
2259 | 1 | |
2260 | 1 | spos += sgraphs - spos; |
2261 | 1 | rpos -= sgraphs - spos; |
2262 | 1 | |
2263 | 1 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
2264 | 1 | res->body.storage_type = s->body.storage_type; |
2265 | 1 | res->body.storage.blob_8 = rbuffer; |
2266 | 1 | break; |
2267 | 1 | } |
2268 | 0 | default: { |
2269 | 0 | MVMGrapheme32 *rbuffer; |
2270 | 0 | rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs); |
2271 | 0 |
|
2272 | 0 | if (s->body.storage_type == MVM_STRING_GRAPHEME_32) { |
2273 | 0 | size_t spos_l = spos, rpos_l = rpos; |
2274 | 0 | MVM_VECTORIZE_LOOP |
2275 | 0 | while (spos_l < s->body.num_graphs) |
2276 | 0 | rbuffer[--rpos_l] = s->body.storage.blob_32[spos_l++]; |
2277 | 0 |
|
2278 | 0 | spos += sgraphs - spos; |
2279 | 0 | rpos -= sgraphs - spos; |
2280 | 0 | } |
2281 | 0 | else |
2282 | 0 | for (; spos < sgraphs; spos++) |
2283 | 0 | rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos); |
2284 | 0 |
|
2285 | 0 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
2286 | 0 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
2287 | 0 | res->body.storage.blob_32 = rbuffer; |
2288 | 0 | }} |
2289 | 1 | |
2290 | 1 | res->body.num_graphs = sgraphs; |
2291 | 1 | |
2292 | 1 | STRAND_CHECK(tc, res); |
2293 | 1 | return res; |
2294 | 1 | } |
2295 | | |
2296 | | /* Compares two strings, returning -1, 0 or 1 to indicate less than, |
2297 | | * equal or greater than. */ |
2298 | 46.6k | MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
2299 | 46.6k | MVMStringIndex alen, blen, i = 0, scanlen; |
2300 | 46.6k | MVMGraphemeIter gi_a, gi_b; |
2301 | 46.6k | |
2302 | 46.6k | MVM_string_check_arg(tc, a, "compare"); |
2303 | 46.6k | MVM_string_check_arg(tc, b, "compare"); |
2304 | 46.6k | |
2305 | 46.6k | /* Simple cases when one or both are zero length. */ |
2306 | 46.6k | alen = MVM_string_graphs_nocheck(tc, a); |
2307 | 46.6k | blen = MVM_string_graphs_nocheck(tc, b); |
2308 | 46.6k | if (alen == 0) |
2309 | 356 | return blen == 0 ? 0 : -1; |
2310 | 46.3k | if (blen == 0) |
2311 | 10.8k | return 1; |
2312 | 46.3k | |
2313 | 46.3k | /* Otherwise, need to scan them. */ |
2314 | 35.5k | scanlen = blen < alen ? blen : alen; |
2315 | 35.5k | |
2316 | 35.5k | /* Short circuit a case where the other conditionals won't speed it up */ |
2317 | 35.5k | if (a->body.storage_type == MVM_STRING_STRAND || b->body.storage_type == MVM_STRING_STRAND) { |
2318 | 8.56k | |
2319 | 8.56k | } |
2320 | 26.9k | else if ((a->body.storage_type == MVM_STRING_GRAPHEME_8 || a->body.storage_type == MVM_STRING_GRAPHEME_ASCII) |
2321 | 22.4k | && (b->body.storage_type == MVM_STRING_GRAPHEME_8 || b->body.storage_type == MVM_STRING_GRAPHEME_ASCII)) { |
2322 | 22.3k | MVMGrapheme8 *a_blob8 = a->body.storage.blob_8; |
2323 | 22.3k | MVMGrapheme8 *b_blob8 = b->body.storage.blob_8; |
2324 | 31.4k | while (i < scanlen && a_blob8[i] == b_blob8[i]) { |
2325 | 9.18k | i++; |
2326 | 9.18k | } |
2327 | 22.3k | } |
2328 | 4.66k | else if (a->body.storage_type == MVM_STRING_GRAPHEME_32 && b->body.storage_type == MVM_STRING_GRAPHEME_32) { |
2329 | 4.43k | MVMGrapheme32 *a_blob32 = a->body.storage.blob_32; |
2330 | 4.43k | MVMGrapheme32 *b_blob32 = b->body.storage.blob_32; |
2331 | 6.28k | while (i < scanlen && a_blob32[i] == b_blob32[i]) { |
2332 | 1.85k | i++; |
2333 | 1.85k | } |
2334 | 4.43k | } |
2335 | 231 | else { |
2336 | 231 | MVMGrapheme32 *blob32 = NULL; |
2337 | 231 | MVMGrapheme8 *blob8 = NULL; |
2338 | 231 | switch (a->body.storage_type) { |
2339 | 162 | case MVM_STRING_GRAPHEME_8: |
2340 | 162 | case MVM_STRING_GRAPHEME_ASCII: |
2341 | 162 | blob8 = a->body.storage.blob_8; |
2342 | 162 | break; |
2343 | 69 | case MVM_STRING_GRAPHEME_32: |
2344 | 69 | blob32 = a->body.storage.blob_32; |
2345 | 69 | break; |
2346 | 0 | default: |
2347 | 0 | MVM_exception_throw_adhoc(tc, |
2348 | 0 | "String corruption in string compare. Unknown string type."); |
2349 | 231 | } |
2350 | 231 | switch (b->body.storage_type) { |
2351 | 69 | case MVM_STRING_GRAPHEME_8: |
2352 | 69 | case MVM_STRING_GRAPHEME_ASCII: |
2353 | 69 | blob8 = b->body.storage.blob_8; |
2354 | 69 | break; |
2355 | 162 | case MVM_STRING_GRAPHEME_32: |
2356 | 162 | blob32 = b->body.storage.blob_32; |
2357 | 162 | break; |
2358 | 0 | default: |
2359 | 0 | MVM_exception_throw_adhoc(tc, |
2360 | 0 | "String corruption in string compare. Unknown string type."); |
2361 | 231 | } |
2362 | 343 | while (i < scanlen && blob32[i] == blob8[i]) { |
2363 | 112 | i++; |
2364 | 112 | } |
2365 | 231 | } |
2366 | 35.5k | /* If one of the strings was a strand or we encountered a differing character |
2367 | 35.5k | * while scanning in the loops above. */ |
2368 | 35.5k | if (i < scanlen) { |
2369 | 31.5k | MVM_string_gi_init(tc, &gi_a, a); |
2370 | 31.5k | MVM_string_gi_init(tc, &gi_b, b); |
2371 | 31.5k | if (i) { |
2372 | 2.15k | MVM_string_gi_move_to(tc, &gi_a, i); |
2373 | 2.15k | MVM_string_gi_move_to(tc, &gi_b, i); |
2374 | 2.15k | } |
2375 | 31.5k | } |
2376 | 44.2k | for (; i < scanlen; i++) { |
2377 | 40.0k | MVMGrapheme32 g_a = MVM_string_gi_get_grapheme(tc, &gi_a); |
2378 | 40.0k | MVMGrapheme32 g_b = MVM_string_gi_get_grapheme(tc, &gi_b); |
2379 | 40.0k | if (g_a != g_b) { |
2380 | 31.3k | MVMint64 rtrn; |
2381 | 31.3k | /* If one of the deciding graphemes is a synthetic then we need to |
2382 | 31.3k | * iterate the codepoints inside it */ |
2383 | 31.3k | if (g_a < 0 || g_b < 0) { |
2384 | 0 | MVMCodepointIter ci_a, ci_b; |
2385 | 0 | MVM_string_grapheme_ci_init(tc, &ci_a, g_a, 0); |
2386 | 0 | MVM_string_grapheme_ci_init(tc, &ci_b, g_b, 0); |
2387 | 0 | while (MVM_string_grapheme_ci_has_more(tc, &ci_a) && MVM_string_grapheme_ci_has_more(tc, &ci_b)) { |
2388 | 0 | g_a = MVM_string_grapheme_ci_get_codepoint(tc, &ci_a); |
2389 | 0 | g_b = MVM_string_grapheme_ci_get_codepoint(tc, &ci_b); |
2390 | 0 | if (g_a != g_b) |
2391 | 0 | break; |
2392 | 0 | } |
2393 | 0 | rtrn = g_a < g_b ? -1 : |
2394 | 0 | g_b < g_a ? 1 : |
2395 | 0 | 0 ; |
2396 | 0 | /* If we get here, all the codepoints in the synthetics have matched |
2397 | 0 | * so go based on which has more codepoints left in that grapheme */ |
2398 | 0 | if (!rtrn) { |
2399 | 0 | MVMint32 a_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_a), |
2400 | 0 | b_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_b); |
2401 | 0 |
|
2402 | 0 | return a_has_more < b_has_more ? -1 : |
2403 | 0 | b_has_more < a_has_more ? 1 : |
2404 | 0 | 0 ; |
2405 | 0 | } |
2406 | 0 | return rtrn; |
2407 | 0 | } |
2408 | 31.3k | return g_a < g_b ? -1 : |
2409 | 24.6k | g_b < g_a ? 1 : |
2410 | 0 | 0 ; |
2411 | 31.3k | } |
2412 | 40.0k | } |
2413 | 35.5k | |
2414 | 35.5k | /* All shared chars equal, so go on length. */ |
2415 | 4.16k | return alen < blen ? -1 : |
2416 | 4.02k | blen < alen ? 1 : |
2417 | 3.88k | 0 ; |
2418 | 35.5k | } |
2419 | 42 | #define nfg_ok(cp) ((cp) < MVM_NORMALIZE_FIRST_SIG_NFC) |
2420 | | /* BITOP is the operator that is done to the two strings. GO_FULL_LEN determines |
2421 | | * if we do the operation for the longest strong. If GO_FULL_LEN == 0 then |
2422 | | * it only goes to the end of the shortest string (we do this for the string AND |
2423 | | * op, but the other ones go for the length of the longest string. */ |
2424 | | #define MVM_STRING_BITOP(BITOP, GO_FULL_LEN, OP_DESC) \ |
2425 | 3 | MVMString *res = NULL;\ |
2426 | 3 | MVMGrapheme32 *buffer = NULL;\ |
2427 | 3 | MVMStringIndex alen, blen, sgraphs = 0;\ |
2428 | 3 | size_t buf_size;\ |
2429 | 3 | MVMCodepointIter ci_a, ci_b;\ |
2430 | 3 | int nfg_is_safe = 1;\ |
2431 | 3 | MVM_string_check_arg(tc, a, (OP_DESC));\ |
2432 | 3 | MVM_string_check_arg(tc, b, (OP_DESC));\ |
2433 | 3 | \ |
2434 | 3 | alen = MVM_string_graphs_nocheck(tc, a);\ |
2435 | 3 | blen = MVM_string_graphs_nocheck(tc, b);\ |
2436 | 3 | buf_size = blen < alen ? alen : blen;\ |
2437 | 3 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * buf_size);\ |
2438 | 3 | MVM_string_ci_init(tc, &ci_a, a, 0, 0);\ |
2439 | 3 | MVM_string_ci_init(tc, &ci_b, b, 0, 0);\ |
2440 | 3 | \ |
2441 | 3 | /* First, binary-or up to the length of the shortest string. */\ |
2442 | 24 | while (MVM_string_ci_has_more(tc, &ci_a) && MVM_string_ci_has_more(tc, &ci_b)) {\ |
2443 | 21 | const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\ |
2444 | 21 | const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\ |
2445 | 21 | buffer[sgraphs++] = g_a BITOP g_b;\ |
2446 | 21 | if (nfg_is_safe && (!nfg_ok(g_a) || !nfg_ok(g_b)))\ |
2447 | 1 | nfg_is_safe = 0;\ |
2448 | 21 | if (sgraphs == buf_size) {\ |
2449 | 0 | buf_size += 16;\ |
2450 | 0 | buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\ |
2451 | 0 | }\ |
2452 | 21 | }\ |
2453 | 3 | if (GO_FULL_LEN) {\ |
2454 | 4 | while (MVM_string_ci_has_more(tc, &ci_a)) {\ |
2455 | 2 | const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\ |
2456 | 2 | buffer[sgraphs++] = g_a;\ |
2457 | 2 | if (nfg_is_safe && !nfg_ok(g_a))\ |
2458 | 0 | nfg_is_safe = 0;\ |
2459 | 2 | if (sgraphs == buf_size) {\ |
2460 | 2 | buf_size += 16;\ |
2461 | 2 | buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\ |
2462 | 2 | }\ |
2463 | 2 | }\ |
2464 | 2 | while (MVM_string_ci_has_more(tc, &ci_b)) {\ |
2465 | 0 | const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\ |
2466 | 0 | buffer[sgraphs++] = g_b;\ |
2467 | 0 | if (nfg_is_safe && !nfg_ok(g_b))\ |
2468 | 0 | nfg_is_safe = 0;\ |
2469 | 0 | if (sgraphs == buf_size) {\ |
2470 | 0 | buf_size += 16;\ |
2471 | 0 | buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\ |
2472 | 0 | }\ |
2473 | 0 | }\ |
2474 | 2 | }\ |
2475 | 3 | \ |
2476 | 3 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);\ |
2477 | 3 | res->body.storage_type = MVM_STRING_GRAPHEME_32;\ |
2478 | 3 | res->body.storage.blob_32 = buffer;\ |
2479 | 3 | res->body.num_graphs = sgraphs;\ |
2480 | 3 | \ |
2481 | 2 | res = nfg_is_safe ? res : re_nfg(tc, res);\ |
2482 | 3 | STRAND_CHECK(tc, res);\ |
2483 | 3 | return res;\ |
2484 | | |
2485 | | /* Takes two strings and AND's their characters. */ |
2486 | 1 | MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
2487 | 1 | MVM_STRING_BITOP( & , 0, "bitwise and") |
2488 | 1 | } |
2489 | | |
2490 | | /* Takes two strings and OR's their characters. */ |
2491 | 1 | MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
2492 | 1 | MVM_STRING_BITOP( | , 1, "bitwise or") |
2493 | 1 | } |
2494 | | /* Takes two strings and XOR's their characters. */ |
2495 | 1 | MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
2496 | 1 | MVM_STRING_BITOP( ^ , 1, "bitwise xor"); |
2497 | 0 | } |
2498 | | |
2499 | | /* Shortcuts for some unicode general category pvalues */ |
2500 | 218k | #define UPV_Nd MVM_UNICODE_PVALUE_GC_ND |
2501 | 230 | #define UPV_Lu MVM_UNICODE_PVALUE_GC_LU |
2502 | 243 | #define UPV_Ll MVM_UNICODE_PVALUE_GC_LL |
2503 | | #define UPV_Lt MVM_UNICODE_PVALUE_GC_LT |
2504 | | #define UPV_Lm MVM_UNICODE_PVALUE_GC_LM |
2505 | | #define UPV_Lo MVM_UNICODE_PVALUE_GC_LO |
2506 | 56 | #define UPV_Zs MVM_UNICODE_PVALUE_GC_ZS |
2507 | | #define UPV_Zl MVM_UNICODE_PVALUE_GC_ZL |
2508 | | #define UPV_Pc MVM_UNICODE_PVALUE_GC_PC |
2509 | | #define UPV_Pd MVM_UNICODE_PVALUE_GC_PD |
2510 | | #define UPV_Ps MVM_UNICODE_PVALUE_GC_PS |
2511 | | #define UPV_Pe MVM_UNICODE_PVALUE_GC_PE |
2512 | | #define UPV_Pi MVM_UNICODE_PVALUE_GC_PI |
2513 | | #define UPV_Pf MVM_UNICODE_PVALUE_GC_PF |
2514 | | #define UPV_Po MVM_UNICODE_PVALUE_GC_PO |
2515 | | |
2516 | | /* concatenating with "" ensures that only literal strings are accepted as argument. */ |
2517 | | #define STR_WITH_LEN(str) ("" str ""), (sizeof(str) - 1) |
2518 | | |
2519 | | #include "strings/unicode_prop_macros.h" |
2520 | | /* Checks if the specified grapheme is in the given character class. */ |
2521 | 1.57M | MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) { |
2522 | 1.57M | /* If it's a synthetic, then grab the base codepoint. */ |
2523 | 1.57M | MVMCodepoint cp; |
2524 | 1.57M | if (0 <= g) |
2525 | 1.57M | cp = (MVMCodepoint)g; |
2526 | 1.57M | else |
2527 | 5 | cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2528 | 1.57M | |
2529 | 1.57M | switch (cclass) { |
2530 | 30.3k | case MVM_CCLASS_ANY: |
2531 | 30.3k | return 1; |
2532 | 30.3k | |
2533 | 230 | case MVM_CCLASS_UPPERCASE: |
2534 | 230 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
2535 | 230 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu); |
2536 | 30.3k | |
2537 | 243 | case MVM_CCLASS_LOWERCASE: |
2538 | 243 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
2539 | 243 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll); |
2540 | 30.3k | |
2541 | 659k | case MVM_CCLASS_WORD: |
2542 | 659k | if (cp <= 'z') { /* short circuit common case */ |
2543 | 658k | if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9')) |
2544 | 447k | return 1; |
2545 | 658k | else |
2546 | 211k | return 0; |
2547 | 658k | } |
2548 | 659k | /* Deliberate fall-through; word is _ or digit or alphabetic. */ |
2549 | 659k | |
2550 | 1.07k | case MVM_CCLASS_ALPHANUMERIC: |
2551 | 1.07k | if (cp <= '9' && cp >= '0') /* short circuit common case */ |
2552 | 5 | return 1; |
2553 | 1.06k | if (MVM_unicode_codepoint_has_property_value(tc, cp, |
2554 | 1.06k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd)) |
2555 | 1 | return 1; |
2556 | 1.06k | /* Deliberate fall-through; alphanumeric is digit or alphabetic. */ |
2557 | 1.06k | |
2558 | 208k | case MVM_CCLASS_ALPHABETIC: |
2559 | 208k | if (cp <= 'z') { /* short circuit common case */ |
2560 | 205k | if (cp >= 'a' || (cp >= 'A' && cp <= 'Z')) |
2561 | 113k | return 1; |
2562 | 205k | else |
2563 | 91.9k | return 0; |
2564 | 205k | } |
2565 | 208k | /* Property L covers Lo, Ll, Lu, Lt, Lm */ |
2566 | 2.93k | return !!MVM_unicode_codepoint_get_property_int(tc, cp, |
2567 | 2.93k | MVM_UNICODE_PROPERTY_L); |
2568 | 208k | /* TODO: Maybe we want MVM_UNICODE_PROPERTY_ALPHABETIC instead? */ |
2569 | 208k | |
2570 | 345k | case MVM_CCLASS_NUMERIC: |
2571 | 345k | if (cp <= '9' && cp >= '0') /* short circuit common case */ |
2572 | 128k | return 1; |
2573 | 217k | return MVM_unicode_codepoint_has_property_value(tc, cp, |
2574 | 217k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd); |
2575 | 345k | |
2576 | 185 | case MVM_CCLASS_HEXADECIMAL: |
2577 | 185 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
2578 | 185 | MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1); |
2579 | 345k | |
2580 | 145k | case MVM_CCLASS_WHITESPACE: |
2581 | 145k | return MVM_CP_is_White_Space(cp); |
2582 | 345k | |
2583 | 63 | case MVM_CCLASS_BLANK: |
2584 | 63 | if (cp == '\t') |
2585 | 7 | return 1; |
2586 | 56 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
2587 | 56 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs); |
2588 | 63 | |
2589 | 60 | case MVM_CCLASS_CONTROL: { |
2590 | 60 | return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160); |
2591 | 63 | } |
2592 | 63 | |
2593 | 55 | case MVM_CCLASS_PRINTING: { |
2594 | 55 | return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160)); |
2595 | 63 | } |
2596 | 63 | |
2597 | 9.15k | case MVM_CCLASS_PUNCTUATION: |
2598 | 9.15k | return !!MVM_unicode_codepoint_get_property_int(tc, cp, |
2599 | 9.15k | MVM_UNICODE_PROPERTY_P); |
2600 | 63 | |
2601 | 172k | case MVM_CCLASS_NEWLINE: { |
2602 | 172k | if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' || |
2603 | 165k | cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp)) |
2604 | 6.92k | return 1; |
2605 | 172k | } |
2606 | 172k | |
2607 | 165k | default: |
2608 | 165k | return 0; |
2609 | 1.57M | } |
2610 | 1.57M | } |
2611 | | |
2612 | | /* Checks if the character at the specified offset is a member of the |
2613 | | * indicated character class. */ |
2614 | 237k | MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) { |
2615 | 237k | MVM_string_check_arg(tc, s, "is_cclass"); |
2616 | 237k | if (MVM_UNLIKELY(offset < 0 || MVM_string_graphs_nocheck(tc, s) <= offset)) |
2617 | 18 | return 0; |
2618 | 237k | return MVM_string_grapheme_is_cclass(tc, cclass, MVM_string_get_grapheme_at_nocheck(tc, s, offset)); |
2619 | 237k | } |
2620 | | |
2621 | | /* Searches for the next char that is in the specified character class. */ |
2622 | 18.0k | MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) { |
2623 | 18.0k | MVMGraphemeIter gi; |
2624 | 18.0k | MVMint64 length, end, pos; |
2625 | 18.0k | |
2626 | 18.0k | MVM_string_check_arg(tc, s, "find_cclass"); |
2627 | 18.0k | |
2628 | 18.0k | length = MVM_string_graphs_nocheck(tc, s); |
2629 | 18.0k | end = offset + count; |
2630 | 17.2k | end = length < end ? length : end; |
2631 | 18.0k | if (offset < 0 || offset >= length) |
2632 | 149 | return end; |
2633 | 18.0k | |
2634 | 17.8k | MVM_string_gi_init(tc, &gi, s); |
2635 | 17.8k | MVM_string_gi_move_to(tc, &gi, offset); |
2636 | 17.8k | switch (cclass) { |
2637 | 609 | case MVM_CCLASS_WHITESPACE: |
2638 | 4.14k | for (pos = offset; pos < end; pos++) { |
2639 | 3.55k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2640 | 3.55k | MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2641 | 3.55k | if (MVM_CP_is_White_Space(cp)) |
2642 | 20 | return pos; |
2643 | 3.55k | } |
2644 | 589 | break; |
2645 | 17.1k | case MVM_CCLASS_NEWLINE: |
2646 | 563k | for (pos = offset; pos < end; pos++) { |
2647 | 563k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2648 | 563k | MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2649 | 563k | if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' || |
2650 | 546k | cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp)) |
2651 | 17.1k | return pos; |
2652 | 563k | } |
2653 | 53 | break; |
2654 | 120 | default: |
2655 | 369 | for (pos = offset; pos < end; pos++) { |
2656 | 345 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2657 | 345 | if (MVM_string_grapheme_is_cclass(tc, cclass, g) > 0) |
2658 | 96 | return pos; |
2659 | 345 | } |
2660 | 17.8k | } |
2661 | 17.8k | |
2662 | 666 | return end; |
2663 | 17.8k | } |
2664 | | |
2665 | | /* Searches for the next char that is not in the specified character class. */ |
2666 | 24.8k | MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) { |
2667 | 24.8k | MVMGraphemeIter gi; |
2668 | 24.8k | MVMint64 length, end, pos; |
2669 | 24.8k | |
2670 | 24.8k | MVM_string_check_arg(tc, s, "find_not_cclass"); |
2671 | 24.8k | |
2672 | 24.8k | length = MVM_string_graphs_nocheck(tc, s); |
2673 | 24.8k | end = offset + count; |
2674 | 24.2k | end = length < end ? length : end; |
2675 | 24.8k | if (offset < 0 || offset >= length) |
2676 | 589 | return end; |
2677 | 24.8k | |
2678 | 24.3k | MVM_string_gi_init(tc, &gi, s); |
2679 | 24.3k | MVM_string_gi_move_to(tc, &gi, offset); |
2680 | 24.3k | switch (cclass) { |
2681 | 663 | case MVM_CCLASS_WHITESPACE: |
2682 | 735 | for (pos = offset; pos < end; pos++) { |
2683 | 730 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2684 | 730 | MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2685 | 730 | if (!MVM_CP_is_White_Space(cp)) |
2686 | 658 | return pos; |
2687 | 730 | } |
2688 | 5 | break; |
2689 | 12 | case MVM_CCLASS_NEWLINE: |
2690 | 13 | for (pos = offset; pos < end; pos++) { |
2691 | 13 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2692 | 13 | MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0]; |
2693 | 13 | if (!(cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' || |
2694 | 12 | cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))) |
2695 | 12 | return pos; |
2696 | 13 | } |
2697 | 0 | break; |
2698 | 23.6k | default: |
2699 | 124k | for (pos = offset; pos < end; pos++) { |
2700 | 124k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
2701 | 124k | if (!MVM_string_grapheme_is_cclass(tc, cclass, g)) |
2702 | 23.6k | return pos; |
2703 | 124k | } |
2704 | 24.3k | } |
2705 | 24.3k | |
2706 | 26 | return end; |
2707 | 24.3k | } |
2708 | | |
2709 | | static MVMint16 encoding_name_init = 0; |
2710 | | static MVMString *encoding_utf8_name = NULL; |
2711 | | static MVMString *encoding_ascii_name = NULL; |
2712 | | static MVMString *encoding_latin1_name = NULL; |
2713 | | static MVMString *encoding_utf16_name = NULL; |
2714 | | static MVMString *encoding_windows1252_name = NULL; |
2715 | | static MVMString *encoding_windows1251_name = NULL; |
2716 | | static MVMString *encoding_shiftjis_name = NULL; |
2717 | | static MVMString *encoding_utf8_c8_name = NULL; |
2718 | 13.7k | MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) { |
2719 | 13.7k | MVM_string_check_arg(tc, name, "find encoding"); |
2720 | 13.7k | if (!encoding_name_init) { |
2721 | 144 | MVM_gc_allocate_gen2_default_set(tc); |
2722 | 144 | encoding_utf8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8"); |
2723 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name"); |
2724 | 144 | encoding_ascii_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii"); |
2725 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name"); |
2726 | 144 | encoding_latin1_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1"); |
2727 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name"); |
2728 | 144 | encoding_utf16_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16"); |
2729 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name"); |
2730 | 144 | encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252"); |
2731 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name"); |
2732 | 144 | encoding_windows1251_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1251"); |
2733 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1251_name, "Encoding name"); |
2734 | 144 | encoding_shiftjis_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-932"); |
2735 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_shiftjis_name, "Encoding name"); |
2736 | 144 | encoding_utf8_c8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8"); |
2737 | 144 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name"); |
2738 | 144 | encoding_name_init = 1; |
2739 | 144 | MVM_gc_allocate_gen2_default_clear(tc); |
2740 | 144 | } |
2741 | 13.7k | |
2742 | 13.7k | if (MVM_string_equal(tc, name, encoding_utf8_name)) { |
2743 | 13.6k | return MVM_encoding_type_utf8; |
2744 | 13.6k | } |
2745 | 48 | else if (MVM_string_equal(tc, name, encoding_ascii_name)) { |
2746 | 7 | return MVM_encoding_type_ascii; |
2747 | 7 | } |
2748 | 41 | else if (MVM_string_equal(tc, name, encoding_latin1_name)) { |
2749 | 5 | return MVM_encoding_type_latin1; |
2750 | 5 | } |
2751 | 36 | else if (MVM_string_equal(tc, name, encoding_windows1252_name)) { |
2752 | 16 | return MVM_encoding_type_windows1252; |
2753 | 16 | } |
2754 | 20 | else if (MVM_string_equal(tc, name, encoding_windows1251_name)) { |
2755 | 0 | return MVM_encoding_type_windows1251; |
2756 | 0 | } |
2757 | 20 | else if (MVM_string_equal(tc, name, encoding_utf16_name)) { |
2758 | 2 | return MVM_encoding_type_utf16; |
2759 | 2 | } |
2760 | 18 | else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) { |
2761 | 17 | return MVM_encoding_type_utf8_c8; |
2762 | 17 | } |
2763 | 1 | else if (MVM_string_equal(tc, name, encoding_shiftjis_name)) { |
2764 | 0 | return MVM_encoding_type_shiftjis; |
2765 | 0 | } |
2766 | 1 | else { |
2767 | 1 | char *c_name = MVM_string_utf8_encode_C_string(tc, name); |
2768 | 1 | char *waste[] = { c_name, NULL }; |
2769 | 1 | MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'", |
2770 | 1 | c_name); |
2771 | 1 | } |
2772 | 13.7k | } |
2773 | | |
2774 | | /* Turns a codepoint into a string. If required uses the normalizer to ensure |
2775 | | * that we get a valid NFG string (NFG is a superset of NFC, and singleton |
2776 | | * decompositions exist). */ |
2777 | 8.53k | MVMString * MVM_string_chr(MVMThreadContext *tc, MVMint64 cp) { |
2778 | 8.53k | MVMString *s = NULL; |
2779 | 8.53k | MVMGrapheme32 g; |
2780 | 8.53k | |
2781 | 8.53k | if (cp < 0) |
2782 | 0 | MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative"); |
2783 | 8.53k | /* If the codepoint decomposes we may need to normalize it. |
2784 | 8.53k | * The first cp that decomposes is U+0340, but to be on the safe side |
2785 | 8.53k | * for now we go with the first significant character which at the time |
2786 | 8.53k | * of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */ |
2787 | 8.53k | if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC |
2788 | 220 | && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE) |
2789 | 220 | != MVM_UNICODE_PVALUE_DT_NONE) { |
2790 | 80 | |
2791 | 80 | MVMNormalizer norm; |
2792 | 80 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
2793 | 80 | if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) { |
2794 | 80 | MVM_unicode_normalizer_eof(tc, &norm); |
2795 | 80 | g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
2796 | 80 | } |
2797 | 80 | MVM_unicode_normalizer_cleanup(tc, &norm); |
2798 | 80 | } |
2799 | 8.45k | else { |
2800 | 8.45k | g = (MVMGrapheme32) cp; |
2801 | 8.45k | } |
2802 | 8.53k | |
2803 | 8.53k | s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString)); |
2804 | 8.53k | if (can_fit_into_8bit(g)) { |
2805 | 8.29k | s->body.storage_type = MVM_STRING_GRAPHEME_8; |
2806 | 8.29k | s->body.storage.blob_8 = MVM_malloc(sizeof(MVMGrapheme8)); |
2807 | 8.29k | s->body.storage.blob_8[0] = g; |
2808 | 243 | } else { |
2809 | 243 | s->body.storage_type = MVM_STRING_GRAPHEME_32; |
2810 | 243 | s->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32)); |
2811 | 243 | s->body.storage.blob_32[0] = g; |
2812 | 243 | } |
2813 | 8.53k | s->body.num_graphs = 1; |
2814 | 8.53k | return s; |
2815 | 8.53k | } |
2816 | | |
2817 | | /* Takes a string and computes a hash code for it, storing it in the hash code |
2818 | | * cache field of the string. Hashing code is derived from the Jenkins hash |
2819 | | * implementation in uthash.h. */ |
2820 | | typedef union { |
2821 | | MVMint32 graphs[3]; |
2822 | | unsigned char bytes[12]; |
2823 | | } MVMJenHashGraphemeView; |
2824 | | |
2825 | 2.61M | MVM_STATIC_INLINE void MVM_hash_add_three (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv) { |
2826 | 2.61M | *hj_i += hash_block->graphs[0]; |
2827 | 2.61M | *hj_j += hash_block->graphs[1]; |
2828 | 2.61M | *hashv += hash_block->graphs[2]; |
2829 | 2.61M | HASH_JEN_MIX(*hj_i, *hj_j, *hashv); |
2830 | 2.61M | } |
2831 | 915k | MVM_STATIC_INLINE void MVM_hash_finish (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv, MVMStringIndex sgraphs, MVMStringIndex graphs_remaining) { |
2832 | 915k | /* Mix in key length (in bytes, not graphemes). */ |
2833 | 915k | *hashv += sgraphs * sizeof(MVMGrapheme32); |
2834 | 915k | |
2835 | 915k | /* Now handle trailing graphemes (must be 2, 1, or 0). */ |
2836 | 915k | /* NOTE: this is weird since it changes the order in different cases. This |
2837 | 915k | * is just replicating old functionality. */ |
2838 | 915k | switch (graphs_remaining) { |
2839 | 304k | case 2: |
2840 | 304k | *hj_j += hash_block->graphs[0]; |
2841 | 304k | *hj_i += hash_block->graphs[1]; |
2842 | 304k | break; |
2843 | 304k | /* Fallthrough */ |
2844 | 296k | case 1: |
2845 | 296k | *hj_i += hash_block->graphs[0]; |
2846 | 915k | } |
2847 | 915k | HASH_JEN_MIX(*hj_i, *hj_j, *hashv); |
2848 | 915k | /* Because we check if MVMString->body.cached_hash_code == 0 to tell if |
2849 | 915k | * we have not yet computed the hash code, ensure that hashv is never 0 |
2850 | 915k | * by adding the length of the string to hashv iff hashv == 0. Since both |
2851 | 915k | * the hashv and MVMStringIndex are both uint32, there should never be any |
2852 | 915k | * overflow. Only problematic case is if the string is of length 0 and |
2853 | 915k | * hashv is zero, though this is very very unlikely (if possible at all) |
2854 | 915k | * and it should be very fast to calculate the hash so as to be negligible. */ |
2855 | 915k | if (*hashv == 0) { |
2856 | 0 | *hashv += sgraphs; |
2857 | 0 | } |
2858 | 915k | } |
2859 | 915k | void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) { |
2860 | 915k | /* The hash algorithm works in bytes. Since we can represent strings in a |
2861 | 915k | * number of ways, and we want consistent hashing, then we'll read the |
2862 | 915k | * strings using the grapheme iterator in groups of 3, using 32-bit ints |
2863 | 915k | * for the graphemes no matter what the string really holds them as. Then |
2864 | 915k | * we'll use the bytes view of that in the hashing function. */ |
2865 | 915k | |
2866 | 915k | MVMStringIndex graphs_remaining, sgraphs; |
2867 | 915k | |
2868 | 915k | /* Initialize hash state. */ |
2869 | 915k | MVMhashv hashv = tc->instance->hashSecret; |
2870 | 915k | MVMuint32 hj_i, hj_j; |
2871 | 915k | hj_i = hj_j = 0x9e3779b9; |
2872 | 915k | graphs_remaining = sgraphs = MVM_string_graphs(tc, s); |
2873 | 915k | |
2874 | 915k | switch (s->body.storage_type) { |
2875 | 673k | case MVM_STRING_GRAPHEME_ASCII: |
2876 | 673k | case MVM_STRING_GRAPHEME_8: { |
2877 | 673k | int i; |
2878 | 673k | MVMJenHashGraphemeView hash_block; |
2879 | 2.84M | for (i = 0; 3 <= sgraphs - i; i += 3) { |
2880 | 2.16M | hash_block.graphs[0] = s->body.storage.blob_8[i]; |
2881 | 2.16M | hash_block.graphs[1] = s->body.storage.blob_8[i+1]; |
2882 | 2.16M | hash_block.graphs[2] = s->body.storage.blob_8[i+2]; |
2883 | 2.16M | MVM_hash_add_three( |
2884 | 2.16M | &hash_block, |
2885 | 2.16M | &hj_i, &hj_j, &hashv); |
2886 | 2.16M | } |
2887 | 673k | graphs_remaining = sgraphs - i; |
2888 | 673k | switch (graphs_remaining) { |
2889 | 217k | case 1: |
2890 | 217k | hash_block.graphs[0] = s->body.storage.blob_8[i]; |
2891 | 217k | break; |
2892 | 222k | case 2: |
2893 | 222k | hash_block.graphs[0] = s->body.storage.blob_8[i]; |
2894 | 222k | hash_block.graphs[1] = s->body.storage.blob_8[i+1]; |
2895 | 222k | break; |
2896 | 673k | } |
2897 | 673k | MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining); |
2898 | 673k | break; |
2899 | 673k | } |
2900 | 94.3k | case MVM_STRING_GRAPHEME_32: { |
2901 | 94.3k | int i; |
2902 | 218k | for (i = 0; 3 <= sgraphs - i; i += 3) { |
2903 | 123k | MVM_hash_add_three( |
2904 | 123k | (MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i), |
2905 | 123k | &hj_i, &hj_j, &hashv); |
2906 | 123k | } |
2907 | 94.3k | graphs_remaining = sgraphs - i; |
2908 | 94.3k | MVM_hash_finish((MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i), &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining); |
2909 | 94.3k | break; |
2910 | 673k | } |
2911 | 147k | default: { |
2912 | 147k | MVMGraphemeIter gi; |
2913 | 147k | MVMJenHashGraphemeView hash_block; |
2914 | 147k | /* Work through the string 3 graphemes at a time. */ |
2915 | 147k | MVM_string_gi_init(tc, &gi, s); |
2916 | 471k | while (3 <= graphs_remaining) { |
2917 | 323k | hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi); |
2918 | 323k | hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi); |
2919 | 323k | hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi); |
2920 | 323k | MVM_hash_add_three( |
2921 | 323k | &hash_block, |
2922 | 323k | &hj_i, &hj_j, &hashv); |
2923 | 323k | graphs_remaining -= 3; |
2924 | 323k | } |
2925 | 147k | /* Now handle trailing graphemes (must be 2, 1, or 0). */ |
2926 | 147k | switch (graphs_remaining) { |
2927 | 61.4k | case 1: |
2928 | 61.4k | hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi); |
2929 | 61.4k | break; |
2930 | 50.1k | case 2: |
2931 | 50.1k | hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi); |
2932 | 50.1k | hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi); |
2933 | 50.1k | break; |
2934 | 147k | |
2935 | 147k | } |
2936 | 147k | MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining); |
2937 | 147k | } |
2938 | 915k | } |
2939 | 915k | /* Store computed hash value. */ |
2940 | 915k | s->body.cached_hash_code = hashv; |
2941 | 915k | } |