/home/travis/build/MoarVM/MoarVM/src/strings/ops.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "platform/memmem.h" |
2 | | #include "moar.h" |
3 | | #define MVM_DEBUG_STRANDS 0 |
4 | | |
5 | | #if MVM_DEBUG_STRANDS |
6 | | static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) { |
7 | | MVMGraphemeIter gi; |
8 | | MVMuint32 len; |
9 | | MVM_string_gi_init(tc, &gi, s); |
10 | | len = 0; |
11 | | while (MVM_string_gi_has_more(tc, &gi)) { |
12 | | MVM_string_gi_get_grapheme(tc, &gi); |
13 | | len++; |
14 | | } |
15 | | if (len != MVM_string_graphs(tc, s)) |
16 | | MVM_exception_throw_adhoc(tc, |
17 | | "Strand sanity check failed (stand length %d != num_graphs %d)", |
18 | | len, MVM_string_graphs(tc, s)); |
19 | | } |
20 | | #define STRAND_CHECK(tc, s) check_strand_sanity(tc, s); |
21 | | #else |
22 | | #define STRAND_CHECK(tc, s) |
23 | | #endif |
24 | | |
25 | | /* Allocates strand storage. */ |
26 | 693k | static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) { |
27 | 693k | return MVM_malloc(num_strands * sizeof(MVMStringStrand)); |
28 | 693k | } |
29 | | |
30 | | /* Copies strands from one strand string to another. */ |
31 | | static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset, |
32 | 148k | MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) { |
33 | 148k | assert(from->body.storage_type == MVM_STRING_STRAND); |
34 | 148k | assert(to->body.storage_type == MVM_STRING_STRAND); |
35 | 148k | memcpy( |
36 | 148k | to->body.storage.strands + to_offset, |
37 | 148k | from->body.storage.strands + from_offset, |
38 | 148k | num_strands * sizeof(MVMStringStrand)); |
39 | 148k | } |
40 | | |
41 | | /* If a string is currently using 32bit storage, turn it into using |
42 | | * 8 bit storage. Doesn't do any checks at all. */ |
43 | 24.2k | static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) { |
44 | 24.2k | MVMGrapheme32 *old_buf = str->body.storage.blob_32; |
45 | 24.2k | MVMStringIndex i; |
46 | 24.2k | str->body.storage_type = MVM_STRING_GRAPHEME_8; |
47 | 24.2k | str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8)); |
48 | 24.2k | |
49 | 405k | for (i = 0; i < str->body.num_graphs; i++) { |
50 | 381k | str->body.storage.blob_8[i] = old_buf[i]; |
51 | 381k | } |
52 | 24.2k | |
53 | 24.2k | MVM_free(old_buf); |
54 | 24.2k | } |
55 | | |
56 | | /* Collapses a bunch of strands into a single blob string. */ |
57 | 6 | static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) { |
58 | 6 | MVMString *result; |
59 | 6 | MVMStringIndex i; |
60 | 6 | MVMuint32 ographs; |
61 | 6 | MVMGraphemeIter gi; |
62 | 6 | |
63 | 6 | MVMROOT(tc, orig, { |
64 | 6 | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
65 | 6 | }); |
66 | 6 | ographs = MVM_string_graphs(tc, orig); |
67 | 6 | result->body.num_graphs = ographs; |
68 | 6 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
69 | 6 | result->body.storage.blob_32 = MVM_malloc(ographs * sizeof(MVMGrapheme32)); |
70 | 6 | |
71 | 6 | MVM_string_gi_init(tc, &gi, orig); |
72 | 493 | for (i = 0; i < ographs; i++) { |
73 | 487 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
74 | 487 | result->body.storage.blob_32[i] = g; |
75 | 487 | if (g < -127 || g > 127) { |
76 | 0 | /* If we know we can't fit into 8 bits, enter a tighter loop for maximum speed */ |
77 | 0 | for (i++; i < ographs; i++) { |
78 | 0 | result->body.storage.blob_32[i] = MVM_string_gi_get_grapheme(tc, &gi); |
79 | 0 | } |
80 | 0 | return result; |
81 | 0 | } |
82 | 487 | } |
83 | 6 | /* If we get here, we didn't see any cp's lower than -127 or higher than 127 |
84 | 6 | * so turn it into an 8 bit string */ |
85 | 6 | turn_32bit_into_8bit_unchecked(tc, result); |
86 | 6 | return result; |
87 | 6 | } |
88 | | |
89 | | /* Takes a string that is no longer in NFG form after some concatenation-style |
90 | | * operation, and returns a new string that is in NFG. Note that we could do a |
91 | | * much, much, smarter thing in the future that doesn't involve all of this |
92 | | * copying and allocation and re-doing the whole string, but cases like this |
93 | | * should be fairly rare anyway, so go with simplicity for now. */ |
94 | 40 | static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) { |
95 | 40 | MVMNormalizer norm; |
96 | 40 | MVMCodepointIter ci; |
97 | 40 | MVMint32 ready; |
98 | 40 | MVMString *out; |
99 | 40 | MVMuint32 bufsize = in->body.num_graphs; |
100 | 40 | |
101 | 40 | /* Create the output buffer. We used to believe it can't ever be bigger |
102 | 40 | * than the initial estimate, but utf8-c8 showed us otherwise. */ |
103 | 40 | MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
104 | 40 | MVMint64 out_pos = 0; |
105 | 40 | |
106 | 40 | /* Iterate codepoints and normalizer. */ |
107 | 40 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
108 | 40 | MVM_string_ci_init(tc, &ci, in, 0); |
109 | 274 | while (MVM_string_ci_has_more(tc, &ci)) { |
110 | 234 | MVMGrapheme32 g; |
111 | 234 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g); |
112 | 234 | if (ready) { |
113 | 153 | if (out_pos + ready > bufsize) { |
114 | 0 | /* Doubling up the buffer size seems excessive, so just |
115 | 0 | * add a generous amount of storage */ |
116 | 0 | bufsize += ready + 32; |
117 | 0 | out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32)); |
118 | 0 | } |
119 | 153 | out_buffer[out_pos++] = g; |
120 | 189 | while (--ready > 0) { |
121 | 36 | out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
122 | 36 | } |
123 | 153 | } |
124 | 234 | } |
125 | 40 | MVM_unicode_normalizer_eof(tc, &norm); |
126 | 40 | ready = MVM_unicode_normalizer_available(tc, &norm); |
127 | 40 | if (out_pos + ready > bufsize) { |
128 | 0 | bufsize += ready + 1; |
129 | 0 | out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32)); |
130 | 0 | } |
131 | 50 | while (ready--) { |
132 | 10 | out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
133 | 10 | } |
134 | 40 | MVM_unicode_normalizer_cleanup(tc, &norm); |
135 | 40 | |
136 | 40 | /* Build result string. */ |
137 | 40 | out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
138 | 40 | out->body.storage.blob_32 = out_buffer; |
139 | 40 | out->body.storage_type = MVM_STRING_GRAPHEME_32; |
140 | 40 | out->body.num_graphs = out_pos; |
141 | 40 | return out; |
142 | 40 | } |
143 | | |
144 | | /* Returns nonzero if two substrings are equal, doesn't check bounds */ |
145 | | MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a, |
146 | 15.9M | MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) { |
147 | 15.9M | MVMGraphemeIter gia; |
148 | 15.9M | MVMGraphemeIter gib; |
149 | 15.9M | MVMint64 i; |
150 | 15.9M | |
151 | 15.9M | /* Fast paths when storage types are identical. */ |
152 | 15.9M | switch (a->body.storage_type) { |
153 | 395k | case MVM_STRING_GRAPHEME_32: |
154 | 395k | if (b->body.storage_type == MVM_STRING_GRAPHEME_32) |
155 | 142k | return 0 == memcmp( |
156 | 142k | a->body.storage.blob_32 + starta, |
157 | 142k | b->body.storage.blob_32 + startb, |
158 | 142k | length * sizeof(MVMGrapheme32)); |
159 | 253k | break; |
160 | 12.3M | case MVM_STRING_GRAPHEME_ASCII: |
161 | 12.3M | case MVM_STRING_GRAPHEME_8: |
162 | 12.3M | if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII || |
163 | 12.3M | b->body.storage_type == MVM_STRING_GRAPHEME_8) |
164 | 11.9M | return 0 == memcmp( |
165 | 11.9M | a->body.storage.blob_8 + starta, |
166 | 11.9M | b->body.storage.blob_8 + startb, |
167 | 11.9M | length); |
168 | 380k | break; |
169 | 15.9M | } |
170 | 15.9M | |
171 | 15.9M | /* Normal path, for the rest of the time. */ |
172 | 3.86M | MVM_string_gi_init(tc, &gia, a); |
173 | 3.86M | MVM_string_gi_init(tc, &gib, b); |
174 | 3.86M | MVM_string_gi_move_to(tc, &gia, starta); |
175 | 3.86M | MVM_string_gi_move_to(tc, &gib, startb); |
176 | 6.14M | for (i = 0; i < length; i++) |
177 | 5.62M | if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib)) |
178 | 3.34M | return 0; |
179 | 518k | return 1; |
180 | 3.86M | } |
181 | | |
182 | | /* Returns the codepoint without doing checks, for internal VM use only. */ |
183 | 10.4M | MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString *a, MVMint64 index) { |
184 | 10.4M | switch (a->body.storage_type) { |
185 | 8.73M | case MVM_STRING_GRAPHEME_32: |
186 | 8.73M | return a->body.storage.blob_32[index]; |
187 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
188 | 0 | return a->body.storage.blob_ascii[index]; |
189 | 1.49M | case MVM_STRING_GRAPHEME_8: |
190 | 1.49M | return a->body.storage.blob_8[index]; |
191 | 234k | case MVM_STRING_STRAND: { |
192 | 234k | MVMGraphemeIter gi; |
193 | 234k | MVM_string_gi_init(tc, &gi, a); |
194 | 234k | MVM_string_gi_move_to(tc, &gi, index); |
195 | 234k | return MVM_string_gi_get_grapheme(tc, &gi); |
196 | 8.73M | } |
197 | 0 | default: |
198 | 0 | MVM_exception_throw_adhoc(tc, "String corruption detected: bad storage type"); |
199 | 10.4M | } |
200 | 10.4M | } |
201 | | |
202 | | /* Returns the location of one string in another or -1 */ |
203 | 405k | MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) { |
204 | 405k | size_t index = (size_t)start; |
205 | 405k | MVMStringIndex hgraphs = MVM_string_graphs(tc, haystack), ngraphs = MVM_string_graphs(tc, needle); |
206 | 405k | MVM_string_check_arg(tc, haystack, "index search target"); |
207 | 405k | MVM_string_check_arg(tc, needle, "index search term"); |
208 | 405k | |
209 | 405k | if (!ngraphs) |
210 | 48 | return start <= hgraphs ? start : -1; /* the empty string is in any other string */ |
211 | 405k | |
212 | 405k | if (!hgraphs) |
213 | 41 | return -1; |
214 | 405k | |
215 | 405k | if (start < 0 || start >= hgraphs) |
216 | 148 | return -1; |
217 | 405k | |
218 | 405k | if (ngraphs > hgraphs || ngraphs < 1) |
219 | 3 | return -1; |
220 | 405k | |
221 | 405k | /* Fast paths when storage types are identical. Uses memmem function, which |
222 | 405k | * uses Knuth-Morris-Pratt algorithm on Linux and on others |
223 | 405k | * Crochemore+Perrin two-way string matching */ |
224 | 405k | switch (haystack->body.storage_type) { |
225 | 18.1k | case MVM_STRING_GRAPHEME_32: |
226 | 18.1k | if (needle->body.storage_type == MVM_STRING_GRAPHEME_32) { |
227 | 15.3k | void *start_ptr = haystack->body.storage.blob_32 + start; |
228 | 15.3k | void *mm_return_32; |
229 | 15.3k | void *end_ptr = (char*)start_ptr + sizeof(MVMGrapheme32) * (hgraphs - start); |
230 | 15.3k | do { |
231 | 15.3k | /* Keep as void* to not lose precision */ |
232 | 15.3k | mm_return_32 = MVM_memmem( |
233 | 15.3k | start_ptr, /* start position */ |
234 | 15.3k | (char*)end_ptr - (char*)start_ptr, /* length of haystack from start position to end */ |
235 | 15.3k | needle->body.storage.blob_32, /* needle start */ |
236 | 15.3k | ngraphs * sizeof(MVMGrapheme32) /* needle length */ |
237 | 15.3k | ); |
238 | 15.3k | if (mm_return_32 == NULL) |
239 | 25 | return -1; |
240 | 15.3k | } /* If we aren't on a 32 bit boundary then continue from where we left off (unlikely but possible) */ |
241 | 15.3k | while ( ( (char*)mm_return_32 - (char*)haystack->body.storage.blob_32) % sizeof(MVMGrapheme32) |
242 | 0 | && ( start_ptr = mm_return_32 ) /* Set the new start pointer at where we left off */ |
243 | 0 | && ( end_ptr > start_ptr ) /* Check we aren't past the end of the string just in case */ |
244 | 15.3k | ); |
245 | 15.3k | |
246 | 15.3k | return (MVMGrapheme32*)mm_return_32 - haystack->body.storage.blob_32; |
247 | 15.3k | } |
248 | 2.80k | break; |
249 | 354k | case MVM_STRING_GRAPHEME_8: |
250 | 354k | if (needle->body.storage_type == MVM_STRING_GRAPHEME_8) { |
251 | 347k | void *mm_return_8 = MVM_memmem( |
252 | 347k | haystack->body.storage.blob_8 + start, /* start position */ |
253 | 347k | (hgraphs - start) * sizeof(MVMGrapheme8), /* length of haystack from start position to end */ |
254 | 347k | needle->body.storage.blob_8, /* needle start */ |
255 | 347k | ngraphs * sizeof(MVMGrapheme8) /* needle length */ |
256 | 347k | ); |
257 | 347k | if (mm_return_8 == NULL) |
258 | 296k | return -1; |
259 | 347k | else |
260 | 51.0k | return (MVMGrapheme8*)mm_return_8 - haystack->body.storage.blob_8; |
261 | 347k | } |
262 | 6.90k | break; |
263 | 405k | } |
264 | 405k | |
265 | 405k | /* brute force for now. horrible, yes. halp. */ |
266 | 2.98M | while (index <= hgraphs - ngraphs) { |
267 | 2.96M | if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) { |
268 | 19.3k | return (MVMint64)index; |
269 | 19.3k | } |
270 | 2.94M | index++; |
271 | 2.94M | } |
272 | 23.1k | return -1; |
273 | 42.5k | } |
274 | | |
275 | | /* Returns the location of one string in another or -1 */ |
276 | 12 | MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) { |
277 | 12 | MVMint64 result = -1; |
278 | 12 | size_t index; |
279 | 12 | MVMStringIndex hgraphs, ngraphs; |
280 | 12 | |
281 | 12 | MVM_string_check_arg(tc, haystack, "rindex search target"); |
282 | 12 | MVM_string_check_arg(tc, needle, "rindex search term"); |
283 | 12 | hgraphs = MVM_string_graphs_nocheck(tc, haystack); |
284 | 12 | ngraphs = MVM_string_graphs_nocheck(tc, needle); |
285 | 12 | |
286 | 12 | if (!ngraphs) { |
287 | 4 | if (start >= 0) |
288 | 3 | return start <= hgraphs ? start : -1; /* the empty string is in any other string */ |
289 | 4 | else |
290 | 1 | return hgraphs; /* no start, so return end */ |
291 | 4 | } |
292 | 12 | |
293 | 8 | if (!hgraphs) |
294 | 0 | return -1; |
295 | 8 | |
296 | 8 | if (ngraphs > hgraphs || ngraphs < 1) |
297 | 0 | return -1; |
298 | 8 | |
299 | 8 | if (start == -1) |
300 | 2 | start = hgraphs - ngraphs; |
301 | 8 | |
302 | 8 | if (start < 0 || start >= hgraphs) |
303 | 8 | /* maybe return -1 instead? */ |
304 | 0 | MVM_exception_throw_adhoc(tc, "index start offset out of range"); |
305 | 8 | |
306 | 8 | index = start; |
307 | 8 | |
308 | 8 | if (index + ngraphs > hgraphs) { |
309 | 1 | index = hgraphs - ngraphs; |
310 | 1 | } |
311 | 8 | |
312 | 8 | /* brute force for now. horrible, yes. halp. */ |
313 | 19 | do { |
314 | 19 | if (MVM_string_substrings_equal_nocheck(tc, needle, 0, ngraphs, haystack, index)) { |
315 | 6 | result = (MVMint64)index; |
316 | 6 | break; |
317 | 6 | } |
318 | 13 | } while (index-- > 0); |
319 | 8 | return result; |
320 | 8 | } |
321 | | |
322 | | /* Returns a substring of the given string */ |
323 | 563k | MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) { |
324 | 563k | MVMString *result; |
325 | 563k | MVMint64 start_pos, end_pos; |
326 | 563k | |
327 | 563k | MVMint64 agraphs; |
328 | 563k | |
329 | 563k | MVM_string_check_arg(tc, a, "substring"); |
330 | 563k | /* convert to signed to avoid implicit arithmetic conversions */ |
331 | 563k | agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a); |
332 | 563k | |
333 | 563k | /* -1 signifies go to the end of the string; anything less is a bug */ |
334 | 563k | if (length < -1) |
335 | 0 | MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length); |
336 | 563k | |
337 | 563k | /* negative offsets count from the end */ |
338 | 563k | start_pos = offset < 0 ? offset + agraphs : offset; |
339 | 530k | end_pos = length == -1 ? agraphs : start_pos + length; |
340 | 563k | |
341 | 563k | /* return an empty string if start_pos is out of bounds but positive */ |
342 | 563k | if (start_pos > agraphs) { |
343 | 0 | start_pos = 0; |
344 | 0 | end_pos = 0; |
345 | 0 | } |
346 | 563k | |
347 | 563k | if (end_pos < 0) |
348 | 0 | MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos); |
349 | 563k | |
350 | 563k | /* Ensure we're within bounds. */ |
351 | 563k | if (start_pos < 0) |
352 | 0 | start_pos = 0; |
353 | 563k | if (end_pos > agraphs) |
354 | 7.02k | end_pos = agraphs; |
355 | 563k | |
356 | 563k | /* Check trivial cases: empty string and whole string. */ |
357 | 563k | if (start_pos == end_pos) |
358 | 9.98k | return tc->instance->str_consts.empty; |
359 | 553k | if (start_pos == 0 && end_pos == agraphs) |
360 | 22.9k | return a; |
361 | 553k | |
362 | 553k | /* Construct a result; how we efficiently do so will vary based on the |
363 | 553k | * input string. */ |
364 | 530k | MVMROOT(tc, a, { |
365 | 530k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
366 | 530k | result->body.num_graphs = end_pos - start_pos; |
367 | 530k | if (a->body.storage_type != MVM_STRING_STRAND) { |
368 | 530k | /* It's some kind of buffer. Construct a strand view into it. */ |
369 | 530k | result->body.storage_type = MVM_STRING_STRAND; |
370 | 530k | result->body.storage.strands = allocate_strands(tc, 1); |
371 | 530k | result->body.num_strands = 1; |
372 | 530k | result->body.storage.strands[0].blob_string = a; |
373 | 530k | result->body.storage.strands[0].start = start_pos; |
374 | 530k | result->body.storage.strands[0].end = end_pos; |
375 | 530k | result->body.storage.strands[0].repetitions = 0; |
376 | 530k | } |
377 | 530k | else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) { |
378 | 530k | /* Single strand string; quite possibly already a substring. We'll |
379 | 530k | * just produce an updated view. */ |
380 | 530k | MVMStringStrand *orig_strand = &(a->body.storage.strands[0]); |
381 | 530k | result->body.storage_type = MVM_STRING_STRAND; |
382 | 530k | result->body.storage.strands = allocate_strands(tc, 1); |
383 | 530k | result->body.num_strands = 1; |
384 | 530k | result->body.storage.strands[0].blob_string = orig_strand->blob_string; |
385 | 530k | result->body.storage.strands[0].start = orig_strand->start + start_pos; |
386 | 530k | result->body.storage.strands[0].end = orig_strand->start + end_pos; |
387 | 530k | result->body.storage.strands[0].repetitions = 0; |
388 | 530k | } |
389 | 530k | else { |
390 | 530k | /* Produce a new blob string, collapsing the strands. */ |
391 | 530k | MVMGraphemeIter gi; |
392 | 530k | MVMint32 i; |
393 | 530k | MVMint8 can_fit_into_8 = 1; |
394 | 530k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
395 | 530k | result->body.storage.blob_32 = MVM_malloc(result->body.num_graphs * sizeof(MVMGrapheme32)); |
396 | 530k | MVM_string_gi_init(tc, &gi, a); |
397 | 530k | MVM_string_gi_move_to(tc, &gi, start_pos); |
398 | 530k | for (i = 0; i < result->body.num_graphs; i++) { |
399 | 530k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
400 | 530k | if (g < -128 || g >= 128) |
401 | 530k | can_fit_into_8 = 0; |
402 | 530k | result->body.storage.blob_32[i] = g; |
403 | 530k | } |
404 | 530k | if (can_fit_into_8) |
405 | 530k | turn_32bit_into_8bit_unchecked(tc, result); |
406 | 530k | } |
407 | 530k | }); |
408 | 530k | |
409 | 530k | STRAND_CHECK(tc, result); |
410 | 530k | return result; |
411 | 553k | } |
412 | | |
413 | 1 | MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) { |
414 | 1 | /* XXX this could probably be done more efficiently directly. */ |
415 | 1 | MVMString *first_part; |
416 | 1 | MVMString *rest_part; |
417 | 1 | MVMString *result; |
418 | 1 | |
419 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement); |
420 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&original); |
421 | 1 | first_part = MVM_string_substring(tc, original, 0, start); |
422 | 1 | MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part); |
423 | 1 | |
424 | 1 | rest_part = MVM_string_substring(tc, original, start + count, -1); |
425 | 1 | rest_part = MVM_string_concatenate(tc, replacement, rest_part); |
426 | 1 | result = MVM_string_concatenate(tc, first_part, rest_part); |
427 | 1 | |
428 | 1 | MVM_gc_root_temp_pop_n(tc, 3); |
429 | 1 | |
430 | 1 | STRAND_CHECK(tc, result); |
431 | 1 | return result; |
432 | 1 | } |
433 | | |
434 | | /* Append one string to another. */ |
435 | 168k | static MVMint32 final_strand_matches(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
436 | 168k | if (a->body.storage_type == MVM_STRING_STRAND) { |
437 | 88.2k | MVMStringStrand *ss = &(a->body.storage.strands[a->body.num_strands - 1]); |
438 | 88.2k | if (ss->end - ss->start == MVM_string_graphs(tc, b)) |
439 | 26.5k | if (MVM_string_equal_at(tc, ss->blob_string, b, ss->start)) |
440 | 143 | return 1; |
441 | 88.2k | } |
442 | 168k | return 0; |
443 | 168k | } |
444 | 177k | MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
445 | 177k | MVMString *result; |
446 | 177k | MVMuint32 agraphs, bgraphs; |
447 | 177k | MVMuint64 total_graphs; |
448 | 177k | |
449 | 177k | MVM_string_check_arg(tc, a, "concatenate"); |
450 | 177k | MVM_string_check_arg(tc, b, "concatenate"); |
451 | 177k | |
452 | 177k | /* Simple empty-string cases. */ |
453 | 177k | agraphs = MVM_string_graphs_nocheck(tc, a); |
454 | 177k | if (agraphs == 0) |
455 | 8.91k | return b; |
456 | 168k | bgraphs = MVM_string_graphs(tc, b); |
457 | 168k | if (bgraphs == 0) |
458 | 159 | return a; |
459 | 168k | |
460 | 168k | /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */ |
461 | 168k | total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs; |
462 | 168k | if (total_graphs > 0xFFFFFFFF) |
463 | 0 | MVM_exception_throw_adhoc(tc, |
464 | 0 | "Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %u", |
465 | 0 | total_graphs, 0xFFFFFFFF); |
466 | 168k | |
467 | 168k | /* Otherwise, we'll assemble a result string. */ |
468 | 168k | MVMROOT(tc, a, { |
469 | 168k | MVMROOT(tc, b, { |
470 | 168k | /* Allocate it. */ |
471 | 168k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
472 | 168k | |
473 | 168k | /* Total graphemes is trivial; just total up inputs. */ |
474 | 168k | result->body.num_graphs = (MVMuint32)total_graphs; |
475 | 168k | |
476 | 168k | /* Result string will be made of strands. */ |
477 | 168k | result->body.storage_type = MVM_STRING_STRAND; |
478 | 168k | |
479 | 168k | /* Detect the wonderful case where we're repeatedly concating the same |
480 | 168k | * string again and again, and thus can just bump a repetition. */ |
481 | 168k | if (final_strand_matches(tc, a, b)) { |
482 | 168k | /* We have it; just copy the strands to a new string and bump the |
483 | 168k | * repetitions count of the last one. */ |
484 | 168k | result->body.storage.strands = allocate_strands(tc, a->body.num_strands); |
485 | 168k | copy_strands(tc, a, 0, result, 0, a->body.num_strands); |
486 | 168k | result->body.storage.strands[a->body.num_strands - 1].repetitions++; |
487 | 168k | result->body.num_strands = a->body.num_strands; |
488 | 168k | } |
489 | 168k | |
490 | 168k | /* Otherwise, construct a new strand string. */ |
491 | 168k | else { |
492 | 168k | /* See if we have too many strands between the two. If so, we will |
493 | 168k | * collapse the biggest side. */ |
494 | 168k | MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND |
495 | 168k | ? a->body.num_strands |
496 | 168k | : 1; |
497 | 168k | MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND |
498 | 168k | ? b->body.num_strands |
499 | 168k | : 1; |
500 | 168k | MVMString *effective_a = a; |
501 | 168k | MVMString *effective_b = b; |
502 | 168k | if (strands_a + strands_b > MVM_STRING_MAX_STRANDS) { |
503 | 168k | MVMROOT(tc, result, { |
504 | 168k | if (strands_a >= strands_b) { |
505 | 168k | effective_a = collapse_strands(tc, effective_a); |
506 | 168k | strands_a = 1; |
507 | 168k | } |
508 | 168k | else { |
509 | 168k | effective_b = collapse_strands(tc, effective_b); |
510 | 168k | strands_b = 1; |
511 | 168k | } |
512 | 168k | }); |
513 | 168k | } |
514 | 168k | |
515 | 168k | /* Assemble the result. */ |
516 | 168k | result->body.num_strands = strands_a + strands_b; |
517 | 168k | result->body.storage.strands = allocate_strands(tc, strands_a + strands_b); |
518 | 168k | if (effective_a->body.storage_type == MVM_STRING_STRAND) { |
519 | 168k | copy_strands(tc, effective_a, 0, result, 0, strands_a); |
520 | 168k | } |
521 | 168k | else { |
522 | 168k | MVMStringStrand *ss = &(result->body.storage.strands[0]); |
523 | 168k | ss->blob_string = effective_a; |
524 | 168k | ss->start = 0; |
525 | 168k | ss->end = effective_a->body.num_graphs; |
526 | 168k | ss->repetitions = 0; |
527 | 168k | } |
528 | 168k | if (effective_b->body.storage_type == MVM_STRING_STRAND) { |
529 | 168k | copy_strands(tc, effective_b, 0, result, strands_a, strands_b); |
530 | 168k | } |
531 | 168k | else { |
532 | 168k | MVMStringStrand *ss = &(result->body.storage.strands[strands_a]); |
533 | 168k | ss->blob_string = effective_b; |
534 | 168k | ss->start = 0; |
535 | 168k | ss->end = effective_b->body.num_graphs; |
536 | 168k | ss->repetitions = 0; |
537 | 168k | } |
538 | 168k | } |
539 | 168k | }); |
540 | 168k | }); |
541 | 168k | |
542 | 168k | STRAND_CHECK(tc, result); |
543 | 168k | return MVM_nfg_is_concat_stable(tc, a, b) ? result : re_nfg(tc, result); |
544 | 168k | } |
545 | | |
546 | 304 | MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) { |
547 | 304 | MVMString *result; |
548 | 304 | MVMuint32 agraphs; |
549 | 304 | MVMuint64 total_graphs; |
550 | 304 | |
551 | 304 | MVM_string_check_arg(tc, a, "repeat"); |
552 | 304 | |
553 | 304 | /* Validate count; handle common cases. */ |
554 | 304 | if (count == 0) |
555 | 7 | return tc->instance->str_consts.empty; |
556 | 297 | if (count == 1) |
557 | 15 | return a; |
558 | 282 | if (count < 0) |
559 | 0 | MVM_exception_throw_adhoc(tc, "repeat count (%"PRId64") cannot be negative", count); |
560 | 282 | if (count > (1 << 30)) |
561 | 0 | MVM_exception_throw_adhoc(tc, "repeat count > %d arbitrarily unsupported...", (1 << 30)); |
562 | 282 | |
563 | 282 | /* If input string is empty, repeating it is empty. */ |
564 | 282 | agraphs = MVM_string_graphs_nocheck(tc, a); |
565 | 282 | if (agraphs == 0) |
566 | 0 | return tc->instance->str_consts.empty; |
567 | 282 | |
568 | 282 | /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */ |
569 | 282 | total_graphs = (MVMuint64)agraphs * (MVMuint64)count; |
570 | 282 | if (total_graphs > 0xFFFFFFFF) |
571 | 0 | MVM_exception_throw_adhoc(tc, |
572 | 0 | "Can't repeat string, required number of graphemes %"PRIu64" > max allowed of %u", |
573 | 0 | total_graphs, 0xFFFFFFFF); |
574 | 282 | |
575 | 282 | /* Now build a result string with the repetition set. */ |
576 | 282 | MVMROOT(tc, a, { |
577 | 282 | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
578 | 282 | result->body.num_graphs = agraphs * count; |
579 | 282 | result->body.storage_type = MVM_STRING_STRAND; |
580 | 282 | result->body.storage.strands = allocate_strands(tc, 1); |
581 | 282 | if (a->body.storage_type == MVM_STRING_STRAND) { |
582 | 282 | if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) { |
583 | 282 | copy_strands(tc, a, 0, result, 0, 1); |
584 | 282 | } |
585 | 282 | else { |
586 | 282 | MVMROOT(tc, result, { |
587 | 282 | a = collapse_strands(tc, a); |
588 | 282 | }); |
589 | 282 | result->body.storage.strands[0].blob_string = a; |
590 | 282 | result->body.storage.strands[0].start = 0; |
591 | 282 | result->body.storage.strands[0].end = agraphs; |
592 | 282 | } |
593 | 282 | } |
594 | 282 | else { |
595 | 282 | result->body.storage.strands[0].blob_string = a; |
596 | 282 | result->body.storage.strands[0].start = 0; |
597 | 282 | result->body.storage.strands[0].end = agraphs; |
598 | 282 | } |
599 | 282 | result->body.storage.strands[0].repetitions = count - 1; |
600 | 282 | result->body.num_strands = 1; |
601 | 282 | }); |
602 | 282 | |
603 | 282 | STRAND_CHECK(tc, result); |
604 | 282 | return result; |
605 | 282 | } |
606 | | |
607 | 11.3k | void MVM_string_say(MVMThreadContext *tc, MVMString *a) { |
608 | 11.3k | MVM_string_check_arg(tc, a, "say"); |
609 | 11.3k | MVM_io_write_string(tc, tc->instance->stdout_handle, a, 1); |
610 | 11.3k | } |
611 | | |
612 | 47 | void MVM_string_print(MVMThreadContext *tc, MVMString *a) { |
613 | 47 | MVM_string_check_arg(tc, a, "print"); |
614 | 47 | MVM_io_write_string(tc, tc->instance->stdout_handle, a, 0); |
615 | 47 | } |
616 | | |
617 | | /* Tests whether one string a has the other string b as a substring at that index */ |
618 | 12.9M | MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) { |
619 | 12.9M | |
620 | 12.9M | MVMStringIndex agraphs, bgraphs; |
621 | 12.9M | |
622 | 12.9M | MVM_string_check_arg(tc, a, "equal_at"); |
623 | 12.9M | MVM_string_check_arg(tc, b, "equal_at"); |
624 | 12.9M | |
625 | 12.9M | agraphs = MVM_string_graphs_nocheck(tc, a); |
626 | 12.9M | bgraphs = MVM_string_graphs_nocheck(tc, b); |
627 | 12.9M | |
628 | 12.9M | if (offset < 0) { |
629 | 2 | offset += agraphs; |
630 | 2 | if (offset < 0) |
631 | 1 | offset = 0; /* XXX I think this is the right behavior here */ |
632 | 2 | } |
633 | 12.9M | if (agraphs - offset < bgraphs) |
634 | 1.37k | return 0; |
635 | 12.9M | return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0); |
636 | 12.9M | } |
637 | 1.17k | MVM_STATIC_INLINE MVMint32 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, MVMString *haystack, MVMString *needle_fc, MVMint64 h_start, MVMint64 h_graphs, MVMint64 n_graphs) { |
638 | 1.17k | MVMuint32 h_fc_cps; |
639 | 1.17k | /* An additional needle offset which is used only when codepoints expand |
640 | 1.17k | * when casefolded. The offset is the number of additional codepoints that |
641 | 1.17k | * have been seen so haystack and needle stay aligned */ |
642 | 1.17k | MVMint64 n_offset = 0; |
643 | 1.17k | MVMint64 i, j; |
644 | 1.17k | MVMGrapheme32 h_g, n_g; |
645 | 1.62k | for (i = 0; i + h_start < h_graphs && i + n_offset < n_graphs; i++) { |
646 | 1.33k | const MVMCodepoint* h_result_cps; |
647 | 1.33k | h_g = MVM_string_get_grapheme_at_nocheck(tc, haystack, h_start + i); |
648 | 1.33k | if (h_g >= 0 ) { |
649 | 1.33k | /* For codeponits we can get the case change directly */ |
650 | 1.33k | h_fc_cps = MVM_unicode_get_case_change(tc, h_g, MVM_unicode_case_change_type_fold, &h_result_cps); |
651 | 1.33k | } |
652 | 0 | else { |
653 | 0 | /* Synthetics must use this function */ |
654 | 0 | h_fc_cps = MVM_nfg_get_case_change(tc, h_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &h_result_cps); |
655 | 0 | } |
656 | 1.33k | /* If we get 0 for the number that means the cp doesn't change when casefolded */ |
657 | 1.33k | if (h_fc_cps == 0) { |
658 | 1.07k | n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset); |
659 | 1.07k | if (h_g != n_g) |
660 | 841 | return 0; |
661 | 1.07k | } |
662 | 258 | else if (h_fc_cps >= 1) { |
663 | 573 | for (j = 0; j < h_fc_cps; j++) { |
664 | 352 | n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset); |
665 | 352 | h_g = h_result_cps[j]; |
666 | 352 | if (h_g != n_g) |
667 | 37 | return 0; |
668 | 315 | n_offset++; |
669 | 315 | } |
670 | 221 | n_offset--; |
671 | 221 | } |
672 | 1.33k | } |
673 | 296 | return 1; |
674 | 1.17k | } |
675 | | /* Checks if needle exists at the offset, but ignores case. |
676 | | * Sometimes there is a difference in length of a string before and after foldcase, |
677 | | * because of this we must compare this differently than just foldcasing both |
678 | | * strings to ensure the offset is correct */ |
679 | 215 | MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 h_offset) { |
680 | 215 | /* Foldcase version of needle */ |
681 | 215 | MVMString *needle_fc; |
682 | 215 | MVMStringIndex h_graphs = MVM_string_graphs(tc, haystack); |
683 | 215 | MVMStringIndex n_graphs = MVM_string_graphs(tc, needle); |
684 | 215 | |
685 | 215 | if (h_offset < 0) { |
686 | 0 | h_offset += h_graphs; |
687 | 0 | if (h_offset < 0) |
688 | 0 | h_offset = 0; /* XXX I think this is the right behavior here */ |
689 | 0 | } |
690 | 215 | /* If the offset is greater or equal to the number of haystack graphemes |
691 | 215 | * return 0. Since size of graphemes could change under casefolding, we |
692 | 215 | * can't assume too much. If optimizing this be careful */ |
693 | 215 | if (h_offset >= h_graphs) |
694 | 8 | return 0; |
695 | 215 | |
696 | 207 | MVMROOT(tc, haystack, { |
697 | 207 | needle_fc = MVM_string_fc(tc, needle); |
698 | 207 | }); |
699 | 207 | |
700 | 207 | return string_equal_at_ignore_case_INTERNAL_loop(tc, haystack, needle_fc, h_offset, h_graphs, n_graphs); |
701 | 215 | } |
702 | 123 | MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *haystack, MVMString *needle, MVMint64 start) { |
703 | 123 | /* Foldcase version of needle */ |
704 | 123 | MVMString *needle_fc; |
705 | 123 | MVMStringIndex n_fc_graphs; |
706 | 123 | |
707 | 123 | size_t index = (size_t)start; |
708 | 123 | MVMStringIndex hgraphs, ngraphs; |
709 | 123 | MVMint64 return_val = -1; |
710 | 123 | MVM_string_check_arg(tc, haystack, "index search target"); |
711 | 123 | MVM_string_check_arg(tc, needle, "index search term"); |
712 | 123 | hgraphs = MVM_string_graphs(tc, haystack); |
713 | 123 | ngraphs = MVM_string_graphs(tc, needle); |
714 | 123 | if (!ngraphs) |
715 | 0 | return start <= hgraphs ? start : -1; /* Empty string is in any other string */ |
716 | 123 | if (!hgraphs) |
717 | 0 | return -1; |
718 | 123 | if (start < 0 || start >= hgraphs) |
719 | 7 | return -1; |
720 | 123 | /* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check |
721 | 123 | * checks if it is at all possible for the needle grapheme number to be higher |
722 | 123 | * than the haystack */ |
723 | 116 | if (ngraphs > hgraphs * 3) |
724 | 1 | return -1; |
725 | 116 | |
726 | 115 | if (ngraphs < 1) |
727 | 0 | return -1; |
728 | 115 | |
729 | 115 | MVMROOT(tc, haystack, { |
730 | 115 | needle_fc = MVM_string_fc(tc, needle); |
731 | 115 | }); |
732 | 115 | n_fc_graphs = MVM_string_graphs(tc, needle_fc); |
733 | 115 | |
734 | 115 | /* brute force for now. horrible, yes. halp. */ |
735 | 967 | while (index <= hgraphs) { |
736 | 967 | if (string_equal_at_ignore_case_INTERNAL_loop(tc, haystack, needle_fc, index, hgraphs, n_fc_graphs)) |
737 | 115 | return (MVMint64)index; |
738 | 967 | |
739 | 852 | index++; |
740 | 852 | } |
741 | 0 | return -1; |
742 | 115 | } |
743 | 155k | MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) { |
744 | 155k | MVMStringIndex agraphs; |
745 | 155k | MVMGrapheme32 g; |
746 | 155k | |
747 | 155k | MVM_string_check_arg(tc, s, "grapheme_at"); |
748 | 155k | |
749 | 155k | agraphs = MVM_string_graphs(tc, s); |
750 | 155k | if (offset < 0 || offset >= agraphs) |
751 | 0 | return -1; |
752 | 155k | |
753 | 155k | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
754 | 155k | |
755 | 155k | return g >= 0 ? g : MVM_nfg_get_synthetic_info(tc, g)->base; |
756 | 155k | } |
757 | | |
758 | 0 | MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) { |
759 | 0 | MVMStringIndex agraphs; |
760 | 0 | MVMGrapheme32 g; |
761 | 0 | MVMNormalizer norm; |
762 | 0 | MVMint32 ready; |
763 | 0 |
|
764 | 0 | MVM_string_check_arg(tc, s, "ord_basechar_at"); |
765 | 0 |
|
766 | 0 | agraphs = MVM_string_graphs(tc, s); |
767 | 0 | if (offset < 0 || offset >= agraphs) |
768 | 0 | return -1; /* fixes RT #126771 */ |
769 | 0 |
|
770 | 0 | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
771 | 0 |
|
772 | 0 | if (g < 0) { |
773 | 0 | MVMNFGSynthetic *si = MVM_nfg_get_synthetic_info(tc, g); |
774 | 0 | g = si->base; |
775 | 0 | } |
776 | 0 | else { |
777 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
778 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &g); |
779 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
780 | 0 | if (!ready) |
781 | 0 | g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
782 | 0 |
|
783 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
784 | 0 | } |
785 | 0 |
|
786 | 0 | return g; |
787 | 0 | } |
788 | | |
789 | | /* Compares two strings for equality. */ |
790 | 59.5M | MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
791 | 59.5M | MVM_string_check_arg(tc, a, "equal"); |
792 | 59.5M | MVM_string_check_arg(tc, b, "equal"); |
793 | 59.5M | if (a == b) |
794 | 11.1M | return 1; |
795 | 48.4M | if (MVM_string_graphs(tc, a) != MVM_string_graphs(tc, b)) |
796 | 35.6M | return 0; |
797 | 12.8M | return MVM_string_equal_at(tc, a, b, 0); |
798 | 48.4M | } |
799 | | |
800 | | /* more general form of has_at; compares two substrings for equality */ |
801 | | MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a, |
802 | 0 | MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) { |
803 | 0 |
|
804 | 0 | MVM_string_check_arg(tc, a, "have_at"); |
805 | 0 | MVM_string_check_arg(tc, b, "have_at"); |
806 | 0 |
|
807 | 0 | if (starta < 0 || startb < 0) |
808 | 0 | return 0; |
809 | 0 | if (length == 0) |
810 | 0 | return 1; |
811 | 0 | if (starta + length > MVM_string_graphs(tc, a) || startb + length > MVM_string_graphs(tc, b)) |
812 | 0 | return 0; |
813 | 0 |
|
814 | 0 | return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb); |
815 | 0 | } |
816 | | |
817 | | /* Returns the grapheme at a given index of the string */ |
818 | 597k | MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) { |
819 | 597k | MVMStringIndex agraphs; |
820 | 597k | |
821 | 597k | MVM_string_check_arg(tc, a, "grapheme_at"); |
822 | 597k | |
823 | 597k | agraphs = MVM_string_graphs(tc, a); |
824 | 597k | |
825 | 597k | if (index < 0 || index >= agraphs) |
826 | 0 | MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64, |
827 | 0 | agraphs - 1, index); |
828 | 597k | |
829 | 597k | return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index); |
830 | 597k | } |
831 | | |
832 | | /* Finds the location of a grapheme in a string. Useful for small character class lookup */ |
833 | 1.70M | MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) { |
834 | 1.70M | size_t index = -1; |
835 | 1.70M | MVMGraphemeIter gi; |
836 | 1.70M | |
837 | 1.70M | MVM_string_check_arg(tc, a, "string_index_of_grapheme"); |
838 | 1.70M | |
839 | 1.70M | MVM_string_gi_init(tc, &gi, a); |
840 | 9.57M | while (MVM_string_gi_has_more(tc, &gi)) { |
841 | 8.09M | index++; |
842 | 8.09M | if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme) |
843 | 228k | return index; |
844 | 8.09M | } |
845 | 1.47M | return -1; |
846 | 1.70M | } |
847 | | |
848 | | /* Case change functions. */ |
849 | | static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g); |
850 | 10.8k | static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) { |
851 | 10.8k | MVMint64 sgraphs; |
852 | 10.8k | MVM_string_check_arg(tc, s, error); |
853 | 10.8k | sgraphs = MVM_string_graphs(tc, s); |
854 | 10.8k | if (sgraphs) { |
855 | 9.36k | MVMString *result; |
856 | 9.36k | MVMGraphemeIter gi; |
857 | 9.36k | MVMint64 result_graphs = sgraphs; |
858 | 9.36k | MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32)); |
859 | 9.36k | MVMint32 changed = 0; |
860 | 9.36k | MVMint64 i = 0; |
861 | 9.36k | MVM_string_gi_init(tc, &gi, s); |
862 | 54.7k | while (MVM_string_gi_has_more(tc, &gi)) { |
863 | 45.4k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
864 | 45.4k | peeked: |
865 | 45.4k | if (g == 0x03A3) { |
866 | 0 | /* Greek sigma needs special handling when lowercased. */ |
867 | 0 | switch (type) { |
868 | 0 | case MVM_unicode_case_change_type_upper: |
869 | 0 | case MVM_unicode_case_change_type_title: |
870 | 0 | result_buf[i++] = g; |
871 | 0 | break; |
872 | 0 | case MVM_unicode_case_change_type_lower: |
873 | 0 | changed = 1; |
874 | 0 | if (i == 0) { |
875 | 0 | /* Start of string, so not final. */ |
876 | 0 | result_buf[i++] = 0x03C3; |
877 | 0 | } |
878 | 0 | else if (!grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) { |
879 | 0 | /* Previous char is not a letter; not final (as has |
880 | 0 | * to be at end of a word and not only thing in a |
881 | 0 | * word). */ |
882 | 0 | result_buf[i++] = 0x03C3; |
883 | 0 | } |
884 | 0 | else if (!MVM_string_gi_has_more(tc, &gi)) { |
885 | 0 | /* End of string. We only reach here if we have a |
886 | 0 | * letter before us, so it must be final. */ |
887 | 0 | result_buf[i++] = 0x03C2; |
888 | 0 | } |
889 | 0 | else { |
890 | 0 | /* Letter before us, something ahead of us. Need to |
891 | 0 | * peek ahead to see if it's a letter, to decide if |
892 | 0 | * we have final sigma or not. */ |
893 | 0 | g = MVM_string_gi_get_grapheme(tc, &gi); |
894 | 0 | if (grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g)) |
895 | 0 | result_buf[i++] = 0x03C3; |
896 | 0 | else |
897 | 0 | result_buf[i++] = 0x03C2; |
898 | 0 | goto peeked; |
899 | 0 | } |
900 | 0 | break; |
901 | 0 | case MVM_unicode_case_change_type_fold: |
902 | 0 | result_buf[i++] = 0x03C3; |
903 | 0 | changed = 1; |
904 | 0 | break; |
905 | 0 | } |
906 | 0 | } |
907 | 45.4k | else if (g >= 0) { |
908 | 45.4k | const MVMCodepoint *result_cps; |
909 | 45.4k | MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc, |
910 | 45.4k | g, type, &result_cps); |
911 | 45.4k | if (num_result_cps == 0) { |
912 | 31.9k | result_buf[i++] = g; |
913 | 31.9k | } |
914 | 13.4k | else if (num_result_cps == 1) { |
915 | 13.3k | result_buf[i++] = *result_cps; |
916 | 13.3k | changed = 1; |
917 | 13.3k | } |
918 | 98 | else { |
919 | 98 | /* To maintain NFG, we need to re-normalize when we get an |
920 | 98 | * expansion. */ |
921 | 98 | MVMNormalizer norm; |
922 | 98 | MVMint32 num_result_graphs; |
923 | 98 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
924 | 98 | MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps); |
925 | 98 | MVM_unicode_normalizer_eof(tc, &norm); |
926 | 98 | num_result_graphs = MVM_unicode_normalizer_available(tc, &norm); |
927 | 98 | |
928 | 98 | /* Make space for any extra graphemes. */ |
929 | 98 | if (num_result_graphs > 1) { |
930 | 98 | result_graphs += num_result_graphs - 1; |
931 | 98 | result_buf = MVM_realloc(result_buf, |
932 | 98 | result_graphs * sizeof(MVMGrapheme32)); |
933 | 98 | } |
934 | 98 | |
935 | 98 | /* Copy resulting graphemes. */ |
936 | 294 | while (num_result_graphs > 0) { |
937 | 196 | result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
938 | 196 | num_result_graphs--; |
939 | 196 | } |
940 | 98 | changed = 1; |
941 | 98 | |
942 | 98 | /* Clean up normalizer (we could init one per transform |
943 | 98 | * and keep it around in the future, if we find it's a |
944 | 98 | * worthwhile gain). */ |
945 | 98 | MVM_unicode_normalizer_cleanup(tc, &norm); |
946 | 98 | } |
947 | 45.4k | } |
948 | 0 | else { |
949 | 0 | MVMGrapheme32 *transformed; |
950 | 0 | MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed); |
951 | 0 | if (num_transformed == 0) { |
952 | 0 | result_buf[i++] = g; |
953 | 0 | } |
954 | 0 | else if (num_transformed == 1) { |
955 | 0 | result_buf[i++] = *transformed; |
956 | 0 | changed = 1; |
957 | 0 | } |
958 | 0 | else { |
959 | 0 | MVMuint32 j; |
960 | 0 | result_graphs += num_transformed - 1; |
961 | 0 | result_buf = MVM_realloc(result_buf, |
962 | 0 | result_graphs * sizeof(MVMGrapheme32)); |
963 | 0 | for (j = 0; j < num_transformed; j++) |
964 | 0 | result_buf[i++] = transformed[j]; |
965 | 0 | changed = 1; |
966 | 0 | } |
967 | 0 | } |
968 | 45.4k | } |
969 | 9.36k | if (changed) { |
970 | 2.79k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
971 | 2.79k | result->body.num_graphs = result_graphs; |
972 | 2.79k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
973 | 2.79k | result->body.storage.blob_32 = result_buf; |
974 | 2.79k | return result; |
975 | 2.79k | } |
976 | 6.56k | else { |
977 | 6.56k | MVM_free(result_buf); |
978 | 6.56k | } |
979 | 9.36k | } |
980 | 8.05k | STRAND_CHECK(tc, s); |
981 | 8.05k | return s; |
982 | 10.8k | } |
983 | 101 | MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) { |
984 | 101 | return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc"); |
985 | 101 | } |
986 | 10.4k | MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) { |
987 | 10.4k | return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc"); |
988 | 10.4k | } |
989 | 12 | MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) { |
990 | 12 | return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc"); |
991 | 12 | } |
992 | 322 | MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) { |
993 | 322 | return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc"); |
994 | 322 | } |
995 | | |
996 | | /* Decodes a C buffer to an MVMString, dependent on the encoding type flag. */ |
997 | | MVMString * MVM_string_decode(MVMThreadContext *tc, |
998 | 5 | const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) { |
999 | 5 | switch(encoding_flag) { |
1000 | 4 | case MVM_encoding_type_utf8: |
1001 | 4 | return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length); |
1002 | 0 | case MVM_encoding_type_ascii: |
1003 | 0 | return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length); |
1004 | 0 | case MVM_encoding_type_latin1: |
1005 | 0 | return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length); |
1006 | 1 | case MVM_encoding_type_utf16: |
1007 | 1 | return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length); |
1008 | 0 | case MVM_encoding_type_windows1252: |
1009 | 0 | return MVM_string_windows1252_decode(tc, type_object, Cbuf, byte_length); |
1010 | 0 | case MVM_encoding_type_utf8_c8: |
1011 | 0 | return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length); |
1012 | 0 | default: |
1013 | 0 | MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag); |
1014 | 5 | } |
1015 | 5 | } |
1016 | | |
1017 | | /* Encodes an MVMString to a C buffer, dependent on the encoding type flag */ |
1018 | | char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start, |
1019 | | MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag, |
1020 | 11.4k | MVMString *replacement, MVMint32 translate_newlines) { |
1021 | 11.4k | switch(encoding_flag) { |
1022 | 11.4k | case MVM_encoding_type_utf8: |
1023 | 11.4k | return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1024 | 2 | case MVM_encoding_type_ascii: |
1025 | 2 | return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1026 | 0 | case MVM_encoding_type_latin1: |
1027 | 0 | return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1028 | 1 | case MVM_encoding_type_utf16: |
1029 | 1 | return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1030 | 0 | case MVM_encoding_type_windows1252: |
1031 | 0 | return MVM_string_windows1252_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines); |
1032 | 0 | case MVM_encoding_type_utf8_c8: |
1033 | 0 | return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement); |
1034 | 0 | default: |
1035 | 0 | MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag); |
1036 | 11.4k | } |
1037 | 11.4k | } |
1038 | | |
1039 | | /* Encodes a string, and writes the encoding string into the supplied Buf |
1040 | | * instance, which should be an integer array with MVMArray REPR. */ |
1041 | 14 | void MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name, MVMObject *buf, MVMString *replacement) { |
1042 | 14 | MVMuint64 output_size; |
1043 | 14 | MVMuint8 *encoded; |
1044 | 14 | MVMArrayREPRData *buf_rd; |
1045 | 14 | MVMuint8 elem_size = 0; |
1046 | 14 | |
1047 | 14 | /* Ensure the target is in the correct form. */ |
1048 | 14 | MVM_string_check_arg(tc, s, "encode"); |
1049 | 14 | if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray) |
1050 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native array to write into"); |
1051 | 14 | buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data; |
1052 | 14 | if (buf_rd) { |
1053 | 14 | switch (buf_rd->slot_type) { |
1054 | 0 | case MVM_ARRAY_I64: elem_size = 8; break; |
1055 | 0 | case MVM_ARRAY_I32: elem_size = 4; break; |
1056 | 0 | case MVM_ARRAY_I16: elem_size = 2; break; |
1057 | 0 | case MVM_ARRAY_I8: elem_size = 1; break; |
1058 | 0 | case MVM_ARRAY_U64: elem_size = 8; break; |
1059 | 0 | case MVM_ARRAY_U32: elem_size = 4; break; |
1060 | 1 | case MVM_ARRAY_U16: elem_size = 2; break; |
1061 | 13 | case MVM_ARRAY_U8: elem_size = 1; break; |
1062 | 14 | } |
1063 | 14 | } |
1064 | 14 | if (!elem_size) |
1065 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native int array"); |
1066 | 14 | if (((MVMArray *)buf)->body.slots.any) |
1067 | 0 | MVM_exception_throw_adhoc(tc, "encode requires an empty array"); |
1068 | 14 | |
1069 | 14 | /* At least find_encoding may allocate on first call, so root just |
1070 | 14 | * in case. */ |
1071 | 14 | MVMROOT(tc, buf, { |
1072 | 14 | MVMROOT(tc, s, { |
1073 | 14 | const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name); |
1074 | 14 | encoded = (MVMuint8 *)MVM_string_encode(tc, s, 0, MVM_string_graphs(tc, s), &output_size, |
1075 | 14 | encoding_flag, replacement, 0); |
1076 | 14 | }); |
1077 | 14 | }); |
1078 | 14 | |
1079 | 14 | /* Stash the encoded data in the VMArray. */ |
1080 | 14 | ((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded; |
1081 | 14 | ((MVMArray *)buf)->body.start = 0; |
1082 | 14 | ((MVMArray *)buf)->body.ssize = output_size / elem_size; |
1083 | 14 | ((MVMArray *)buf)->body.elems = output_size / elem_size; |
1084 | 14 | } |
1085 | | |
1086 | | /* Decodes a string using the data from the specified Buf. */ |
1087 | 5 | MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) { |
1088 | 5 | MVMArrayREPRData *buf_rd; |
1089 | 5 | MVMuint8 encoding_flag; |
1090 | 5 | MVMuint8 elem_size = 0; |
1091 | 5 | |
1092 | 5 | /* Ensure the source is in the correct form. */ |
1093 | 5 | if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray) |
1094 | 0 | MVM_exception_throw_adhoc(tc, "decode requires a native array to read from"); |
1095 | 5 | buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data; |
1096 | 5 | if (buf_rd) { |
1097 | 5 | switch (buf_rd->slot_type) { |
1098 | 0 | case MVM_ARRAY_I64: elem_size = 8; break; |
1099 | 0 | case MVM_ARRAY_I32: elem_size = 4; break; |
1100 | 0 | case MVM_ARRAY_I16: elem_size = 2; break; |
1101 | 1 | case MVM_ARRAY_I8: elem_size = 1; break; |
1102 | 0 | case MVM_ARRAY_U64: elem_size = 8; break; |
1103 | 0 | case MVM_ARRAY_U32: elem_size = 4; break; |
1104 | 1 | case MVM_ARRAY_U16: elem_size = 2; break; |
1105 | 3 | case MVM_ARRAY_U8: elem_size = 1; break; |
1106 | 5 | } |
1107 | 5 | } |
1108 | 5 | if (!elem_size) |
1109 | 0 | MVM_exception_throw_adhoc(tc, "encode requires a native int array"); |
1110 | 5 | |
1111 | 5 | /* Decode. */ |
1112 | 5 | MVMROOT(tc, buf, { |
1113 | 5 | encoding_flag = MVM_string_find_encoding(tc, enc_name); |
1114 | 5 | }); |
1115 | 5 | return MVM_string_decode(tc, tc->instance->VMString, |
1116 | 5 | (char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start), |
1117 | 5 | ((MVMArray *)buf)->body.elems * elem_size, |
1118 | 5 | encoding_flag); |
1119 | 5 | } |
1120 | | |
1121 | 35.0k | MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) { |
1122 | 35.0k | MVMObject *result; |
1123 | 35.0k | MVMStringIndex start, end, sep_length; |
1124 | 35.0k | MVMHLLConfig *hll = MVM_hll_current(tc); |
1125 | 35.0k | |
1126 | 35.0k | MVM_string_check_arg(tc, separator, "split separator"); |
1127 | 35.0k | MVM_string_check_arg(tc, input, "split input"); |
1128 | 35.0k | |
1129 | 35.0k | MVMROOT(tc, input, { |
1130 | 35.0k | MVMROOT(tc, separator, { |
1131 | 35.0k | result = MVM_repr_alloc_init(tc, hll->slurpy_array_type); |
1132 | 35.0k | MVMROOT(tc, result, { |
1133 | 35.0k | start = 0; |
1134 | 35.0k | end = MVM_string_graphs_nocheck(tc, input); |
1135 | 35.0k | sep_length = MVM_string_graphs_nocheck(tc, separator); |
1136 | 35.0k | |
1137 | 35.0k | while (start < end) { |
1138 | 35.0k | MVMString *portion; |
1139 | 35.0k | MVMStringIndex index; |
1140 | 35.0k | MVMStringIndex length; |
1141 | 35.0k | |
1142 | 35.0k | /* XXX make this use the dual-traverse iterator, but such that it |
1143 | 35.0k | can reset the index of what it's comparing... <!> */ |
1144 | 35.0k | index = MVM_string_index(tc, input, separator, start); |
1145 | 35.0k | length = sep_length ? (index == -1 ? end : index) - start : 1; |
1146 | 35.0k | if (length > 0 || (sep_length && length == 0)) { |
1147 | 35.0k | portion = MVM_string_substring(tc, input, start, length); |
1148 | 35.0k | MVMROOT(tc, portion, { |
1149 | 35.0k | MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type); |
1150 | 35.0k | MVM_repr_set_str(tc, pobj, portion); |
1151 | 35.0k | MVM_repr_push_o(tc, result, pobj); |
1152 | 35.0k | }); |
1153 | 35.0k | } |
1154 | 35.0k | start += length + sep_length; |
1155 | 35.0k | /* Gather an empty string if the delimiter is found at the end. */ |
1156 | 35.0k | if (sep_length && start == end) { |
1157 | 35.0k | MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type); |
1158 | 35.0k | MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty); |
1159 | 35.0k | MVM_repr_push_o(tc, result, pobj); |
1160 | 35.0k | } |
1161 | 35.0k | } |
1162 | 35.0k | }); |
1163 | 35.0k | }); |
1164 | 35.0k | }); |
1165 | 35.0k | |
1166 | 35.0k | return result; |
1167 | 35.0k | } |
1168 | | |
1169 | 16.8k | MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) { |
1170 | 16.8k | MVMString *result; |
1171 | 16.8k | MVMString **pieces; |
1172 | 16.8k | MVMint64 elems, num_pieces, sgraphs, i, is_str_array, total_graphs; |
1173 | 16.8k | MVMuint16 sstrands, total_strands; |
1174 | 16.8k | MVMint32 concats_stable = 1; |
1175 | 16.8k | |
1176 | 16.8k | MVM_string_check_arg(tc, separator, "join separator"); |
1177 | 16.8k | if (!IS_CONCRETE(input)) |
1178 | 0 | MVM_exception_throw_adhoc(tc, "join needs a concrete array to join"); |
1179 | 16.8k | |
1180 | 16.8k | /* See how many things we have to join; if the answer is "none" then we |
1181 | 16.8k | * can make a hasty escape. */ |
1182 | 16.8k | elems = MVM_repr_elems(tc, input); |
1183 | 16.8k | if (elems == 0) |
1184 | 163 | return tc->instance->str_consts.empty; |
1185 | 16.7k | is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc, |
1186 | 16.7k | STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR; |
1187 | 16.7k | |
1188 | 16.7k | /* Allocate result. */ |
1189 | 16.7k | MVMROOT(tc, separator, { |
1190 | 16.7k | MVMROOT(tc, input, { |
1191 | 16.7k | result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1192 | 16.7k | }); |
1193 | 16.7k | }); |
1194 | 16.7k | |
1195 | 16.7k | /* Take a first pass through the string, counting up length and the total |
1196 | 16.7k | * number of strands we encounter as well as building a flat array of the |
1197 | 16.7k | * strings (to we only have to do the indirect calls once). */ |
1198 | 16.7k | sgraphs = MVM_string_graphs_nocheck(tc, separator); |
1199 | 16.7k | if (sgraphs) |
1200 | 2.61k | sstrands = separator->body.storage_type == MVM_STRING_STRAND |
1201 | 0 | ? separator->body.num_strands |
1202 | 2.61k | : 1; |
1203 | 16.7k | else |
1204 | 14.1k | sstrands = 1; |
1205 | 16.7k | pieces = MVM_malloc(elems * sizeof(MVMString *)); |
1206 | 16.7k | num_pieces = 0; |
1207 | 16.7k | total_graphs = 0; |
1208 | 16.7k | total_strands = 0; |
1209 | 46.8k | for (i = 0; i < elems; i++) { |
1210 | 30.0k | /* Get piece of the string. */ |
1211 | 30.0k | MVMString *piece; |
1212 | 30.0k | MVMint64 piece_graphs; |
1213 | 30.0k | if (is_str_array) { |
1214 | 23.1k | piece = MVM_repr_at_pos_s(tc, input, i); |
1215 | 23.1k | if (!piece) |
1216 | 0 | continue; |
1217 | 23.1k | } |
1218 | 6.93k | else { |
1219 | 6.93k | MVMObject *item = MVM_repr_at_pos_o(tc, input, i); |
1220 | 6.93k | if (!item || !IS_CONCRETE(item)) |
1221 | 0 | continue; |
1222 | 6.93k | piece = MVM_repr_get_str(tc, item); |
1223 | 6.93k | } |
1224 | 30.0k | |
1225 | 30.0k | /* If it wasn't the first piece, add separator here. */ |
1226 | 30.0k | if (num_pieces) { |
1227 | 13.3k | total_strands += sstrands; |
1228 | 13.3k | total_graphs += sgraphs; |
1229 | 13.3k | } |
1230 | 30.0k | |
1231 | 30.0k | /* Add on the piece's strands and graphs. */ |
1232 | 30.0k | piece_graphs = MVM_string_graphs(tc, piece); |
1233 | 30.0k | if (piece_graphs) { |
1234 | 29.9k | total_strands += piece->body.storage_type == MVM_STRING_STRAND |
1235 | 22.9k | ? piece->body.num_strands |
1236 | 6.96k | : 1; |
1237 | 29.9k | total_graphs += piece_graphs; |
1238 | 29.9k | } |
1239 | 30.0k | |
1240 | 30.0k | /* Store piece. */ |
1241 | 30.0k | pieces[num_pieces++] = piece; |
1242 | 30.0k | } |
1243 | 16.7k | |
1244 | 16.7k | /* We now know the total eventual number of graphemes. */ |
1245 | 16.7k | if (total_graphs == 0) { |
1246 | 18 | free(pieces); |
1247 | 18 | return tc->instance->str_consts.empty; |
1248 | 18 | } |
1249 | 16.7k | result->body.num_graphs = total_graphs; |
1250 | 16.7k | |
1251 | 16.7k | /* If we just collect all the things as strands, are we within bounds, and |
1252 | 16.7k | * will be come out ahead? */ |
1253 | 16.7k | if (total_strands < MVM_STRING_MAX_STRANDS && total_graphs / total_strands >= 16) { |
1254 | 727 | /* XXX TODO: Implement this, conditionalize branch thing below. */ |
1255 | 727 | } |
1256 | 16.7k | /*else {*/ |
1257 | 16.7k | if (1) { |
1258 | 16.7k | /* We'll produce a single, flat string. */ |
1259 | 16.7k | MVMint64 position = 0; |
1260 | 16.7k | MVMGraphemeIter gi; |
1261 | 16.7k | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
1262 | 16.7k | result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32)); |
1263 | 46.7k | for (i = 0; i < num_pieces; i++) { |
1264 | 30.0k | /* Get piece. */ |
1265 | 30.0k | MVMString *piece = pieces[i]; |
1266 | 30.0k | |
1267 | 30.0k | /* Add separator if needed. */ |
1268 | 30.0k | if (i > 0) { |
1269 | 13.3k | /* If there's no separator and one piece is The Empty String we |
1270 | 13.3k | * have to be extra careful about concat stability */ |
1271 | 13.3k | if (sgraphs == 0 && MVM_string_graphs_nocheck(tc, piece) == 0 && concats_stable |
1272 | 18 | && i + 1 < num_pieces |
1273 | 15 | && !MVM_nfg_is_concat_stable(tc, pieces[i - 1], pieces[i + 1])) { |
1274 | 0 | concats_stable = 0; |
1275 | 0 | } |
1276 | 13.3k | |
1277 | 13.3k | if (sgraphs) { |
1278 | 115 | if (!concats_stable) |
1279 | 0 | /* Already unstable; no more checks. */; |
1280 | 115 | else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], separator)) |
1281 | 0 | concats_stable = 0; |
1282 | 115 | else if (!MVM_nfg_is_concat_stable(tc, separator, piece)) |
1283 | 0 | concats_stable = 0; |
1284 | 115 | |
1285 | 115 | switch (separator->body.storage_type) { |
1286 | 0 | case MVM_STRING_GRAPHEME_32: |
1287 | 0 | memcpy( |
1288 | 0 | result->body.storage.blob_32 + position, |
1289 | 0 | separator->body.storage.blob_32, |
1290 | 0 | sgraphs * sizeof(MVMGrapheme32)); |
1291 | 0 | position += sgraphs; |
1292 | 0 | break; |
1293 | 0 | /* XXX Can special-case 8-bit NFG and ASCII here too. */ |
1294 | 115 | default: |
1295 | 115 | MVM_string_gi_init(tc, &gi, separator); |
1296 | 233 | while (MVM_string_gi_has_more(tc, &gi)) |
1297 | 118 | result->body.storage.blob_32[position++] = |
1298 | 118 | MVM_string_gi_get_grapheme(tc, &gi); |
1299 | 115 | break; |
1300 | 115 | } |
1301 | 115 | } |
1302 | 13.2k | else { |
1303 | 13.2k | /* Separator has no graphemes, so NFG stability check |
1304 | 13.2k | * should consider pieces. */ |
1305 | 13.2k | if (!concats_stable) |
1306 | 4 | /* Already stable; no more checks. */; |
1307 | 13.2k | else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], piece)) |
1308 | 4 | concats_stable = 0; |
1309 | 13.2k | } |
1310 | 13.3k | } |
1311 | 30.0k | |
1312 | 30.0k | /* Add piece. */ |
1313 | 30.0k | switch (piece->body.storage_type) { |
1314 | 2.23k | case MVM_STRING_GRAPHEME_32: { |
1315 | 2.23k | MVMint64 pgraphs = MVM_string_graphs(tc, piece); |
1316 | 2.23k | memcpy( |
1317 | 2.23k | result->body.storage.blob_32 + position, |
1318 | 2.23k | piece->body.storage.blob_32, |
1319 | 2.23k | pgraphs * sizeof(MVMGrapheme32)); |
1320 | 2.23k | position += pgraphs; |
1321 | 2.23k | break; |
1322 | 2.23k | } |
1323 | 2.23k | /* XXX Can special-case 8-bit NFG and ASCII here too. */ |
1324 | 27.8k | default: |
1325 | 27.8k | MVM_string_gi_init(tc, &gi, piece); |
1326 | 711k | while (MVM_string_gi_has_more(tc, &gi)) |
1327 | 683k | result->body.storage.blob_32[position++] = |
1328 | 683k | MVM_string_gi_get_grapheme(tc, &gi); |
1329 | 27.8k | break; |
1330 | 30.0k | } |
1331 | 30.0k | } |
1332 | 16.7k | } |
1333 | 16.7k | |
1334 | 16.7k | MVM_free(pieces); |
1335 | 16.7k | STRAND_CHECK(tc, result); |
1336 | 16.7k | return concats_stable ? result : re_nfg(tc, result); |
1337 | 16.7k | } |
1338 | | |
1339 | | /* Returning nonzero means it found the char at the position specified in 'a' in 'b'. |
1340 | | * For character enumerations in regexes. */ |
1341 | 250k | MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *b) { |
1342 | 250k | MVMuint32 bgraphs; |
1343 | 250k | MVMGrapheme32 search; |
1344 | 250k | |
1345 | 250k | MVM_string_check_arg(tc, a, "char_at_in_string"); |
1346 | 250k | MVM_string_check_arg(tc, b, "char_at_in_string"); |
1347 | 250k | |
1348 | 250k | /* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */ |
1349 | 250k | if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, a)) |
1350 | 10.6k | return -2; |
1351 | 250k | |
1352 | 239k | search = MVM_string_get_grapheme_at_nocheck(tc, a, offset); |
1353 | 239k | bgraphs = MVM_string_graphs_nocheck(tc, b); |
1354 | 239k | switch (b->body.storage_type) { |
1355 | 124k | case MVM_STRING_GRAPHEME_32: { |
1356 | 124k | MVMStringIndex i; |
1357 | 810k | for (i = 0; i < bgraphs; i++) |
1358 | 771k | if (b->body.storage.blob_32[i] == search) |
1359 | 85.0k | return i; |
1360 | 39.3k | break; |
1361 | 124k | } |
1362 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
1363 | 0 | if (search >= 0 && search <= 127) { |
1364 | 0 | MVMStringIndex i; |
1365 | 0 | for (i = 0; i < bgraphs; i++) |
1366 | 0 | if (b->body.storage.blob_ascii[i] == search) |
1367 | 0 | return i; |
1368 | 0 | } |
1369 | 0 | break; |
1370 | 115k | case MVM_STRING_GRAPHEME_8: |
1371 | 115k | if (search >= -128 && search <= 127) { |
1372 | 115k | MVMStringIndex i; |
1373 | 256k | for (i = 0; i < bgraphs; i++) |
1374 | 172k | if (b->body.storage.blob_8[i] == search) |
1375 | 31.7k | return i; |
1376 | 115k | } |
1377 | 83.6k | break; |
1378 | 0 | case MVM_STRING_STRAND: { |
1379 | 0 | MVMGraphemeIter gi; |
1380 | 0 | MVMStringIndex i; |
1381 | 0 | MVM_string_gi_init(tc, &gi, b); |
1382 | 0 | for (i = 0; i < bgraphs; i++) |
1383 | 0 | if (MVM_string_gi_get_grapheme(tc, &gi) == search) |
1384 | 0 | return i; |
1385 | 0 | } |
1386 | 239k | } |
1387 | 123k | return -1; |
1388 | 239k | } |
1389 | | |
1390 | 0 | MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) { |
1391 | 0 | MVMGrapheme32 g; |
1392 | 0 | MVMCodepoint cp; |
1393 | 0 |
|
1394 | 0 | MVM_string_check_arg(tc, s, "uniprop"); |
1395 | 0 |
|
1396 | 0 | if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s)) |
1397 | 0 | return 0; |
1398 | 0 |
|
1399 | 0 | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
1400 | 0 | if (g >= 0) |
1401 | 0 | cp = (MVMCodepoint)g; |
1402 | 0 | else |
1403 | 0 | cp = MVM_nfg_get_synthetic_info(tc, g)->base; |
1404 | 0 | return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code); |
1405 | 0 | } |
1406 | | |
1407 | | /* If the string is made up of strands, then produces a flattend string |
1408 | | * representing the exact same graphemes but without strands. Otherwise, |
1409 | | * returns the input string. Intended for strings that will be indexed |
1410 | | * into heavily (when evaluating regexes, for example). */ |
1411 | 20.5k | MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) { |
1412 | 20.5k | MVM_string_check_arg(tc, s, "indexingoptimized"); |
1413 | 20.5k | if (s->body.storage_type == MVM_STRING_STRAND) { |
1414 | 17.8k | MVMuint32 chars = MVM_string_graphs_nocheck(tc, s); |
1415 | 17.8k | MVMGrapheme32 *flat = MVM_malloc(chars * sizeof(MVMGrapheme32)); |
1416 | 17.8k | MVMuint32 i = 0; |
1417 | 17.8k | MVMString *res; |
1418 | 17.8k | MVMint8 can_fit_into_8bit = 1; |
1419 | 17.8k | |
1420 | 17.8k | MVMGraphemeIter gi; |
1421 | 17.8k | MVM_string_gi_init(tc, &gi, s); |
1422 | 380k | while (MVM_string_gi_has_more(tc, &gi)) { |
1423 | 362k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
1424 | 362k | if (g < -128 || g >= 128) |
1425 | 106 | can_fit_into_8bit = 0; |
1426 | 362k | flat[i++] = g; |
1427 | 362k | } |
1428 | 17.8k | |
1429 | 17.8k | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1430 | 17.8k | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1431 | 17.8k | res->body.storage.blob_32 = flat; |
1432 | 17.8k | res->body.num_graphs = chars; |
1433 | 17.8k | |
1434 | 17.8k | if (can_fit_into_8bit) |
1435 | 17.8k | turn_32bit_into_8bit_unchecked(tc, res); |
1436 | 17.8k | |
1437 | 17.8k | return res; |
1438 | 17.8k | } |
1439 | 2.69k | else { |
1440 | 2.69k | return s; |
1441 | 2.69k | } |
1442 | 20.5k | } |
1443 | | |
1444 | | /* Escapes a string, replacing various chars like \n with \\n. Can no doubt be |
1445 | | * further optimized. */ |
1446 | 561 | MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) { |
1447 | 561 | MVMString *res = NULL; |
1448 | 561 | MVMStringIndex spos = 0; |
1449 | 561 | MVMStringIndex bpos = 0; |
1450 | 561 | MVMStringIndex sgraphs, balloc; |
1451 | 561 | MVMGrapheme32 *buffer; |
1452 | 561 | MVMGrapheme32 crlf; |
1453 | 561 | MVMint8 can_fit_into_8bit = 1; |
1454 | 561 | |
1455 | 561 | MVM_string_check_arg(tc, s, "escape"); |
1456 | 561 | |
1457 | 561 | sgraphs = MVM_string_graphs_nocheck(tc, s); |
1458 | 561 | balloc = sgraphs; |
1459 | 561 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * balloc); |
1460 | 561 | |
1461 | 561 | crlf = MVM_nfg_crlf_grapheme(tc); |
1462 | 561 | |
1463 | 6.75k | for (; spos < sgraphs; spos++) { |
1464 | 6.19k | MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos); |
1465 | 6.19k | MVMGrapheme32 esc = 0; |
1466 | 6.19k | switch (graph) { |
1467 | 11 | case '\\': esc = '\\'; break; |
1468 | 0 | case 7: esc = 'a'; break; |
1469 | 1 | case '\b': esc = 'b'; break; |
1470 | 3 | case '\n': esc = 'n'; break; |
1471 | 3 | case '\r': esc = 'r'; break; |
1472 | 13 | case '\t': esc = 't'; break; |
1473 | 1 | case '\f': esc = 'f'; break; |
1474 | 2 | case '"': esc = '"'; break; |
1475 | 1 | case 27: esc = 'e'; break; |
1476 | 6.19k | } |
1477 | 6.19k | if (esc) { |
1478 | 35 | if (bpos + 2 > balloc) { |
1479 | 12 | balloc += 32; |
1480 | 12 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
1481 | 12 | } |
1482 | 35 | buffer[bpos++] = '\\'; |
1483 | 35 | buffer[bpos++] = esc; |
1484 | 35 | } |
1485 | 6.15k | else if (graph == crlf) { |
1486 | 1 | if (bpos + 4 > balloc) { |
1487 | 1 | balloc += 32; |
1488 | 1 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
1489 | 1 | } |
1490 | 1 | buffer[bpos++] = '\\'; |
1491 | 1 | buffer[bpos++] = 'r'; |
1492 | 1 | buffer[bpos++] = '\\'; |
1493 | 1 | buffer[bpos++] = 'n'; |
1494 | 1 | } |
1495 | 6.15k | else { |
1496 | 6.15k | if (bpos + 1 > balloc) { |
1497 | 11 | balloc += 32; |
1498 | 11 | buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc); |
1499 | 11 | } |
1500 | 6.15k | if (graph < -128 || graph >= 128) |
1501 | 0 | can_fit_into_8bit = 0; |
1502 | 6.15k | buffer[bpos++] = graph; |
1503 | 6.15k | } |
1504 | 6.19k | } |
1505 | 561 | |
1506 | 561 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1507 | 561 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1508 | 561 | res->body.storage.blob_32 = buffer; |
1509 | 561 | res->body.num_graphs = bpos; |
1510 | 561 | |
1511 | 561 | if (can_fit_into_8bit) |
1512 | 561 | turn_32bit_into_8bit_unchecked(tc, res); |
1513 | 561 | |
1514 | 561 | STRAND_CHECK(tc, res); |
1515 | 561 | return res; |
1516 | 561 | } |
1517 | | |
1518 | | /* Takes a string and reverses its characters. */ |
1519 | 1 | MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) { |
1520 | 1 | MVMString *res = NULL; |
1521 | 1 | MVMStringIndex spos = 0; |
1522 | 1 | MVMStringIndex sgraphs; |
1523 | 1 | MVMStringIndex rpos; |
1524 | 1 | |
1525 | 1 | MVM_string_check_arg(tc, s, "flip"); |
1526 | 1 | sgraphs = MVM_string_graphs_nocheck(tc, s); |
1527 | 1 | rpos = sgraphs; |
1528 | 1 | |
1529 | 1 | if (s->body.storage_type == MVM_STRING_GRAPHEME_8) { |
1530 | 1 | MVMGrapheme8 *rbuffer; |
1531 | 1 | rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs); |
1532 | 1 | |
1533 | 4 | for (; spos < sgraphs; spos++) |
1534 | 3 | rbuffer[--rpos] = s->body.storage.blob_8[spos]; |
1535 | 1 | |
1536 | 1 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1537 | 1 | res->body.storage_type = MVM_STRING_GRAPHEME_8; |
1538 | 1 | res->body.storage.blob_8 = rbuffer; |
1539 | 0 | } else { |
1540 | 0 | MVMGrapheme32 *rbuffer; |
1541 | 0 | rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs); |
1542 | 0 |
|
1543 | 0 | if (s->body.storage_type == MVM_STRING_GRAPHEME_32) |
1544 | 0 | for (; spos < sgraphs; spos++) |
1545 | 0 | rbuffer[--rpos] = s->body.storage.blob_32[spos]; |
1546 | 0 | else |
1547 | 0 | for (; spos < sgraphs; spos++) |
1548 | 0 | rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos); |
1549 | 0 |
|
1550 | 0 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1551 | 0 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1552 | 0 | res->body.storage.blob_32 = rbuffer; |
1553 | 0 | } |
1554 | 1 | |
1555 | 1 | res->body.num_graphs = sgraphs; |
1556 | 1 | |
1557 | 1 | STRAND_CHECK(tc, res); |
1558 | 1 | return res; |
1559 | 1 | } |
1560 | | |
1561 | | /* Compares two strings, returning -1, 0 or 1 to indicate less than, |
1562 | | * equal or greater than. */ |
1563 | 25.1k | MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
1564 | 25.1k | MVMStringIndex alen, blen, i, scanlen; |
1565 | 25.1k | |
1566 | 25.1k | MVM_string_check_arg(tc, a, "compare"); |
1567 | 25.1k | MVM_string_check_arg(tc, b, "compare"); |
1568 | 25.1k | |
1569 | 25.1k | /* Simple cases when one or both are zero length. */ |
1570 | 25.1k | alen = MVM_string_graphs_nocheck(tc, a); |
1571 | 25.1k | blen = MVM_string_graphs_nocheck(tc, b); |
1572 | 25.1k | if (alen == 0) |
1573 | 286 | return blen == 0 ? 0 : -1; |
1574 | 24.8k | if (blen == 0) |
1575 | 8.61k | return 1; |
1576 | 24.8k | |
1577 | 24.8k | /* Otherwise, need to scan them. */ |
1578 | 16.1k | scanlen = alen > blen ? blen : alen; |
1579 | 22.3k | for (i = 0; i < scanlen; i++) { |
1580 | 19.3k | MVMGrapheme32 ai = MVM_string_get_grapheme_at_nocheck(tc, a, i); |
1581 | 19.3k | MVMGrapheme32 bi = MVM_string_get_grapheme_at_nocheck(tc, b, i); |
1582 | 19.3k | if (ai != bi) |
1583 | 13.1k | return ai < bi ? -1 : 1; |
1584 | 19.3k | } |
1585 | 16.1k | |
1586 | 16.1k | /* All shared chars equal, so go on length. */ |
1587 | 3.07k | return alen < blen ? -1 : |
1588 | 3.07k | alen > blen ? 1 : |
1589 | 3.07k | 0 ; |
1590 | 16.1k | } |
1591 | | |
1592 | | /* Takes two strings and AND's their characters. */ |
1593 | 1 | MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
1594 | 1 | MVMString *res; |
1595 | 1 | MVMGrapheme32 *buffer; |
1596 | 1 | MVMStringIndex i, alen, blen, sgraphs; |
1597 | 1 | |
1598 | 1 | MVM_string_check_arg(tc, a, "bitwise and"); |
1599 | 1 | MVM_string_check_arg(tc, b, "bitwise and"); |
1600 | 1 | |
1601 | 1 | alen = MVM_string_graphs_nocheck(tc, a); |
1602 | 1 | blen = MVM_string_graphs_nocheck(tc, b); |
1603 | 1 | sgraphs = alen < blen ? alen : blen; |
1604 | 1 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs); |
1605 | 1 | |
1606 | 1 | /* Binary-and up to the length of the shortest string. */ |
1607 | 11 | for (i = 0; i < sgraphs; i++) |
1608 | 10 | buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i) |
1609 | 10 | & MVM_string_get_grapheme_at_nocheck(tc, b, i)); |
1610 | 1 | |
1611 | 1 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1612 | 1 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1613 | 1 | res->body.storage.blob_32 = buffer; |
1614 | 1 | res->body.num_graphs = sgraphs; |
1615 | 1 | |
1616 | 1 | STRAND_CHECK(tc, res); |
1617 | 1 | return res; |
1618 | 1 | } |
1619 | | |
1620 | | /* Takes two strings and OR's their characters. */ |
1621 | 1 | MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
1622 | 1 | MVMString *res; |
1623 | 1 | MVMGrapheme32 *buffer; |
1624 | 1 | MVMStringIndex alen, blen, sgraphs, i, scanlen; |
1625 | 1 | |
1626 | 1 | MVM_string_check_arg(tc, a, "bitwise or"); |
1627 | 1 | MVM_string_check_arg(tc, b, "bitwise or"); |
1628 | 1 | |
1629 | 1 | alen = MVM_string_graphs_nocheck(tc, a); |
1630 | 1 | blen = MVM_string_graphs_nocheck(tc, b); |
1631 | 1 | sgraphs = (alen > blen ? alen : blen); |
1632 | 1 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs); |
1633 | 1 | |
1634 | 1 | /* First, binary-or up to the length of the shortest string. */ |
1635 | 1 | scanlen = alen > blen ? blen : alen; |
1636 | 11 | for (i = 0; i < scanlen; i++) |
1637 | 10 | buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i) |
1638 | 10 | | MVM_string_get_grapheme_at_nocheck(tc, b, i)); |
1639 | 1 | |
1640 | 1 | /* Second pass, fill with characters of the longest string. */ |
1641 | 1 | if (alen > blen) |
1642 | 2 | for (; i < sgraphs; i++) |
1643 | 1 | buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i); |
1644 | 1 | else |
1645 | 0 | for (; i < sgraphs; i++) |
1646 | 0 | buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i); |
1647 | 1 | |
1648 | 1 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1649 | 1 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1650 | 1 | res->body.storage.blob_32 = buffer; |
1651 | 1 | res->body.num_graphs = sgraphs; |
1652 | 1 | |
1653 | 1 | STRAND_CHECK(tc, res); |
1654 | 1 | return res; |
1655 | 1 | } |
1656 | | |
1657 | | /* Takes two strings and XOR's their characters. */ |
1658 | 1 | MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) { |
1659 | 1 | MVMString *res; |
1660 | 1 | MVMGrapheme32 *buffer; |
1661 | 1 | MVMStringIndex alen, blen, sgraphs, i, scanlen; |
1662 | 1 | |
1663 | 1 | MVM_string_check_arg(tc, a, "bitwise xor"); |
1664 | 1 | MVM_string_check_arg(tc, b, "bitwise xor"); |
1665 | 1 | |
1666 | 1 | alen = MVM_string_graphs_nocheck(tc, a); |
1667 | 1 | blen = MVM_string_graphs_nocheck(tc, b); |
1668 | 1 | sgraphs = (alen > blen ? alen : blen); |
1669 | 1 | buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs); |
1670 | 1 | |
1671 | 1 | /* First, binary-xor up to the length of the shorter string. */ |
1672 | 1 | scanlen = alen > blen ? blen : alen; |
1673 | 2 | for (i = 0; i < scanlen; i++) |
1674 | 1 | buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i) |
1675 | 1 | ^ MVM_string_get_grapheme_at_nocheck(tc, b, i)); |
1676 | 1 | |
1677 | 1 | /* Second pass, fill with characters of the longest string. */ |
1678 | 1 | if (alen > blen) |
1679 | 2 | for (; i < sgraphs; i++) |
1680 | 1 | buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i); |
1681 | 1 | else |
1682 | 0 | for (; i < sgraphs; i++) |
1683 | 0 | buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i); |
1684 | 1 | |
1685 | 1 | res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
1686 | 1 | res->body.storage_type = MVM_STRING_GRAPHEME_32; |
1687 | 1 | res->body.storage.blob_32 = buffer; |
1688 | 1 | res->body.num_graphs = sgraphs; |
1689 | 1 | |
1690 | 1 | STRAND_CHECK(tc, res); |
1691 | 1 | return res; |
1692 | 1 | } |
1693 | | |
1694 | | /* The following statics hold on to various unicode property values we will |
1695 | | * resolve once so we don't have to do it repeatedly. */ |
1696 | | static MVMint64 UPV_Nd = 0; |
1697 | | static MVMint64 UPV_Lu = 0; |
1698 | | static MVMint64 UPV_Ll = 0; |
1699 | | static MVMint64 UPV_Lt = 0; |
1700 | | static MVMint64 UPV_Lm = 0; |
1701 | | static MVMint64 UPV_Lo = 0; |
1702 | | static MVMint64 UPV_Zs = 0; |
1703 | | static MVMint64 UPV_Zl = 0; |
1704 | | static MVMint64 UPV_Pc = 0; |
1705 | | static MVMint64 UPV_Pd = 0; |
1706 | | static MVMint64 UPV_Ps = 0; |
1707 | | static MVMint64 UPV_Pe = 0; |
1708 | | static MVMint64 UPV_Pi = 0; |
1709 | | static MVMint64 UPV_Pf = 0; |
1710 | | static MVMint64 UPV_Po = 0; |
1711 | | |
1712 | | /* concatenating with "" ensures that only literal strings are accepted as argument. */ |
1713 | 1.95k | #define STR_WITH_LEN(str) ("" str ""), (sizeof(str) - 1) |
1714 | | |
1715 | | /* Resolves various unicode property values that we'll need. */ |
1716 | 130 | void MVM_string_cclass_init(MVMThreadContext *tc) { |
1717 | 130 | UPV_Nd = MVM_unicode_cname_to_property_value_code(tc, |
1718 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Nd")); |
1719 | 130 | UPV_Lu = MVM_unicode_cname_to_property_value_code(tc, |
1720 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lu")); |
1721 | 130 | UPV_Ll = MVM_unicode_cname_to_property_value_code(tc, |
1722 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ll")); |
1723 | 130 | UPV_Lt = MVM_unicode_cname_to_property_value_code(tc, |
1724 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lt")); |
1725 | 130 | UPV_Lm = MVM_unicode_cname_to_property_value_code(tc, |
1726 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lm")); |
1727 | 130 | UPV_Lo = MVM_unicode_cname_to_property_value_code(tc, |
1728 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lo")); |
1729 | 130 | UPV_Zs = MVM_unicode_cname_to_property_value_code(tc, |
1730 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zs")); |
1731 | 130 | UPV_Zl = MVM_unicode_cname_to_property_value_code(tc, |
1732 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zl")); |
1733 | 130 | UPV_Pc = MVM_unicode_cname_to_property_value_code(tc, |
1734 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pc")); |
1735 | 130 | UPV_Pd = MVM_unicode_cname_to_property_value_code(tc, |
1736 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pd")); |
1737 | 130 | UPV_Ps = MVM_unicode_cname_to_property_value_code(tc, |
1738 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ps")); |
1739 | 130 | UPV_Pe = MVM_unicode_cname_to_property_value_code(tc, |
1740 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pe")); |
1741 | 130 | UPV_Pi = MVM_unicode_cname_to_property_value_code(tc, |
1742 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pi")); |
1743 | 130 | UPV_Pf = MVM_unicode_cname_to_property_value_code(tc, |
1744 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pf")); |
1745 | 130 | UPV_Po = MVM_unicode_cname_to_property_value_code(tc, |
1746 | 130 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Po")); |
1747 | 130 | } |
1748 | | |
1749 | | /* Checks if the specified grapheme is in the given character class. */ |
1750 | 1.68M | static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) { |
1751 | 1.68M | /* If it's a synthetic, then grab the base codepoint. */ |
1752 | 1.68M | MVMCodepoint cp; |
1753 | 1.68M | if (g >= 0) |
1754 | 1.68M | cp = (MVMCodepoint)g; |
1755 | 1.68M | else |
1756 | 4 | cp = MVM_nfg_get_synthetic_info(tc, g)->base; |
1757 | 1.68M | |
1758 | 1.68M | switch (cclass) { |
1759 | 5.33k | case MVM_CCLASS_ANY: |
1760 | 5.33k | return 1; |
1761 | 5.33k | |
1762 | 230 | case MVM_CCLASS_UPPERCASE: |
1763 | 230 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1764 | 230 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu); |
1765 | 5.33k | |
1766 | 243 | case MVM_CCLASS_LOWERCASE: |
1767 | 243 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1768 | 243 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll); |
1769 | 5.33k | |
1770 | 519k | case MVM_CCLASS_WORD: |
1771 | 519k | if (cp <= 'z') { /* short circuit common case */ |
1772 | 518k | if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9')) |
1773 | 351k | return 1; |
1774 | 518k | else |
1775 | 167k | return 0; |
1776 | 518k | } |
1777 | 519k | /* Deliberate fall-through; word is _ or digit or alphabetic. */ |
1778 | 519k | |
1779 | 870 | case MVM_CCLASS_ALPHANUMERIC: |
1780 | 870 | if (cp <= '9' && cp >= '0') /* short circuit common case */ |
1781 | 5 | return 1; |
1782 | 865 | if (MVM_unicode_codepoint_has_property_value(tc, cp, |
1783 | 865 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd)) |
1784 | 0 | return 1; |
1785 | 865 | /* Deliberate fall-through; alphanumeric is digit or alphabetic. */ |
1786 | 865 | |
1787 | 165k | case MVM_CCLASS_ALPHABETIC: |
1788 | 165k | if (cp <= 'z') { /* short circuit common case */ |
1789 | 163k | if (cp >= 'a' || (cp >= 'A' && cp <= 'Z')) |
1790 | 89.6k | return 1; |
1791 | 163k | else |
1792 | 73.3k | return 0; |
1793 | 163k | } |
1794 | 2.07k | return |
1795 | 2.07k | MVM_unicode_codepoint_has_property_value(tc, cp, |
1796 | 2.07k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lo) /* lots of CJK chars */ |
1797 | 2.07k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1798 | 2.07k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll) /* (ascii handled above) */ |
1799 | 2.04k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1800 | 2.04k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu) |
1801 | 2.04k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1802 | 2.04k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lt) |
1803 | 2.04k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1804 | 2.04k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lm); |
1805 | 165k | |
1806 | 281k | case MVM_CCLASS_NUMERIC: |
1807 | 281k | if (cp <= '9' && cp >= '0') /* short circuit common case */ |
1808 | 106k | return 1; |
1809 | 174k | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1810 | 174k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd); |
1811 | 281k | |
1812 | 185 | case MVM_CCLASS_HEXADECIMAL: |
1813 | 185 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1814 | 185 | MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1); |
1815 | 281k | |
1816 | 147k | case MVM_CCLASS_WHITESPACE: |
1817 | 147k | if (cp <= '~') { |
1818 | 147k | if (cp == ' ' || (cp <= 13 && cp >= 9)) |
1819 | 33.4k | return 1; |
1820 | 147k | else |
1821 | 114k | return 0; |
1822 | 147k | } |
1823 | 102 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1824 | 102 | MVM_UNICODE_PROPERTY_WHITE_SPACE, 1); |
1825 | 147k | |
1826 | 63 | case MVM_CCLASS_BLANK: |
1827 | 63 | if (cp == '\t') |
1828 | 7 | return 1; |
1829 | 56 | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1830 | 56 | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs); |
1831 | 63 | |
1832 | 60 | case MVM_CCLASS_CONTROL: { |
1833 | 60 | return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160); |
1834 | 63 | } |
1835 | 63 | |
1836 | 55 | case MVM_CCLASS_PRINTING: { |
1837 | 55 | return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160)); |
1838 | 63 | } |
1839 | 63 | |
1840 | 7.45k | case MVM_CCLASS_PUNCTUATION: |
1841 | 7.45k | return |
1842 | 7.45k | MVM_unicode_codepoint_has_property_value(tc, cp, |
1843 | 7.45k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pc) |
1844 | 7.45k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1845 | 7.45k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pd) |
1846 | 7.45k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1847 | 7.45k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ps) |
1848 | 7.44k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1849 | 7.44k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pe) |
1850 | 7.44k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1851 | 7.44k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pi) |
1852 | 7.44k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1853 | 7.44k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pf) |
1854 | 7.44k | || MVM_unicode_codepoint_has_property_value(tc, cp, |
1855 | 7.44k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Po); |
1856 | 63 | |
1857 | 559k | case MVM_CCLASS_NEWLINE: { |
1858 | 559k | if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' || |
1859 | 541k | cp == 0x85 || cp == 0x2028 || cp == 0x2029) |
1860 | 18.0k | return 1; |
1861 | 541k | return MVM_unicode_codepoint_has_property_value(tc, cp, |
1862 | 541k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zl); |
1863 | 559k | } |
1864 | 559k | |
1865 | 0 | default: |
1866 | 0 | return 0; |
1867 | 1.68M | } |
1868 | 1.68M | } |
1869 | | |
1870 | | /* Checks if the character at the specified offset is a member of the |
1871 | | * indicated character class. */ |
1872 | 1.14M | MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) { |
1873 | 1.14M | MVMGrapheme32 g; |
1874 | 1.14M | MVM_string_check_arg(tc, s, "is_cclass"); |
1875 | 1.14M | if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s)) |
1876 | 16 | return 0; |
1877 | 1.14M | g = MVM_string_get_grapheme_at_nocheck(tc, s, offset); |
1878 | 1.14M | return grapheme_is_cclass(tc, cclass, g); |
1879 | 1.14M | } |
1880 | | |
1881 | | /* Searches for the next char that is in the specified character class. */ |
1882 | 14.2k | MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) { |
1883 | 14.2k | MVMGraphemeIter gi; |
1884 | 14.2k | MVMint64 length, end, pos; |
1885 | 14.2k | |
1886 | 14.2k | MVM_string_check_arg(tc, s, "find_cclass"); |
1887 | 14.2k | |
1888 | 14.2k | length = MVM_string_graphs_nocheck(tc, s); |
1889 | 14.2k | end = offset + count; |
1890 | 13.4k | end = length < end ? length : end; |
1891 | 14.2k | if (offset < 0 || offset >= length) |
1892 | 135 | return end; |
1893 | 14.2k | |
1894 | 14.0k | MVM_string_gi_init(tc, &gi, s); |
1895 | 14.0k | MVM_string_gi_move_to(tc, &gi, offset); |
1896 | 445k | for (pos = offset; pos < end; pos++) { |
1897 | 444k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
1898 | 444k | if (grapheme_is_cclass(tc, cclass, g) > 0) |
1899 | 13.4k | return pos; |
1900 | 444k | } |
1901 | 14.0k | |
1902 | 609 | return end; |
1903 | 14.0k | } |
1904 | | |
1905 | | /* Searches for the next char that is not in the specified character class. */ |
1906 | 19.7k | MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) { |
1907 | 19.7k | MVMGraphemeIter gi; |
1908 | 19.7k | MVMint64 length, end, pos; |
1909 | 19.7k | |
1910 | 19.7k | MVM_string_check_arg(tc, s, "find_not_cclass"); |
1911 | 19.7k | |
1912 | 19.7k | length = MVM_string_graphs_nocheck(tc, s); |
1913 | 19.7k | end = offset + count; |
1914 | 19.1k | end = length < end ? length : end; |
1915 | 19.7k | if (offset < 0 || offset >= length) |
1916 | 534 | return end; |
1917 | 19.7k | |
1918 | 19.2k | MVM_string_gi_init(tc, &gi, s); |
1919 | 19.2k | MVM_string_gi_move_to(tc, &gi, offset); |
1920 | 97.5k | for (pos = offset; pos < end; pos++) { |
1921 | 97.5k | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi); |
1922 | 97.5k | if (grapheme_is_cclass(tc, cclass, g) == 0) |
1923 | 19.2k | return pos; |
1924 | 97.5k | } |
1925 | 19.2k | |
1926 | 26 | return end; |
1927 | 19.2k | } |
1928 | | |
1929 | | static MVMint16 encoding_name_init = 0; |
1930 | | static MVMString *encoding_utf8_name = NULL; |
1931 | | static MVMString *encoding_ascii_name = NULL; |
1932 | | static MVMString *encoding_latin1_name = NULL; |
1933 | | static MVMString *encoding_utf16_name = NULL; |
1934 | | static MVMString *encoding_windows1252_name = NULL; |
1935 | | static MVMString *encoding_utf8_c8_name = NULL; |
1936 | 189 | MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) { |
1937 | 189 | MVM_string_check_arg(tc, name, "find encoding"); |
1938 | 189 | if (!encoding_name_init) { |
1939 | 130 | MVM_gc_allocate_gen2_default_set(tc); |
1940 | 130 | encoding_utf8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8"); |
1941 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name"); |
1942 | 130 | encoding_ascii_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii"); |
1943 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name"); |
1944 | 130 | encoding_latin1_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1"); |
1945 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name"); |
1946 | 130 | encoding_utf16_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16"); |
1947 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name"); |
1948 | 130 | encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252"); |
1949 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name"); |
1950 | 130 | encoding_utf8_c8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8"); |
1951 | 130 | MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name"); |
1952 | 130 | encoding_name_init = 1; |
1953 | 130 | MVM_gc_allocate_gen2_default_clear(tc); |
1954 | 130 | } |
1955 | 189 | |
1956 | 189 | if (MVM_string_equal(tc, name, encoding_utf8_name)) { |
1957 | 182 | return MVM_encoding_type_utf8; |
1958 | 182 | } |
1959 | 7 | else if (MVM_string_equal(tc, name, encoding_ascii_name)) { |
1960 | 3 | return MVM_encoding_type_ascii; |
1961 | 3 | } |
1962 | 4 | else if (MVM_string_equal(tc, name, encoding_latin1_name)) { |
1963 | 1 | return MVM_encoding_type_latin1; |
1964 | 1 | } |
1965 | 3 | else if (MVM_string_equal(tc, name, encoding_windows1252_name)) { |
1966 | 0 | return MVM_encoding_type_windows1252; |
1967 | 0 | } |
1968 | 3 | else if (MVM_string_equal(tc, name, encoding_utf16_name)) { |
1969 | 2 | return MVM_encoding_type_utf16; |
1970 | 2 | } |
1971 | 1 | else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) { |
1972 | 0 | return MVM_encoding_type_utf8_c8; |
1973 | 0 | } |
1974 | 1 | else { |
1975 | 1 | char *c_name = MVM_string_utf8_encode_C_string(tc, name); |
1976 | 1 | char *waste[] = { c_name, NULL }; |
1977 | 1 | MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'", |
1978 | 1 | c_name); |
1979 | 1 | } |
1980 | 189 | } |
1981 | | |
1982 | | /* Turns a codepoint into a string. If required uses the normalizer to ensure |
1983 | | * that we get a valid NFG string (NFG is a superset of NFC, and singleton |
1984 | | * decompositions exist). */ |
1985 | 8.43k | MVMString * MVM_string_chr(MVMThreadContext *tc, MVMCodepoint cp) { |
1986 | 8.43k | MVMString *s; |
1987 | 8.43k | MVMGrapheme32 g; |
1988 | 8.43k | |
1989 | 8.43k | if (cp < 0) |
1990 | 0 | MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative"); |
1991 | 8.43k | /* If the codepoint decomposes we may need to normalize it. |
1992 | 8.43k | * The first cp that decomposes is U+0340, but to be on the safe side |
1993 | 8.43k | * for now we go with the first significant character which at the time |
1994 | 8.43k | * of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */ |
1995 | 8.43k | if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC |
1996 | 189 | && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE) |
1997 | 189 | != MVM_UNICODE_PVALUE_DT_NONE) { |
1998 | 72 | |
1999 | 72 | MVMNormalizer norm; |
2000 | 72 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
2001 | 72 | if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) { |
2002 | 72 | MVM_unicode_normalizer_eof(tc, &norm); |
2003 | 72 | g = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
2004 | 72 | } |
2005 | 72 | MVM_unicode_normalizer_cleanup(tc, &norm); |
2006 | 72 | } |
2007 | 8.36k | else { |
2008 | 8.36k | g = (MVMGrapheme32) cp; |
2009 | 8.36k | } |
2010 | 8.43k | |
2011 | 8.43k | s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString)); |
2012 | 8.43k | if (g >= -128 && g < 128) { |
2013 | 8.23k | s->body.storage_type = MVM_STRING_GRAPHEME_8; |
2014 | 8.23k | s->body.storage.blob_8 = MVM_malloc(sizeof(MVMGrapheme8)); |
2015 | 8.23k | s->body.storage.blob_8[0] = g; |
2016 | 202 | } else { |
2017 | 202 | s->body.storage_type = MVM_STRING_GRAPHEME_32; |
2018 | 202 | s->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32)); |
2019 | 202 | s->body.storage.blob_32[0] = g; |
2020 | 202 | } |
2021 | 8.43k | s->body.num_graphs = 1; |
2022 | 8.43k | return s; |
2023 | 8.43k | } |
2024 | | |
2025 | | /* Takes a string and computes a hash code for it, storing it in the hash code |
2026 | | * cache field of the string. Hashing code is derived from the Jenkins hash |
2027 | | * implementation in uthash.h. */ |
2028 | | typedef union { |
2029 | | MVMint32 graphs[3]; |
2030 | | unsigned char bytes[12]; |
2031 | | } MVMJenHashGraphemeView; |
2032 | 784k | void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) { |
2033 | 784k | /* The hash algorithm works in bytes. Since we can represent strings in a |
2034 | 784k | * number of ways, and we want consistent hashing, then we'll read the |
2035 | 784k | * strings using the grapheme iterator in groups of 3, using 32-bit ints |
2036 | 784k | * for the graphemes no matter what the string really holds them as. Then |
2037 | 784k | * we'll use the bytes view of that in the hashing function. */ |
2038 | 784k | MVMJenHashGraphemeView hash_block; |
2039 | 784k | MVMGraphemeIter gi; |
2040 | 784k | MVMuint32 graphs_remaining = MVM_string_graphs(tc, s); |
2041 | 784k | |
2042 | 784k | /* Initialize hash state. */ |
2043 | 784k | MVMuint32 hashv = 0xfeedbeef; |
2044 | 784k | MVMuint32 _hj_i, _hj_j; |
2045 | 784k | _hj_i = _hj_j = 0x9e3779b9; |
2046 | 784k | |
2047 | 784k | /* Work through the string 3 graphemes at a time. */ |
2048 | 784k | MVM_string_gi_init(tc, &gi, s); |
2049 | 2.99M | while (graphs_remaining >= 3) { |
2050 | 2.21M | hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi); |
2051 | 2.21M | hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi); |
2052 | 2.21M | hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi); |
2053 | 2.21M | _hj_i += (hash_block.bytes[0] + ( (unsigned)hash_block.bytes[1] << 8 ) |
2054 | 2.21M | + ( (unsigned)hash_block.bytes[2] << 16 ) |
2055 | 2.21M | + ( (unsigned)hash_block.bytes[3] << 24 ) ); |
2056 | 2.21M | _hj_j += (hash_block.bytes[4] + ( (unsigned)hash_block.bytes[5] << 8 ) |
2057 | 2.21M | + ( (unsigned)hash_block.bytes[6] << 16 ) |
2058 | 2.21M | + ( (unsigned)hash_block.bytes[7] << 24 ) ); |
2059 | 2.21M | hashv += (hash_block.bytes[8] + ( (unsigned)hash_block.bytes[9] << 8 ) |
2060 | 2.21M | + ( (unsigned)hash_block.bytes[10] << 16 ) |
2061 | 2.21M | + ( (unsigned)hash_block.bytes[11] << 24 ) ); |
2062 | 2.21M | |
2063 | 2.21M | HASH_JEN_MIX(_hj_i, _hj_j, hashv); |
2064 | 2.21M | |
2065 | 2.21M | graphs_remaining -= 3; |
2066 | 2.21M | } |
2067 | 784k | |
2068 | 784k | /* Mix in key length (in bytes, not graphemes). */ |
2069 | 784k | hashv += MVM_string_graphs(tc, s) * sizeof(MVMGrapheme32); |
2070 | 784k | |
2071 | 784k | /* Now handle trailing graphemes (must be 2, 1, or 0). */ |
2072 | 784k | switch (graphs_remaining) { |
2073 | 260k | case 2: |
2074 | 260k | hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi); |
2075 | 260k | _hj_j += ( (unsigned)hash_block.bytes[7] << 24 ) + |
2076 | 260k | ( (unsigned)hash_block.bytes[6] << 16 ) + |
2077 | 260k | ( (unsigned)hash_block.bytes[5] << 8 ) + |
2078 | 260k | hash_block.bytes[4]; |
2079 | 260k | /* Fallthrough */ |
2080 | 516k | case 1: |
2081 | 516k | hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi); |
2082 | 516k | _hj_i += ( (unsigned)hash_block.bytes[3] << 24 ) + |
2083 | 516k | ( (unsigned)hash_block.bytes[2] << 16 ) + |
2084 | 516k | ( (unsigned)hash_block.bytes[1] << 8 ) + |
2085 | 516k | hash_block.bytes[0]; |
2086 | 784k | } |
2087 | 784k | HASH_JEN_MIX(_hj_i, _hj_j, hashv); |
2088 | 784k | |
2089 | 784k | /* Store computed hash value. */ |
2090 | 784k | s->body.cached_hash_code = hashv; |
2091 | 784k | } |