/home/travis/build/MoarVM/MoarVM/src/strings/iter.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* Grapheme iterator structure; iterates through graphemes in a string. */ |
2 | | struct MVMGraphemeIter { |
3 | | /* The blob we're currently iterating over. */ |
4 | | union { |
5 | | MVMGrapheme32 *blob_32; |
6 | | MVMGraphemeASCII *blob_ascii; |
7 | | MVMGrapheme8 *blob_8; |
8 | | void *any; |
9 | | } active_blob; |
10 | | |
11 | | /* The type of blob we have. */ |
12 | | MVMuint16 blob_type; |
13 | | |
14 | | /* The number of strands remaining, if any. */ |
15 | | MVMuint16 strands_remaining; |
16 | | |
17 | | /* The current position, and the end position. */ |
18 | | MVMStringIndex pos; |
19 | | MVMStringIndex end; |
20 | | |
21 | | /* Repetition count, and the start index in the blob (only needed if we're |
22 | | * doing an iteration over a repetition). */ |
23 | | MVMStringIndex start; |
24 | | MVMuint32 repetitions; |
25 | | |
26 | | /* The next strand, if we're doing a strand-based iteration. */ |
27 | | MVMStringStrand *next_strand; |
28 | | }; |
29 | | |
30 | | /* Initializes a grapheme iterator. */ |
31 | 0 | MVM_STATIC_INLINE void MVM_string_gi_init(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *s) { |
32 | 0 | if (s->body.storage_type == MVM_STRING_STRAND) { |
33 | 0 | MVMStringStrand *strands = s->body.storage.strands; |
34 | 0 | MVMString *first = strands[0].blob_string; |
35 | 0 | gi->active_blob.any = first->body.storage.any; |
36 | 0 | gi->blob_type = first->body.storage_type; |
37 | 0 | gi->strands_remaining = s->body.num_strands - 1; |
38 | 0 | gi->pos = gi->start = strands[0].start; |
39 | 0 | gi->end = strands[0].end; |
40 | 0 | gi->repetitions = strands[0].repetitions; |
41 | 0 | gi->next_strand = strands + 1; |
42 | 0 | } |
43 | 0 | else { |
44 | 0 | gi->active_blob.any = s->body.storage.any; |
45 | 0 | gi->blob_type = s->body.storage_type; |
46 | 0 | gi->end = s->body.num_graphs; |
47 | 0 | gi->strands_remaining = gi->start = gi->pos = gi->repetitions = 0; |
48 | 0 | } |
49 | 0 | }; |
50 | | /* Gets the number of graphemes remaining in the current strand of the grapheme |
51 | | * iterator, including repetitions */ |
52 | | #define MVM_string_gi_graphs_left_in_strand_rep(gi) \ |
53 | 0 | (gi->end - gi->pos + gi->repetitions * (gi->end - gi->start)) |
54 | | /* graphs left in strand + graphs left in repetitions of current strand */ |
55 | | |
56 | | /* Moves to the next strand, or repetition if there is one. */ |
57 | 0 | static void MVM_string_gi_next_strand_rep(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
58 | 0 | MVMStringStrand *next = NULL; |
59 | 0 | if (gi->repetitions) { |
60 | 0 | gi->pos = gi->start; |
61 | 0 | gi->repetitions--; |
62 | 0 | return; |
63 | 0 | } |
64 | 0 | if (gi->strands_remaining <= 0) |
65 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator\n"); |
66 | 0 |
|
67 | 0 | next = (gi->next_strand)++; |
68 | 0 | gi->pos = gi->start = next->start; |
69 | 0 | gi->end = next->end; |
70 | 0 | gi->repetitions = next->repetitions; |
71 | 0 | gi->blob_type = next->blob_string->body.storage_type; |
72 | 0 | gi->active_blob.any = next->blob_string->body.storage.any; |
73 | 0 | gi->strands_remaining--; |
74 | 0 | } |
75 | | /* Sets the position of the iterator. (Can be optimized in many ways in the |
76 | | * repetitions and strands branches.) */ |
77 | 0 | MVM_STATIC_INLINE void MVM_string_gi_move_to(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMuint32 pos) { |
78 | 0 | MVMuint32 remaining = pos; |
79 | 0 | MVMuint32 strand_graphs; |
80 | 0 | MVMStringStrand *next = NULL; |
81 | 0 |
|
82 | 0 | /* Find the appropriate strand. */ |
83 | 0 | /* Set strand_graphs to the number of graphemes */ |
84 | 0 | while (remaining > (strand_graphs = MVM_string_gi_graphs_left_in_strand_rep(gi))) { |
85 | 0 | remaining -= strand_graphs; |
86 | 0 | if (!(gi->strands_remaining--)) |
87 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
88 | 0 | next = (gi->next_strand)++; |
89 | 0 | gi->pos = gi->start = next->start; |
90 | 0 | gi->end = next->end; |
91 | 0 | gi->repetitions = next->repetitions; |
92 | 0 | } |
93 | 0 | if (next) { |
94 | 0 | gi->blob_type = next->blob_string->body.storage_type; |
95 | 0 | gi->active_blob.any = next->blob_string->body.storage.any; |
96 | 0 | } |
97 | 0 |
|
98 | 0 | /* Now look within the strand. */ |
99 | 0 | if (!remaining) |
100 | 0 | return; |
101 | 0 | /* Most common case where we move within the strand */ |
102 | 0 | if (gi->pos + remaining <= gi->end) { |
103 | 0 | gi->pos += remaining; |
104 | 0 | return; |
105 | 0 | } |
106 | 0 | /* If we are here we are encountering a repetition */ |
107 | 0 | if (gi->repetitions) { |
108 | 0 | MVMuint32 rep_graphs = gi->end - gi->start; |
109 | 0 | MVMuint32 remaining_reps; |
110 | 0 | /* If we aren't at the end of the repetition, move to the end */ |
111 | 0 | if (gi->pos < gi->end) { |
112 | 0 | remaining -= gi->end - gi->pos; |
113 | 0 | gi->pos = gi->end; |
114 | 0 | } |
115 | 0 | remaining_reps = remaining / rep_graphs; |
116 | 0 | if (gi->repetitions < remaining_reps) |
117 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator:" |
118 | 0 | " no more repetitions remaining\n"); |
119 | 0 | gi->repetitions -= remaining_reps; |
120 | 0 | /* Since we're still at the end, if there's repetitions left over |
121 | 0 | * we are going to have to seek forward */ |
122 | 0 | if (remaining -= remaining_reps * rep_graphs) { |
123 | 0 | gi->repetitions--; /* Move to the next repetition. */ |
124 | 0 | gi->pos = gi->start + remaining; |
125 | 0 | /* remaining = 0 now for all purposes now, but since we return, no |
126 | 0 | * need to set it */ |
127 | 0 | } |
128 | 0 | return; |
129 | 0 | } |
130 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
131 | 0 | } |
132 | | |
133 | | /* Checks if there is more to read from a grapheme iterator. */ |
134 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_gi_has_more(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
135 | 0 | return gi->pos < gi->end || gi->repetitions || gi->strands_remaining; |
136 | 0 | } |
137 | | /* Returns number of graphs left in the strand, ignoring repetitions */ |
138 | 0 | MVM_STATIC_INLINE MVMStringIndex MVM_string_gi_graphs_left_in_strand(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
139 | 0 | return gi->end - gi->pos; |
140 | 0 | } |
141 | 0 | MVM_STATIC_INLINE MVMGrapheme8 * MVM_string_gi_active_blob_8_pos(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
142 | 0 | return gi->active_blob.blob_8 + gi->pos; |
143 | 0 | } |
144 | 0 | MVM_STATIC_INLINE MVMGrapheme32 * MVM_string_gi_active_blob_32_pos(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
145 | 0 | return gi->active_blob.blob_32 + gi->pos; |
146 | 0 | } |
147 | 0 | MVM_STATIC_INLINE MVMuint16 MVM_string_gi_blob_type(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
148 | 0 | return gi->blob_type; |
149 | 0 | } |
150 | | /* Returns if there are more strands left in the gi, including repetitions */ |
151 | 0 | MVM_STATIC_INLINE int MVM_string_gi_has_more_strands_rep(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
152 | 0 | return !!(gi->strands_remaining || gi->repetitions); |
153 | 0 | } |
154 | | /* Gets the next grapheme. */ |
155 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
156 | 0 | while (1) { |
157 | 0 | if (gi->pos < gi->end) { |
158 | 0 | switch (gi->blob_type) { |
159 | 0 | case MVM_STRING_GRAPHEME_32: |
160 | 0 | return gi->active_blob.blob_32[gi->pos++]; |
161 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
162 | 0 | return gi->active_blob.blob_ascii[gi->pos++]; |
163 | 0 | case MVM_STRING_GRAPHEME_8: |
164 | 0 | return gi->active_blob.blob_8[gi->pos++]; |
165 | 0 | } |
166 | 0 | } |
167 | 0 | else if (gi->repetitions) { |
168 | 0 | gi->pos = gi->start; |
169 | 0 | gi->repetitions--; |
170 | 0 | } |
171 | 0 | else if (gi->strands_remaining) { |
172 | 0 | MVMStringStrand *next = gi->next_strand; |
173 | 0 | gi->active_blob.any = next->blob_string->body.storage.any; |
174 | 0 | gi->blob_type = next->blob_string->body.storage_type; |
175 | 0 | gi->pos = next->start; |
176 | 0 | gi->end = next->end; |
177 | 0 | gi->start = next->start; |
178 | 0 | gi->repetitions = next->repetitions; |
179 | 0 | gi->strands_remaining--; |
180 | 0 | gi->next_strand++; |
181 | 0 | } |
182 | 0 | else { |
183 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
184 | 0 | } |
185 | 0 | } |
186 | 0 | } |
187 | | |
188 | | |
189 | | /* Returns the codepoint without doing checks, for internal VM use only. */ |
190 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString *a, MVMint64 index) { |
191 | 0 | switch (a->body.storage_type) { |
192 | 0 | case MVM_STRING_GRAPHEME_32: |
193 | 0 | return a->body.storage.blob_32[index]; |
194 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
195 | 0 | return a->body.storage.blob_ascii[index]; |
196 | 0 | case MVM_STRING_GRAPHEME_8: |
197 | 0 | return a->body.storage.blob_8[index]; |
198 | 0 | case MVM_STRING_STRAND: { |
199 | 0 | MVMGraphemeIter gi; |
200 | 0 | MVM_string_gi_init(tc, &gi, a); |
201 | 0 | MVM_string_gi_move_to(tc, &gi, index); |
202 | 0 | return MVM_string_gi_get_grapheme(tc, &gi); |
203 | 0 | } |
204 | 0 | default: |
205 | 0 | MVM_exception_throw_adhoc(tc, "String corruption detected: bad storage type"); |
206 | 0 | } |
207 | 0 | } |
208 | | |
209 | | /* Code point iterator. Uses the grapheme iterator, and adds some extra bits |
210 | | * in order to iterate the code points in synthetics. */ |
211 | | struct MVMCodepointIter { |
212 | | /* The grapheme iterator. */ |
213 | | MVMGraphemeIter gi; |
214 | | |
215 | | /* The codes of the current synthetic we're walking through, if any, with |
216 | | * the number of combiners we returned so far, and the total number of |
217 | | * combiners there are. */ |
218 | | MVMCodepoint *synth_codes; |
219 | | MVMint32 visited_synth_codes; |
220 | | MVMint32 total_synth_codes; |
221 | | /* first_code is only used for string_grapheme_ci functions */ |
222 | | MVMCodepoint first_code; |
223 | | /* If we should translate newline \n into \r\n. */ |
224 | | MVMint32 translate_newlines; |
225 | | /* Used to pass through utf8-c8 synthetics, but not any others so we can |
226 | | * renomalize text without getting rid of utf8-c8 synthetics */ |
227 | | MVMint32 pass_utfc8_synthetics; |
228 | | |
229 | | }; |
230 | | |
231 | | /* Initializes a code point iterator. */ |
232 | | MVM_STATIC_INLINE void MVM_string_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMString *s, |
233 | 0 | MVMint32 translate_newlines, MVMint32 pass_utfc8_synthetics) { |
234 | 0 | /* Initialize our underlying grapheme iterator. */ |
235 | 0 | MVM_string_gi_init(tc, &(ci->gi), s); |
236 | 0 |
|
237 | 0 | /* We've no currently active synthetic codepoint (and other fields are |
238 | 0 | * unused until we do, so leave them alone for now). */ |
239 | 0 | ci->synth_codes = NULL; |
240 | 0 | ci->translate_newlines = translate_newlines; |
241 | 0 | ci->pass_utfc8_synthetics = pass_utfc8_synthetics; |
242 | 0 | }; |
243 | | /* Iterates on a grapheme. Returns the number of codepoints in the grapheme */ |
244 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g, MVMint32 pass_utfc8_synthetics) { |
245 | 0 | MVMNFGSynthetic *synth = NULL; |
246 | 0 | if (g < 0) { |
247 | 0 | /* Get the synthetics info. */ |
248 | 0 | synth = MVM_nfg_get_synthetic_info(tc, g); |
249 | 0 | } |
250 | 0 | /* If we got a synth, but *not* if we are supposed to pass utf8-c8 synthetics |
251 | 0 | * and it is a utf8-c8 synthetic */ |
252 | 0 | if (synth && !(pass_utfc8_synthetics && synth->is_utf8_c8)) { |
253 | 0 | /* Set up the iterator so in the next iteration we will start to |
254 | 0 | * hand back codepoints after the initial. |
255 | 0 | * TODO: This may be able to be optimized |
256 | 0 | * to remove first_code. */ |
257 | 0 | ci->synth_codes = synth->codes + 1; |
258 | 0 | ci->visited_synth_codes = -1; |
259 | 0 | ci->total_synth_codes = synth->num_codes - 1; |
260 | 0 | ci->first_code = synth->codes[0]; |
261 | 0 | } |
262 | 0 | else { |
263 | 0 | ci->synth_codes = NULL; |
264 | 0 | ci->visited_synth_codes = -1; |
265 | 0 | ci->total_synth_codes = 0; |
266 | 0 | ci->first_code = g; |
267 | 0 | } |
268 | 0 | return ci->total_synth_codes + 1; |
269 | 0 | } |
270 | | /* Only for string_grapheme_ci ops */ |
271 | 0 | MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) { |
272 | 0 | MVMCodepoint result = ci->visited_synth_codes < 0 |
273 | 0 | ? ci->first_code |
274 | 0 | : ci->synth_codes[ci->visited_synth_codes]; |
275 | 0 | ci->visited_synth_codes++; |
276 | 0 | return result; |
277 | 0 | } |
278 | | |
279 | | /* Checks if there is more to read from a code point iterator; this is the |
280 | | * case if we're still walking through a synthetic or we have more things |
281 | | * available from the underlying grapheme iterator. */ |
282 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) { |
283 | 0 | return ci->synth_codes || MVM_string_gi_has_more(tc, &(ci->gi)); |
284 | 0 | } |
285 | | /* Only for use with string_grapheme_ci ops */ |
286 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_grapheme_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) { |
287 | 0 | return ci->visited_synth_codes < ci->total_synth_codes; |
288 | 0 | } |
289 | | |
290 | | /* Gets the next code point. */ |
291 | 0 | MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) { |
292 | 0 | MVMCodepoint result; |
293 | 0 |
|
294 | 0 | /* Do we have combiners from a synthetic to return? */ |
295 | 0 | if (ci->synth_codes) { |
296 | 0 | /* Take the current combiner as the result. */ |
297 | 0 | result = ci->synth_codes[ci->visited_synth_codes]; |
298 | 0 |
|
299 | 0 | /* If we've seen all of the synthetics, clear up so we'll take another |
300 | 0 | * grapheme next time around. */ |
301 | 0 | ci->visited_synth_codes++; |
302 | 0 | if (ci->visited_synth_codes == ci->total_synth_codes) |
303 | 0 | ci->synth_codes = NULL; |
304 | 0 | } |
305 | 0 |
|
306 | 0 | /* Otherwise, proceed to the next grapheme. */ |
307 | 0 | else { |
308 | 0 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &(ci->gi)); |
309 | 0 | #ifdef _WIN32 |
310 | 0 | if (ci->translate_newlines && g == '\n') |
311 | 0 | g = MVM_nfg_crlf_grapheme(tc); |
312 | 0 | #endif |
313 | 0 | if (g >= 0) { |
314 | 0 | /* It's not a synthetic, so we're done. */ |
315 | 0 | result = (MVMCodepoint)g; |
316 | 0 | } |
317 | 0 | else { |
318 | 0 | /* It's a synthetic. Look it up. */ |
319 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
320 | 0 | /* If we have pass_utfc8_synthetics set and it's a utf_c8 codepoint |
321 | 0 | * pass it back unchanged */ |
322 | 0 | if (ci->pass_utfc8_synthetics && synth->is_utf8_c8) { |
323 | 0 | result = g; |
324 | 0 | } |
325 | 0 | else { |
326 | 0 | /* Set up the iterator so in the next iteration we will start to |
327 | 0 | * hand back codepoints. */ |
328 | 0 | ci->synth_codes = synth->codes + 1; |
329 | 0 | ci->visited_synth_codes = 0; |
330 | 0 | /* Emulate num_combs and subtract one from num_codes */ |
331 | 0 | ci->total_synth_codes = synth->num_codes - 1; |
332 | 0 |
|
333 | 0 | /* Result is the first codepoint of the `codes` array */ |
334 | 0 | result = synth->codes[0]; |
335 | 0 | } |
336 | 0 | } |
337 | 0 | } |
338 | 0 |
|
339 | 0 | return result; |
340 | 0 | } |
341 | | /* The MVMGraphemeIter_cached is used for the Knuth-Morris-Pratt algorithm |
342 | | * because often it will request the same grapheme again, and our grapheme |
343 | | * iterators only return the next grapheme */ |
344 | | struct MVMGraphemeIter_cached { |
345 | | MVMGraphemeIter gi; |
346 | | MVMGrapheme32 last_g; |
347 | | MVMStringIndex last_location; |
348 | | MVMString *string; |
349 | | }; |
350 | | typedef struct MVMGraphemeIter_cached MVMGraphemeIter_cached; |
351 | 0 | MVM_STATIC_INLINE void MVM_string_gi_cached_init (MVMThreadContext *tc, MVMGraphemeIter_cached *gic, MVMString *s, MVMint64 index) { |
352 | 0 | MVM_string_gi_init(tc, &(gic->gi), s); |
353 | 0 | if (index) MVM_string_gi_move_to(tc, &(gic->gi), index); |
354 | 0 | gic->last_location = index; |
355 | 0 | gic->last_g = MVM_string_gi_get_grapheme(tc, &(gic->gi)); |
356 | 0 | gic->string = s; |
357 | 0 | } |
358 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_cached_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter_cached *gic, MVMint64 index) { |
359 | 0 | /* Most likely case is we are getting the next grapheme. When that happens |
360 | 0 | * we will go directly to the end. */ |
361 | 0 | if (index == gic->last_location + 1) { |
362 | 0 | } |
363 | 0 | /* Second most likely is getting the cached grapheme */ |
364 | 0 | else if (index == gic->last_location) { |
365 | 0 | return gic->last_g; |
366 | 0 | } |
367 | 0 | /* If we have to move forward */ |
368 | 0 | else if (gic->last_location < index) { |
369 | 0 | MVM_string_gi_move_to(tc, &(gic->gi), index - gic->last_location - 1); |
370 | 0 | } |
371 | 0 | /* If we have to backtrack we need to reinitialize the grapheme iterator */ |
372 | 0 | else { |
373 | 0 | MVM_string_gi_cached_init(tc, gic, gic->string, index); |
374 | 0 | return gic->last_g; |
375 | 0 | } |
376 | 0 | gic->last_location = index; |
377 | 0 | return (gic->last_g = MVM_string_gi_get_grapheme(tc, &(gic->gi))); |
378 | 0 | } |