/home/travis/build/MoarVM/MoarVM/src/strings/iter.h
Line | Count | Source (jump to first uncovered line) |
1 | | /* Grapheme iterator structure; iterates through graphemes in a string. */ |
2 | | struct MVMGraphemeIter { |
3 | | /* The blob we're currently iterating over. */ |
4 | | union { |
5 | | MVMGrapheme32 *blob_32; |
6 | | MVMGraphemeASCII *blob_ascii; |
7 | | MVMGrapheme8 *blob_8; |
8 | | void *any; |
9 | | } active_blob; |
10 | | |
11 | | /* The type of blob we have. */ |
12 | | MVMuint16 blob_type; |
13 | | |
14 | | /* The number of strands remaining, if any. */ |
15 | | MVMuint16 strands_remaining; |
16 | | |
17 | | /* The current position, and the end position. */ |
18 | | MVMStringIndex pos; |
19 | | MVMStringIndex end; |
20 | | |
21 | | /* Repetition count, and the start index in the blob (only needed if we're |
22 | | * doing an iteration over a repetition). */ |
23 | | MVMStringIndex start; |
24 | | MVMuint32 repetitions; |
25 | | |
26 | | /* The next strand, if we're doing a strand-based iteration. */ |
27 | | MVMStringStrand *next_strand; |
28 | | }; |
29 | | |
30 | | /* Initializes a grapheme iterator. */ |
31 | 0 | MVM_STATIC_INLINE void MVM_string_gi_init(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *s) { |
32 | 0 | if (s->body.storage_type == MVM_STRING_STRAND) { |
33 | 0 | MVMStringStrand *strands = s->body.storage.strands; |
34 | 0 | MVMString *first = strands[0].blob_string; |
35 | 0 | gi->active_blob.any = first->body.storage.any; |
36 | 0 | gi->blob_type = first->body.storage_type; |
37 | 0 | gi->strands_remaining = s->body.num_strands - 1; |
38 | 0 | gi->pos = strands[0].start; |
39 | 0 | gi->end = strands[0].end; |
40 | 0 | gi->start = strands[0].start; |
41 | 0 | gi->repetitions = strands[0].repetitions; |
42 | 0 | gi->next_strand = strands + 1; |
43 | 0 | } |
44 | 0 | else { |
45 | 0 | gi->active_blob.any = s->body.storage.any; |
46 | 0 | gi->blob_type = s->body.storage_type; |
47 | 0 | gi->strands_remaining = 0; |
48 | 0 | gi->pos = 0; |
49 | 0 | gi->end = s->body.num_graphs; |
50 | 0 | gi->repetitions = 0; |
51 | 0 | } |
52 | 0 | }; |
53 | | |
54 | | /* Sets the position of the iterator. (Can be optimized in many ways in the |
55 | | * repetitions and strands branches.) */ |
56 | 0 | MVM_STATIC_INLINE void MVM_string_gi_move_to(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMuint32 pos) { |
57 | 0 | MVMuint32 remaining = pos; |
58 | 0 | MVMuint32 strand_graphs; |
59 | 0 |
|
60 | 0 | /* Find the appropriate strand. */ |
61 | 0 | while (remaining > (strand_graphs = (gi->end - gi->pos) * (gi->repetitions + 1))) { |
62 | 0 | MVMStringStrand *next = gi->next_strand; |
63 | 0 | if (!gi->strands_remaining) |
64 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
65 | 0 | gi->active_blob.any = next->blob_string->body.storage.any; |
66 | 0 | gi->blob_type = next->blob_string->body.storage_type; |
67 | 0 | gi->pos = next->start; |
68 | 0 | gi->end = next->end; |
69 | 0 | gi->start = next->start; |
70 | 0 | gi->repetitions = next->repetitions; |
71 | 0 | gi->strands_remaining--; |
72 | 0 | gi->next_strand++; |
73 | 0 | remaining -= strand_graphs; |
74 | 0 | } |
75 | 0 |
|
76 | 0 | /* Now look within the strand. */ |
77 | 0 | while (1) { |
78 | 0 | if (remaining == 0) { |
79 | 0 | return; |
80 | 0 | } |
81 | 0 | if (gi->pos < gi->end) { |
82 | 0 | if (gi->pos + remaining <= gi->end) { |
83 | 0 | gi->pos += remaining; |
84 | 0 | return; |
85 | 0 | } |
86 | 0 | remaining -= gi->end - gi->pos; |
87 | 0 | gi->pos = gi->end; |
88 | 0 | } |
89 | 0 | else if (gi->repetitions) { |
90 | 0 | MVMuint32 rep_graphs = gi->end - gi->start; |
91 | 0 | MVMuint32 remaining_reps = remaining / rep_graphs; |
92 | 0 | if (remaining_reps > gi->repetitions) |
93 | 0 | remaining_reps = gi->repetitions; |
94 | 0 | gi->repetitions -= remaining_reps; |
95 | 0 | remaining -= remaining_reps * rep_graphs; |
96 | 0 | if (gi->repetitions) { |
97 | 0 | gi->pos = gi->start; |
98 | 0 | gi->repetitions--; /* Next read will be reading *this* repetition. */ |
99 | 0 | } |
100 | 0 | } |
101 | 0 | else { |
102 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
103 | 0 | } |
104 | 0 | } |
105 | 0 | } |
106 | | |
107 | | /* Checks if there is more to read from a grapheme iterator. */ |
108 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_gi_has_more(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
109 | 0 | return gi->pos < gi->end || gi->repetitions || gi->strands_remaining; |
110 | 0 | } |
111 | | |
112 | | /* Gets the next grapheme. */ |
113 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter *gi) { |
114 | 0 | while (1) { |
115 | 0 | if (gi->pos < gi->end) { |
116 | 0 | switch (gi->blob_type) { |
117 | 0 | case MVM_STRING_GRAPHEME_32: |
118 | 0 | return gi->active_blob.blob_32[gi->pos++]; |
119 | 0 | case MVM_STRING_GRAPHEME_ASCII: |
120 | 0 | return gi->active_blob.blob_ascii[gi->pos++]; |
121 | 0 | case MVM_STRING_GRAPHEME_8: |
122 | 0 | return gi->active_blob.blob_8[gi->pos++]; |
123 | 0 | } |
124 | 0 | } |
125 | 0 | else if (gi->repetitions) { |
126 | 0 | gi->pos = gi->start; |
127 | 0 | gi->repetitions--; |
128 | 0 | } |
129 | 0 | else if (gi->strands_remaining) { |
130 | 0 | MVMStringStrand *next = gi->next_strand; |
131 | 0 | gi->active_blob.any = next->blob_string->body.storage.any; |
132 | 0 | gi->blob_type = next->blob_string->body.storage_type; |
133 | 0 | gi->pos = next->start; |
134 | 0 | gi->end = next->end; |
135 | 0 | gi->start = next->start; |
136 | 0 | gi->repetitions = next->repetitions; |
137 | 0 | gi->strands_remaining--; |
138 | 0 | gi->next_strand++; |
139 | 0 | } |
140 | 0 | else { |
141 | 0 | MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator"); |
142 | 0 | } |
143 | 0 | } |
144 | 0 | } |
145 | | |
146 | | /* Code point iterator. Uses the grapheme iterator, and adds some extra bits |
147 | | * in order to iterate the code points in synthetics. */ |
148 | | struct MVMCodepointIter { |
149 | | /* The grapheme iterator. */ |
150 | | MVMGraphemeIter gi; |
151 | | |
152 | | /* The codes of the current synthetic we're walking through, if any, with |
153 | | * the number of combiners we returned so far, and the total number of |
154 | | * combiners there are. */ |
155 | | MVMCodepoint *synth_codes; |
156 | | MVMint32 visited_synth_codes; |
157 | | MVMint32 total_synth_codes; |
158 | | |
159 | | /* If we should translate newline \n into \r\n. */ |
160 | | MVMint32 translate_newlines; |
161 | | }; |
162 | | |
163 | | /* Initializes a code point iterator. */ |
164 | | MVM_STATIC_INLINE void MVM_string_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMString *s, |
165 | 0 | MVMint32 translate_newlines) { |
166 | 0 | /* Initialize our underlying grapheme iterator. */ |
167 | 0 | MVM_string_gi_init(tc, &(ci->gi), s); |
168 | 0 |
|
169 | 0 | /* We've no currently active synthetic codepoint (and other fields are |
170 | 0 | * unused until we do, so leave them alone for now). */ |
171 | 0 | ci->synth_codes = NULL; |
172 | 0 | ci->translate_newlines = translate_newlines; |
173 | 0 | }; |
174 | | |
175 | | /* Checks if there is more to read from a code point iterator; this is the |
176 | | * case if we're still walking through a synthetic or we have more things |
177 | | * available from the underlying grapheme iterator. */ |
178 | 0 | MVM_STATIC_INLINE MVMint32 MVM_string_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) { |
179 | 0 | return ci->synth_codes || MVM_string_gi_has_more(tc, &(ci->gi)); |
180 | 0 | } |
181 | | |
182 | | /* Gets the next code point. */ |
183 | 0 | MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) { |
184 | 0 | MVMCodepoint result; |
185 | 0 |
|
186 | 0 | /* Do we have combiners from a synthetic to return? */ |
187 | 0 | if (ci->synth_codes) { |
188 | 0 | /* Take the current combiner as the result. */ |
189 | 0 | result = ci->synth_codes[ci->visited_synth_codes]; |
190 | 0 |
|
191 | 0 | /* If we've seen all of the synthetics, clear up so we'll take another |
192 | 0 | * grapheme next time around. */ |
193 | 0 | ci->visited_synth_codes++; |
194 | 0 | if (ci->visited_synth_codes == ci->total_synth_codes) |
195 | 0 | ci->synth_codes = NULL; |
196 | 0 | } |
197 | 0 |
|
198 | 0 | /* Otherwise, proceed to the next grapheme. */ |
199 | 0 | else { |
200 | 0 | MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &(ci->gi)); |
201 | 0 | if (ci->translate_newlines && g == '\n') |
202 | 0 | g = MVM_nfg_crlf_grapheme(tc); |
203 | 0 | if (g >= 0) { |
204 | 0 | /* It's not a synthetic, so we're done. */ |
205 | 0 | result = (MVMCodepoint)g; |
206 | 0 | } |
207 | 0 | else { |
208 | 0 | /* It's a synthetic. Look it up. */ |
209 | 0 | MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g); |
210 | 0 |
|
211 | 0 | /* Set up the iterator so in the next iteration we will start to |
212 | 0 | * hand back combiners. */ |
213 | 0 | ci->synth_codes = synth->combs; |
214 | 0 | ci->visited_synth_codes = 0; |
215 | 0 | ci->total_synth_codes = synth->num_combs; |
216 | 0 |
|
217 | 0 | /* Result is the base character of the grapheme. */ |
218 | 0 | result = synth->base; |
219 | 0 | } |
220 | 0 | } |
221 | 0 |
|
222 | 0 | return result; |
223 | 0 | } |