Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/iter.h
Line
Count
Source (jump to first uncovered line)
1
/* Grapheme iterator structure; iterates through graphemes in a string. */
2
struct MVMGraphemeIter {
3
    /* The blob we're currently iterating over. */
4
    union {
5
        MVMGrapheme32    *blob_32;
6
        MVMGraphemeASCII *blob_ascii;
7
        MVMGrapheme8     *blob_8;
8
        void             *any;
9
    } active_blob;
10
11
    /* The type of blob we have. */
12
    MVMuint16 blob_type;
13
14
    /* The number of strands remaining, if any. */
15
    MVMuint16 strands_remaining;
16
17
    /* The current position, and the end position. */
18
    MVMStringIndex pos;
19
    MVMStringIndex end;
20
21
    /* Repetition count, and the start index in the blob (only needed if we're
22
     * doing an iteration over a repetition). */
23
    MVMStringIndex start;
24
    MVMuint32      repetitions;
25
26
    /* The next strand, if we're doing a strand-based iteration. */
27
    MVMStringStrand *next_strand;
28
};
29
30
/* Initializes a grapheme iterator. */
31
0
MVM_STATIC_INLINE void MVM_string_gi_init(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *s) {
32
0
    if (s->body.storage_type == MVM_STRING_STRAND) {
33
0
        MVMStringStrand *strands = s->body.storage.strands;
34
0
        MVMString       *first   = strands[0].blob_string;
35
0
        gi->active_blob.any      = first->body.storage.any;
36
0
        gi->blob_type            = first->body.storage_type;
37
0
        gi->strands_remaining    = s->body.num_strands - 1;
38
0
        gi->pos = gi->start      = strands[0].start;
39
0
        gi->end                  = strands[0].end;
40
0
        gi->repetitions          = strands[0].repetitions;
41
0
        gi->next_strand          = strands + 1;
42
0
    }
43
0
    else {
44
0
        gi->active_blob.any   = s->body.storage.any;
45
0
        gi->blob_type         = s->body.storage_type;
46
0
        gi->end               = s->body.num_graphs;
47
0
        gi->strands_remaining = gi->start = gi->pos = gi->repetitions = 0;
48
0
    }
49
0
};
50
/* Gets the number of graphemes remaining in the current strand of the grapheme
51
 * iterator, including repetitions */
52
#define MVM_string_gi_graphs_left_in_strand_rep(gi) \
53
0
    (gi->end - gi->pos + gi->repetitions * (gi->end - gi->start))
54
/* graphs left in strand + graphs left in repetitions of current strand */
55
56
/* Moves to the next strand, or repetition if there is one. */
57
0
static void MVM_string_gi_next_strand_rep(MVMThreadContext *tc, MVMGraphemeIter *gi) {
58
0
    MVMStringStrand *next = NULL;
59
0
    if (gi->repetitions) {
60
0
        gi->pos = gi->start;
61
0
        gi->repetitions--;
62
0
        return;
63
0
    }
64
0
    if (gi->strands_remaining <= 0)
65
0
        MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator\n");
66
0
67
0
    next = (gi->next_strand)++;
68
0
    gi->pos = gi->start = next->start;
69
0
    gi->end             = next->end;
70
0
    gi->repetitions     = next->repetitions;
71
0
    gi->blob_type       = next->blob_string->body.storage_type;
72
0
    gi->active_blob.any = next->blob_string->body.storage.any;
73
0
    gi->strands_remaining--;
74
0
}
75
/* Sets the position of the iterator. (Can be optimized in many ways in the
76
 * repetitions and strands branches.) */
77
0
MVM_STATIC_INLINE void MVM_string_gi_move_to(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMuint32 pos) {
78
0
    MVMuint32 remaining = pos;
79
0
    MVMuint32 strand_graphs;
80
0
    MVMStringStrand *next = NULL;
81
0
82
0
    /* Find the appropriate strand. */
83
0
    /* Set strand_graphs to the number of graphemes */
84
0
    while (remaining > (strand_graphs = MVM_string_gi_graphs_left_in_strand_rep(gi))) {
85
0
        remaining -= strand_graphs;
86
0
        if (!(gi->strands_remaining--))
87
0
            MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator");
88
0
        next = (gi->next_strand)++;
89
0
        gi->pos = gi->start = next->start;
90
0
        gi->end             = next->end;
91
0
        gi->repetitions     = next->repetitions;
92
0
    }
93
0
    if (next) {
94
0
        gi->blob_type       = next->blob_string->body.storage_type;
95
0
        gi->active_blob.any = next->blob_string->body.storage.any;
96
0
    }
97
0
98
0
    /* Now look within the strand. */
99
0
    if (!remaining)
100
0
        return;
101
0
    /* Most common case where we move within the strand */
102
0
    if (gi->pos + remaining <= gi->end) {
103
0
        gi->pos += remaining;
104
0
        return;
105
0
    }
106
0
    /* If we are here we are encountering a repetition */
107
0
    if (gi->repetitions) {
108
0
        MVMuint32 rep_graphs = gi->end - gi->start;
109
0
        MVMuint32 remaining_reps;
110
0
        /* If we aren't at the end of the repetition, move to the end */
111
0
        if (gi->pos < gi->end) {
112
0
            remaining -= gi->end - gi->pos;
113
0
            gi->pos    = gi->end;
114
0
        }
115
0
        remaining_reps = remaining / rep_graphs;
116
0
        if (gi->repetitions < remaining_reps)
117
0
            MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator:"
118
0
                                          " no more repetitions remaining\n");
119
0
        gi->repetitions -= remaining_reps;
120
0
        /* Since we're still at the end, if there's repetitions left over
121
0
         * we are going to have to seek forward */
122
0
        if (remaining -= remaining_reps * rep_graphs) {
123
0
            gi->repetitions--; /* Move to the next repetition. */
124
0
            gi->pos = gi->start + remaining;
125
0
            /* remaining = 0 now for all purposes now, but since we return, no
126
0
             * need to set it */
127
0
        }
128
0
        return;
129
0
    }
130
0
    MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator");
131
0
}
132
133
/* Checks if there is more to read from a grapheme iterator. */
134
0
MVM_STATIC_INLINE MVMint32 MVM_string_gi_has_more(MVMThreadContext *tc, MVMGraphemeIter *gi) {
135
0
    return gi->pos < gi->end || gi->repetitions || gi->strands_remaining;
136
0
}
137
/* Returns number of graphs left in the strand, ignoring repetitions */
138
0
MVM_STATIC_INLINE MVMStringIndex MVM_string_gi_graphs_left_in_strand(MVMThreadContext *tc, MVMGraphemeIter *gi) {
139
0
    return gi->end - gi->pos;
140
0
}
141
0
MVM_STATIC_INLINE MVMGrapheme8 * MVM_string_gi_active_blob_8_pos(MVMThreadContext *tc, MVMGraphemeIter *gi) {
142
0
    return gi->active_blob.blob_8 + gi->pos;
143
0
}
144
0
MVM_STATIC_INLINE MVMGrapheme32 * MVM_string_gi_active_blob_32_pos(MVMThreadContext *tc, MVMGraphemeIter *gi) {
145
0
    return gi->active_blob.blob_32 + gi->pos;
146
0
}
147
0
MVM_STATIC_INLINE MVMuint16 MVM_string_gi_blob_type(MVMThreadContext *tc, MVMGraphemeIter *gi) {
148
0
    return gi->blob_type;
149
0
}
150
/* Returns if there are more strands left in the gi, including repetitions */
151
0
MVM_STATIC_INLINE int MVM_string_gi_has_more_strands_rep(MVMThreadContext *tc, MVMGraphemeIter *gi) {
152
0
    return !!(gi->strands_remaining || gi->repetitions);
153
0
}
154
/* Gets the next grapheme. */
155
0
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter *gi) {
156
0
    while (1) {
157
0
        if (gi->pos < gi->end) {
158
0
            switch (gi->blob_type) {
159
0
                case MVM_STRING_GRAPHEME_32:
160
0
                    return gi->active_blob.blob_32[gi->pos++];
161
0
                case MVM_STRING_GRAPHEME_ASCII:
162
0
                    return gi->active_blob.blob_ascii[gi->pos++];
163
0
                case MVM_STRING_GRAPHEME_8:
164
0
                    return gi->active_blob.blob_8[gi->pos++];
165
0
                }
166
0
        }
167
0
        else if (gi->repetitions) {
168
0
            gi->pos = gi->start;
169
0
            gi->repetitions--;
170
0
        }
171
0
        else if (gi->strands_remaining) {
172
0
            MVMStringStrand *next = gi->next_strand;
173
0
            gi->active_blob.any = next->blob_string->body.storage.any;
174
0
            gi->blob_type       = next->blob_string->body.storage_type;
175
0
            gi->pos             = next->start;
176
0
            gi->end             = next->end;
177
0
            gi->start           = next->start;
178
0
            gi->repetitions     = next->repetitions;
179
0
            gi->strands_remaining--;
180
0
            gi->next_strand++;
181
0
        }
182
0
        else {
183
0
            MVM_exception_throw_adhoc(tc, "Iteration past end of grapheme iterator");
184
0
        }
185
0
    }
186
0
}
187
188
189
/* Returns the codepoint without doing checks, for internal VM use only. */
190
0
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
191
0
    switch (a->body.storage_type) {
192
0
        case MVM_STRING_GRAPHEME_32:
193
0
            return a->body.storage.blob_32[index];
194
0
        case MVM_STRING_GRAPHEME_ASCII:
195
0
            return a->body.storage.blob_ascii[index];
196
0
        case MVM_STRING_GRAPHEME_8:
197
0
            return a->body.storage.blob_8[index];
198
0
        case MVM_STRING_STRAND: {
199
0
            MVMGraphemeIter gi;
200
0
            MVM_string_gi_init(tc, &gi, a);
201
0
            MVM_string_gi_move_to(tc, &gi, index);
202
0
            return MVM_string_gi_get_grapheme(tc, &gi);
203
0
        }
204
0
        default:
205
0
            MVM_exception_throw_adhoc(tc, "String corruption detected: bad storage type");
206
0
    }
207
0
}
208
209
/* Code point iterator. Uses the grapheme iterator, and adds some extra bits
210
 * in order to iterate the code points in synthetics. */
211
struct MVMCodepointIter {
212
    /* The grapheme iterator. */
213
    MVMGraphemeIter gi;
214
215
    /* The codes of the current synthetic we're walking through, if any, with
216
     * the number of combiners we returned so far, and the total number of
217
     * combiners there are. */
218
    MVMCodepoint  *synth_codes;
219
    MVMint32       visited_synth_codes;
220
    MVMint32       total_synth_codes;
221
    /* first_code is only used for string_grapheme_ci functions */
222
    MVMCodepoint   first_code;
223
    /* If we should translate newline \n into \r\n. */
224
    MVMint32       translate_newlines;
225
    /* Used to pass through utf8-c8 synthetics, but not any others so we can
226
     * renomalize text without getting rid of utf8-c8 synthetics */
227
    MVMint32       pass_utfc8_synthetics;
228
229
};
230
231
/* Initializes a code point iterator. */
232
MVM_STATIC_INLINE void MVM_string_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMString *s,
233
0
        MVMint32 translate_newlines, MVMint32 pass_utfc8_synthetics) {
234
0
    /* Initialize our underlying grapheme iterator. */
235
0
    MVM_string_gi_init(tc, &(ci->gi), s);
236
0
237
0
    /* We've no currently active synthetic codepoint (and other fields are
238
0
     * unused until we do, so leave them alone for now). */
239
0
    ci->synth_codes           = NULL;
240
0
    ci->translate_newlines    = translate_newlines;
241
0
    ci->pass_utfc8_synthetics = pass_utfc8_synthetics;
242
0
};
243
/* Iterates on a grapheme. Returns the number of codepoints in the grapheme */
244
0
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_grapheme_ci_init(MVMThreadContext *tc, MVMCodepointIter *ci, MVMGrapheme32 g, MVMint32 pass_utfc8_synthetics) {
245
0
    MVMNFGSynthetic *synth = NULL;
246
0
    if (g < 0) {
247
0
        /* Get the synthetics info. */
248
0
        synth = MVM_nfg_get_synthetic_info(tc, g);
249
0
    }
250
0
    /* If we got a synth, but *not* if we are supposed to pass utf8-c8 synthetics
251
0
     * and it is a utf8-c8 synthetic */
252
0
    if (synth && !(pass_utfc8_synthetics && synth->is_utf8_c8)) {
253
0
        /* Set up the iterator so in the next iteration we will start to
254
0
        * hand back codepoints after the initial.
255
0
        * TODO: This may be able to be optimized
256
0
        * to remove first_code. */
257
0
        ci->synth_codes         =  synth->codes + 1;
258
0
        ci->visited_synth_codes = -1;
259
0
        ci->total_synth_codes   =  synth->num_codes - 1;
260
0
        ci->first_code          =  synth->codes[0];
261
0
    }
262
0
    else {
263
0
        ci->synth_codes         =  NULL;
264
0
        ci->visited_synth_codes = -1;
265
0
        ci->total_synth_codes   =  0;
266
0
        ci->first_code          =  g;
267
0
    }
268
0
    return ci->total_synth_codes + 1;
269
0
}
270
/* Only for string_grapheme_ci ops */
271
0
MVM_STATIC_INLINE MVMCodepoint MVM_string_grapheme_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) {
272
0
    MVMCodepoint result = ci->visited_synth_codes < 0
273
0
        ? ci->first_code
274
0
        : ci->synth_codes[ci->visited_synth_codes];
275
0
    ci->visited_synth_codes++;
276
0
    return result;
277
0
}
278
279
/* Checks if there is more to read from a code point iterator; this is the
280
 * case if we're still walking through a synthetic or we have more things
281
 * available from the underlying grapheme iterator. */
282
0
MVM_STATIC_INLINE MVMint32 MVM_string_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
283
0
    return ci->synth_codes || MVM_string_gi_has_more(tc, &(ci->gi));
284
0
}
285
/* Only for use with string_grapheme_ci ops */
286
0
MVM_STATIC_INLINE MVMint32 MVM_string_grapheme_ci_has_more(MVMThreadContext *tc, MVMCodepointIter *ci) {
287
0
    return ci->visited_synth_codes < ci->total_synth_codes;
288
0
}
289
290
/* Gets the next code point. */
291
0
MVM_STATIC_INLINE MVMCodepoint MVM_string_ci_get_codepoint(MVMThreadContext *tc, MVMCodepointIter *ci) {
292
0
    MVMCodepoint result;
293
0
294
0
    /* Do we have combiners from a synthetic to return? */
295
0
    if (ci->synth_codes) {
296
0
        /* Take the current combiner as the result. */
297
0
        result = ci->synth_codes[ci->visited_synth_codes];
298
0
299
0
        /* If we've seen all of the synthetics, clear up so we'll take another
300
0
         * grapheme next time around. */
301
0
        ci->visited_synth_codes++;
302
0
        if (ci->visited_synth_codes == ci->total_synth_codes)
303
0
            ci->synth_codes = NULL;
304
0
    }
305
0
306
0
    /* Otherwise, proceed to the next grapheme. */
307
0
    else {
308
0
        MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &(ci->gi));
309
0
#ifdef _WIN32
310
0
        if (ci->translate_newlines && g == '\n')
311
0
            g = MVM_nfg_crlf_grapheme(tc);
312
0
#endif
313
0
        if (g >= 0) {
314
0
            /* It's not a synthetic, so we're done. */
315
0
            result = (MVMCodepoint)g;
316
0
        }
317
0
        else {
318
0
            /* It's a synthetic. Look it up. */
319
0
            MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
320
0
            /* If we have pass_utfc8_synthetics set and it's a utf_c8 codepoint
321
0
             * pass it back unchanged */
322
0
            if (ci->pass_utfc8_synthetics && synth->is_utf8_c8) {
323
0
                result = g;
324
0
            }
325
0
            else {
326
0
                /* Set up the iterator so in the next iteration we will start to
327
0
                * hand back codepoints. */
328
0
                ci->synth_codes         = synth->codes + 1;
329
0
                ci->visited_synth_codes = 0;
330
0
                /* Emulate num_combs and subtract one from num_codes */
331
0
                ci->total_synth_codes   = synth->num_codes - 1;
332
0
333
0
                /* Result is the first codepoint of the `codes` array */
334
0
                result = synth->codes[0];
335
0
            }
336
0
        }
337
0
    }
338
0
339
0
    return result;
340
0
}
341
/* The MVMGraphemeIter_cached is used for the Knuth-Morris-Pratt algorithm
342
 * because often it will request the same grapheme again, and our grapheme
343
 * iterators only return the next grapheme */
344
struct MVMGraphemeIter_cached {
345
    MVMGraphemeIter gi;
346
    MVMGrapheme32   last_g;
347
    MVMStringIndex  last_location;
348
    MVMString      *string;
349
};
350
typedef struct MVMGraphemeIter_cached MVMGraphemeIter_cached;
351
0
MVM_STATIC_INLINE void MVM_string_gi_cached_init (MVMThreadContext *tc, MVMGraphemeIter_cached *gic, MVMString *s, MVMint64 index) {
352
0
    MVM_string_gi_init(tc, &(gic->gi), s);
353
0
    if (index) MVM_string_gi_move_to(tc, &(gic->gi), index);
354
0
    gic->last_location = index;
355
0
    gic->last_g = MVM_string_gi_get_grapheme(tc, &(gic->gi));
356
0
    gic->string = s;
357
0
}
358
0
MVM_STATIC_INLINE MVMGrapheme32 MVM_string_gi_cached_get_grapheme(MVMThreadContext *tc, MVMGraphemeIter_cached *gic, MVMint64 index) {
359
0
    /* Most likely case is we are getting the next grapheme. When that happens
360
0
     * we will go directly to the end. */
361
0
    if (index == gic->last_location + 1) {
362
0
    }
363
0
    /* Second most likely is getting the cached grapheme */
364
0
    else if (index == gic->last_location) {
365
0
        return gic->last_g;
366
0
    }
367
0
    /* If we have to move forward */
368
0
    else if (gic->last_location < index) {
369
0
        MVM_string_gi_move_to(tc, &(gic->gi), index - gic->last_location - 1);
370
0
    }
371
0
    /* If we have to backtrack we need to reinitialize the grapheme iterator */
372
0
    else {
373
0
        MVM_string_gi_cached_init(tc, gic, gic->string, index);
374
0
        return gic->last_g;
375
0
    }
376
0
    gic->last_location = index;
377
0
    return (gic->last_g = MVM_string_gi_get_grapheme(tc, &(gic->gi)));
378
0
}