/home/travis/build/MoarVM/MoarVM/src/6model/reprs/NFA.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* This representation's function pointer table. */ |
4 | | static const MVMREPROps NFA_this_repr; |
5 | | |
6 | | /* Creates a new type object of this representation, and associates it with |
7 | | * the given HOW. */ |
8 | 2 | static MVMObject * type_object_for(MVMThreadContext *tc, MVMObject *HOW) { |
9 | 2 | MVMSTable *st = MVM_gc_allocate_stable(tc, &NFA_this_repr, HOW); |
10 | 2 | |
11 | 2 | MVMROOT(tc, st, { |
12 | 2 | MVMObject *obj = MVM_gc_allocate_type_object(tc, st); |
13 | 2 | MVM_ASSIGN_REF(tc, &(st->header), st->WHAT, obj); |
14 | 2 | st->size = sizeof(MVMNFA); |
15 | 2 | }); |
16 | 2 | |
17 | 2 | return st->WHAT; |
18 | 2 | } |
19 | | |
20 | | /* Copies the body of one object to another. */ |
21 | 0 | static void copy_to(MVMThreadContext *tc, MVMSTable *st, void *src, MVMObject *dest_root, void *dest) { |
22 | 0 | MVM_exception_throw_adhoc(tc, "Cannot copy object with representation NFA"); |
23 | 0 | } |
24 | | |
25 | | /* Called by the VM to mark any GCable items. */ |
26 | 3.02k | static void gc_mark(MVMThreadContext *tc, MVMSTable *st, void *data, MVMGCWorklist *worklist) { |
27 | 3.02k | MVMNFABody *lb = (MVMNFABody *)data; |
28 | 3.02k | MVMint64 i, j; |
29 | 3.02k | |
30 | 3.02k | MVM_gc_worklist_add(tc, worklist, &lb->fates); |
31 | 3.02k | |
32 | 97.3k | for (i = 0; i < lb->num_states; i++) { |
33 | 94.3k | MVMint64 edges = lb->num_state_edges[i]; |
34 | 253k | for (j = 0; j < edges; j++) { |
35 | 159k | switch (lb->states[i][j].act) { |
36 | 18.2k | case MVM_NFA_EDGE_CHARLIST: |
37 | 18.2k | case MVM_NFA_EDGE_CHARLIST_NEG: |
38 | 18.2k | MVM_gc_worklist_add(tc, worklist, &lb->states[i][j].arg.s); |
39 | 159k | } |
40 | 159k | } |
41 | 94.3k | } |
42 | 3.02k | } |
43 | | |
44 | | /* Called by the VM in order to free memory associated with this object. */ |
45 | 0 | static void gc_free(MVMThreadContext *tc, MVMObject *obj) { |
46 | 0 | MVMNFA *nfa = (MVMNFA *)obj; |
47 | 0 | MVMint64 i; |
48 | 0 | for (i = 0; i < nfa->body.num_states; i++) |
49 | 0 | if (nfa->body.num_state_edges[i]) |
50 | 0 | MVM_fixed_size_free(tc, tc->instance->fsa, nfa->body.num_state_edges[i] * sizeof(MVMNFAStateInfo), nfa->body.states[i]); |
51 | 0 | MVM_fixed_size_free(tc, tc->instance->fsa, nfa->body.num_states * sizeof(MVMNFAStateInfo *), nfa->body.states); |
52 | 0 | MVM_fixed_size_free(tc, tc->instance->fsa, nfa->body.num_states * sizeof(MVMint64), nfa->body.num_state_edges); |
53 | 0 | } |
54 | | |
55 | | |
56 | | static const MVMStorageSpec storage_spec = { |
57 | | MVM_STORAGE_SPEC_REFERENCE, /* inlineable */ |
58 | | 0, /* bits */ |
59 | | 0, /* align */ |
60 | | MVM_STORAGE_SPEC_BP_NONE, /* boxed_primitive */ |
61 | | 0, /* can_box */ |
62 | | 0, /* is_unsigned */ |
63 | | }; |
64 | | |
65 | | |
66 | | /* Gets the storage specification for this representation. */ |
67 | 0 | static const MVMStorageSpec * get_storage_spec(MVMThreadContext *tc, MVMSTable *st) { |
68 | 0 | return &storage_spec; |
69 | 0 | } |
70 | | |
71 | | /* Serializes the data. */ |
72 | 1 | static void serialize(MVMThreadContext *tc, MVMSTable *st, void *data, MVMSerializationWriter *writer) { |
73 | 1 | MVMNFABody *body = (MVMNFABody *)data; |
74 | 1 | MVMint64 i, j; |
75 | 1 | |
76 | 1 | /* Write fates. */ |
77 | 1 | MVM_serialization_write_ref(tc, writer, body->fates); |
78 | 1 | |
79 | 1 | /* Write number of states. */ |
80 | 1 | MVM_serialization_write_int(tc, writer, body->num_states); |
81 | 1 | |
82 | 1 | /* Write state edge list counts, skipping synthetic start node. */ |
83 | 5 | for (i = 0; i < body->num_states; i++) { |
84 | 4 | MVMint64 sig_edges = body->num_state_edges[i]; |
85 | 4 | if (sig_edges && body->states[i][0].act == MVM_NFA_EDGE_SYNTH_CP_COUNT) |
86 | 0 | sig_edges--; |
87 | 4 | MVM_serialization_write_int(tc, writer, sig_edges); |
88 | 4 | } |
89 | 1 | |
90 | 1 | /* Write state graph. */ |
91 | 5 | for (i = 0; i < body->num_states; i++) { |
92 | 8 | for (j = 0; j < body->num_state_edges[i]; j++) { |
93 | 4 | MVMint64 act = body->states[i][j].act; |
94 | 4 | if (act == MVM_NFA_EDGE_SYNTH_CP_COUNT) |
95 | 0 | continue; |
96 | 4 | MVM_serialization_write_int(tc, writer, act); |
97 | 4 | MVM_serialization_write_int(tc, writer, body->states[i][j].to); |
98 | 4 | switch (act & 0xff) { |
99 | 1 | case MVM_NFA_EDGE_FATE: |
100 | 1 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.i); |
101 | 1 | break; |
102 | 3 | case MVM_NFA_EDGE_CODEPOINT: |
103 | 3 | case MVM_NFA_EDGE_CODEPOINT_LL: |
104 | 3 | case MVM_NFA_EDGE_CODEPOINT_NEG: |
105 | 3 | case MVM_NFA_EDGE_CODEPOINT_M: |
106 | 3 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
107 | 3 | MVMGrapheme32 g = body->states[i][j].arg.g; |
108 | 3 | if (g >= 0) { |
109 | 3 | /* Non-synthetic. */ |
110 | 3 | MVM_serialization_write_int(tc, writer, g); |
111 | 3 | } |
112 | 0 | else { |
113 | 0 | /* Synthetic. Write the number of codepoints negated, |
114 | 0 | * and then each of the codepoints. */ |
115 | 0 | MVMNFGSynthetic *si = MVM_nfg_get_synthetic_info(tc, g); |
116 | 0 | MVMint32 k; |
117 | 0 | MVM_serialization_write_int(tc, writer, -(si->num_codes)); |
118 | 0 | for (k = 0; k < si->num_codes; k++) |
119 | 0 | MVM_serialization_write_int(tc, writer, si->codes[k]); |
120 | 0 | } |
121 | 3 | break; |
122 | 3 | } |
123 | 0 | case MVM_NFA_EDGE_CHARCLASS: |
124 | 0 | case MVM_NFA_EDGE_CHARCLASS_NEG: |
125 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.i); |
126 | 0 | break; |
127 | 0 | case MVM_NFA_EDGE_CHARLIST: |
128 | 0 | case MVM_NFA_EDGE_CHARLIST_NEG: |
129 | 0 | MVM_serialization_write_str(tc, writer, body->states[i][j].arg.s); |
130 | 0 | break; |
131 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: |
132 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
133 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
134 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
135 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
136 | 0 | case MVM_NFA_EDGE_CHARRANGE: |
137 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: |
138 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: |
139 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
140 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.uclc.lc); |
141 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.uclc.uc); |
142 | 0 | break; |
143 | 0 | } |
144 | 4 | } |
145 | 4 | } |
146 | 4 | } |
147 | 1 | } |
148 | | |
149 | | /* Go through each state and see if there are at least four edges involving |
150 | | * a simple codepoint match. If so, sort them to the start and stick in a |
151 | | * synthetic edge which indicates the number of codepoint edges ahead. This |
152 | | * means that our matching can binary search with the codepoint it has, and |
153 | | * skip over any inappropriate edges. */ |
154 | 122k | static int classify_edge(MVMNFAStateInfo *e) { |
155 | 122k | switch (e->act) { |
156 | 5.83k | case MVM_NFA_EDGE_SYNTH_CP_COUNT: |
157 | 5.83k | return 0; |
158 | 91.4k | case MVM_NFA_EDGE_CODEPOINT: |
159 | 91.4k | case MVM_NFA_EDGE_CODEPOINT_LL: |
160 | 91.4k | return 1; |
161 | 24.9k | default: |
162 | 24.9k | return 2; |
163 | 122k | } |
164 | 122k | } |
165 | 61.0k | static int opt_edge_comp(const void *av, const void *bv) { |
166 | 61.0k | MVMNFAStateInfo *a = (MVMNFAStateInfo *)av; |
167 | 61.0k | MVMNFAStateInfo *b = (MVMNFAStateInfo *)bv; |
168 | 61.0k | MVMint32 type_a = classify_edge(a); |
169 | 61.0k | MVMint32 type_b = classify_edge(b); |
170 | 61.0k | if (type_a < type_b) |
171 | 8.15k | return -1; |
172 | 52.9k | if (type_a > type_b) |
173 | 7.40k | return 1; |
174 | 45.5k | if (type_a == 1) { |
175 | 13.6k | return a->arg.g < b->arg.g ? -1 : |
176 | 24.7k | a->arg.g > b->arg.g ? 1 : |
177 | 9.13k | 0; |
178 | 38.3k | } |
179 | 7.12k | else { |
180 | 7.12k | return 0; |
181 | 7.12k | } |
182 | 45.5k | } |
183 | 5.20k | static void sort_states_and_add_synth_cp_node(MVMThreadContext *tc, MVMNFABody *body) { |
184 | 5.20k | MVMint64 s; |
185 | 181k | for (s = 0; s < body->num_states; s++) { |
186 | 176k | /* See if there's enough interesting edges to do the opt. */ |
187 | 176k | MVMint32 applicable_edges = 0; |
188 | 176k | MVMint64 num_orig_edges = body->num_state_edges[s]; |
189 | 176k | if (num_orig_edges >= 4) { |
190 | 8.39k | MVMint64 e; |
191 | 61.0k | for (e = 0; e < num_orig_edges; e++) { |
192 | 52.6k | MVMint64 act = body->states[s][e].act; |
193 | 52.6k | if (act == MVM_NFA_EDGE_CODEPOINT || act == MVM_NFA_EDGE_CODEPOINT_LL) |
194 | 21.2k | applicable_edges++; |
195 | 52.6k | } |
196 | 8.39k | } |
197 | 176k | |
198 | 176k | /* If enough edges, insert synthetic and so the sort. */ |
199 | 176k | if (applicable_edges >= 4) { |
200 | 1.82k | MVMint64 num_new_edges = num_orig_edges + 1; |
201 | 1.82k | MVMNFAStateInfo *new_edges = MVM_fixed_size_alloc(tc, tc->instance->fsa, |
202 | 1.82k | num_new_edges * sizeof(MVMNFAStateInfo)); |
203 | 1.82k | new_edges[0].act = MVM_NFA_EDGE_SYNTH_CP_COUNT; |
204 | 1.82k | new_edges[0].arg.i = applicable_edges; |
205 | 1.82k | memcpy(new_edges + 1, body->states[s], num_orig_edges * sizeof(MVMNFAStateInfo)); |
206 | 1.82k | qsort(new_edges, num_new_edges, sizeof(MVMNFAStateInfo), opt_edge_comp); |
207 | 1.82k | MVM_fixed_size_free(tc, tc->instance->fsa, num_orig_edges * sizeof(MVMNFAStateInfo), |
208 | 1.82k | body->states[s]); |
209 | 1.82k | body->states[s] = new_edges; |
210 | 1.82k | body->num_state_edges[s] = num_new_edges; |
211 | 1.82k | } |
212 | 176k | } |
213 | 5.20k | } |
214 | | |
215 | | /* Deserializes the data. */ |
216 | 1 | static void deserialize(MVMThreadContext *tc, MVMSTable *st, MVMObject *root, void *data, MVMSerializationReader *reader) { |
217 | 1 | MVMNFABody *body = (MVMNFABody *)data; |
218 | 1 | MVMint64 i, j; |
219 | 1 | |
220 | 1 | /* Read fates. */ |
221 | 1 | body->fates = MVM_serialization_read_ref(tc, reader); |
222 | 1 | |
223 | 1 | /* Read number of states. */ |
224 | 1 | body->num_states = MVM_serialization_read_int(tc, reader); |
225 | 1 | |
226 | 1 | if (body->num_states > 0) { |
227 | 1 | /* Read state edge list counts. */ |
228 | 1 | body->num_state_edges = MVM_fixed_size_alloc(tc, tc->instance->fsa, body->num_states * sizeof(MVMint64)); |
229 | 5 | for (i = 0; i < body->num_states; i++) |
230 | 4 | body->num_state_edges[i] = MVM_serialization_read_int(tc, reader); |
231 | 1 | |
232 | 1 | /* Read state graph. */ |
233 | 1 | body->states = MVM_fixed_size_alloc(tc, tc->instance->fsa, body->num_states * sizeof(MVMNFAStateInfo *)); |
234 | 5 | for (i = 0; i < body->num_states; i++) { |
235 | 4 | MVMint64 edges = body->num_state_edges[i]; |
236 | 4 | if (edges > 0) { |
237 | 4 | body->states[i] = MVM_fixed_size_alloc(tc, tc->instance->fsa, edges * sizeof(MVMNFAStateInfo)); |
238 | 4 | } |
239 | 8 | for (j = 0; j < edges; j++) { |
240 | 4 | body->states[i][j].act = MVM_serialization_read_int(tc, reader); |
241 | 4 | body->states[i][j].to = MVM_serialization_read_int(tc, reader); |
242 | 4 | switch (body->states[i][j].act & 0xff) { |
243 | 1 | case MVM_NFA_EDGE_FATE: |
244 | 1 | body->states[i][j].arg.i = MVM_serialization_read_int(tc, reader); |
245 | 1 | break; |
246 | 3 | case MVM_NFA_EDGE_CODEPOINT: |
247 | 3 | case MVM_NFA_EDGE_CODEPOINT_LL: |
248 | 3 | case MVM_NFA_EDGE_CODEPOINT_NEG: |
249 | 3 | case MVM_NFA_EDGE_CODEPOINT_M: |
250 | 3 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
251 | 3 | MVMint64 cp_or_synth_count = MVM_serialization_read_int(tc, reader); |
252 | 3 | if (cp_or_synth_count >= 0) { |
253 | 3 | body->states[i][j].arg.g = (MVMGrapheme32)cp_or_synth_count; |
254 | 3 | } |
255 | 0 | else { |
256 | 0 | MVMint32 num_codes = -cp_or_synth_count; |
257 | 0 | MVMCodepoint *codes = MVM_fixed_size_alloc(tc, tc->instance->fsa, num_codes * sizeof(MVMCodepoint)); |
258 | 0 | MVMint32 k; |
259 | 0 | for (k = 0; k < num_codes; k++) |
260 | 0 | codes[k] = (MVMCodepoint)MVM_serialization_read_int(tc, reader); |
261 | 0 | body->states[i][j].arg.g = MVM_nfg_codes_to_grapheme(tc, codes, num_codes); |
262 | 0 | MVM_fixed_size_free(tc, tc->instance->fsa, num_codes * sizeof(MVMCodepoint), codes); |
263 | 0 | } |
264 | 3 | break; |
265 | 3 | } |
266 | 0 | case MVM_NFA_EDGE_CHARCLASS: |
267 | 0 | case MVM_NFA_EDGE_CHARCLASS_NEG: |
268 | 0 | body->states[i][j].arg.i = MVM_serialization_read_int(tc, reader); |
269 | 0 | break; |
270 | 0 | case MVM_NFA_EDGE_CHARLIST: |
271 | 0 | case MVM_NFA_EDGE_CHARLIST_NEG: |
272 | 0 | MVM_ASSIGN_REF(tc, &(root->header), body->states[i][j].arg.s, MVM_serialization_read_str(tc, reader)); |
273 | 0 | break; |
274 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: |
275 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
276 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
277 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
278 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
279 | 0 | case MVM_NFA_EDGE_CHARRANGE: |
280 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: |
281 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: |
282 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
283 | 0 | body->states[i][j].arg.uclc.lc = MVM_serialization_read_int(tc, reader); |
284 | 0 | body->states[i][j].arg.uclc.uc = MVM_serialization_read_int(tc, reader); |
285 | 0 | break; |
286 | 0 | } |
287 | 4 | } |
288 | 4 | } |
289 | 4 | } |
290 | 1 | } |
291 | 1 | |
292 | 1 | sort_states_and_add_synth_cp_node(tc, body); |
293 | 1 | } |
294 | | |
295 | | /* Compose the representation. */ |
296 | 2 | static void compose(MVMThreadContext *tc, MVMSTable *st, MVMObject *info) { |
297 | 2 | /* Nothing to do for this REPR. */ |
298 | 2 | } |
299 | | |
300 | | /* Set the size of the STable. */ |
301 | 144 | static void deserialize_stable_size(MVMThreadContext *tc, MVMSTable *st, MVMSerializationReader *reader) { |
302 | 144 | st->size = sizeof(MVMNFA); |
303 | 144 | } |
304 | | |
305 | | /* Calculates the non-GC-managed memory we hold on to. */ |
306 | 1.05k | static MVMuint64 unmanaged_size(MVMThreadContext *tc, MVMSTable *st, void *data) { |
307 | 1.05k | MVMNFABody *body = (MVMNFABody *)data; |
308 | 1.05k | MVMuint64 total; |
309 | 1.05k | MVMint64 i; |
310 | 1.05k | |
311 | 1.05k | total = body->num_states * sizeof(MVMint64); /* for num_state_edges */ |
312 | 1.05k | total += body->num_states * sizeof(MVMNFAStateInfo *); /* for states level 1 */ |
313 | 33.6k | for (i = 0; i < body->num_states; i++) |
314 | 32.6k | total += body->num_state_edges[i] * sizeof(MVMNFAStateInfo); |
315 | 1.05k | |
316 | 1.05k | return total; |
317 | 1.05k | } |
318 | | |
319 | | /* Initializes the representation. */ |
320 | 144 | const MVMREPROps * MVMNFA_initialize(MVMThreadContext *tc) { |
321 | 144 | return &NFA_this_repr; |
322 | 144 | } |
323 | | |
324 | | static const MVMREPROps NFA_this_repr = { |
325 | | type_object_for, |
326 | | MVM_gc_allocate_object, |
327 | | NULL, /* initialize */ |
328 | | copy_to, |
329 | | MVM_REPR_DEFAULT_ATTR_FUNCS, |
330 | | MVM_REPR_DEFAULT_BOX_FUNCS, |
331 | | MVM_REPR_DEFAULT_POS_FUNCS, |
332 | | MVM_REPR_DEFAULT_ASS_FUNCS, |
333 | | MVM_REPR_DEFAULT_ELEMS, |
334 | | get_storage_spec, |
335 | | NULL, /* change_type */ |
336 | | serialize, |
337 | | deserialize, |
338 | | NULL, /* serialize_repr_data */ |
339 | | NULL, /* deserialize_repr_data */ |
340 | | deserialize_stable_size, |
341 | | gc_mark, |
342 | | gc_free, |
343 | | NULL, /* gc_cleanup */ |
344 | | NULL, /* gc_mark_repr_data */ |
345 | | NULL, /* gc_free_repr_data */ |
346 | | compose, |
347 | | NULL, /* spesh */ |
348 | | "NFA", /* name */ |
349 | | MVM_REPR_ID_NFA, |
350 | | unmanaged_size, |
351 | | NULL, /* describe_refs */ |
352 | | }; |
353 | | |
354 | | /* We may be provided a grapheme as a codepoint for non-synthetics, or as a |
355 | | * 1-char string for synthetics. */ |
356 | 125k | static MVMGrapheme32 get_grapheme(MVMThreadContext *tc, MVMObject *obj) { |
357 | 125k | /* Handle null and non-concrete case. */ |
358 | 125k | if (MVM_UNLIKELY(MVM_is_null(tc, obj) || !IS_CONCRETE(obj))) { |
359 | 0 | MVM_exception_throw_adhoc(tc, |
360 | 0 | "NFA must be provided with a concrete string or integer for graphemes"); |
361 | 0 | } |
362 | 125k | |
363 | 125k | /* Otherwise, guess something appropriate. */ |
364 | 125k | else { |
365 | 125k | const MVMStorageSpec *ss = REPR(obj)->get_storage_spec(tc, STABLE(obj)); |
366 | 125k | if (ss->can_box & MVM_STORAGE_SPEC_CAN_BOX_INT) |
367 | 125k | return REPR(obj)->box_funcs.get_int(tc, STABLE(obj), obj, OBJECT_BODY(obj)); |
368 | 0 | else if (ss->can_box & MVM_STORAGE_SPEC_CAN_BOX_STR) |
369 | 0 | return MVM_string_get_grapheme_at(tc, |
370 | 0 | REPR(obj)->box_funcs.get_str(tc, STABLE(obj), obj, OBJECT_BODY(obj)), |
371 | 0 | 0); |
372 | 0 | else |
373 | 0 | MVM_exception_throw_adhoc(tc, |
374 | 0 | "NFA must be provided with a string or integer for graphemes"); |
375 | 125k | } |
376 | 125k | } |
377 | | |
378 | 5.20k | MVMObject * MVM_nfa_from_statelist(MVMThreadContext *tc, MVMObject *states, MVMObject *nfa_type) { |
379 | 5.20k | MVMObject *nfa_obj; |
380 | 5.20k | MVMNFABody *nfa; |
381 | 5.20k | MVMint64 i, j, num_states; |
382 | 5.20k | |
383 | 5.20k | MVMROOT2(tc, states, nfa_type, { |
384 | 5.20k | /* Create NFA object. */ |
385 | 5.20k | nfa_obj = MVM_repr_alloc_init(tc, nfa_type); |
386 | 5.20k | nfa = (MVMNFABody *)OBJECT_BODY(nfa_obj); |
387 | 5.20k | |
388 | 5.20k | /* The first state entry is the fates list. */ |
389 | 5.20k | nfa->fates = MVM_repr_at_pos_o(tc, states, 0); |
390 | 5.20k | |
391 | 5.20k | /* Go over the rest and convert to the NFA. */ |
392 | 5.20k | num_states = MVM_repr_elems(tc, states) - 1; |
393 | 5.20k | nfa->num_states = num_states; |
394 | 5.20k | if (num_states > 0) { |
395 | 5.20k | nfa->num_state_edges = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, num_states * sizeof(MVMint64)); |
396 | 5.20k | nfa->states = MVM_fixed_size_alloc_zeroed(tc, tc->instance->fsa, num_states * sizeof(MVMNFAStateInfo *)); |
397 | 5.20k | } |
398 | 5.20k | for (i = 0; i < num_states; i++) { |
399 | 5.20k | MVMObject *edge_info = MVM_repr_at_pos_o(tc, states, i + 1); |
400 | 5.20k | MVMint64 elems = MVM_repr_elems(tc, edge_info); |
401 | 5.20k | MVMint64 edges = elems / 3; |
402 | 5.20k | MVMint64 cur_edge = 0; |
403 | 5.20k | |
404 | 5.20k | nfa->num_state_edges[i] = edges; |
405 | 5.20k | if (edges > 0) { |
406 | 5.20k | nfa->states[i] = MVM_fixed_size_alloc(tc, tc->instance->fsa, edges * sizeof(MVMNFAStateInfo)); |
407 | 5.20k | } |
408 | 5.20k | |
409 | 5.20k | for (j = 0; j < elems; j += 3) { |
410 | 5.20k | MVMint64 act = MVM_coerce_simple_intify(tc, |
411 | 5.20k | MVM_repr_at_pos_o(tc, edge_info, j)); |
412 | 5.20k | MVMint64 to = MVM_coerce_simple_intify(tc, |
413 | 5.20k | MVM_repr_at_pos_o(tc, edge_info, j + 2)); |
414 | 5.20k | if (to <= 0 && act != MVM_NFA_EDGE_FATE) |
415 | 5.20k | MVM_exception_throw_adhoc(tc, "Invalid to edge %"PRId64" in NFA statelist", to); |
416 | 5.20k | |
417 | 5.20k | nfa->states[i][cur_edge].act = act; |
418 | 5.20k | nfa->states[i][cur_edge].to = to; |
419 | 5.20k | |
420 | 5.20k | switch (act & 0xff) { |
421 | 5.20k | case MVM_NFA_EDGE_FATE: |
422 | 5.20k | nfa->states[i][cur_edge].arg.i = MVM_coerce_simple_intify(tc, |
423 | 5.20k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
424 | 5.20k | break; |
425 | 5.20k | case MVM_NFA_EDGE_CODEPOINT: |
426 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_LL: |
427 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_NEG: |
428 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_M: |
429 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_M_NEG: |
430 | 5.20k | nfa->states[i][cur_edge].arg.g = get_grapheme(tc, |
431 | 5.20k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
432 | 5.20k | break; |
433 | 5.20k | case MVM_NFA_EDGE_CHARCLASS: |
434 | 5.20k | case MVM_NFA_EDGE_CHARCLASS_NEG: |
435 | 5.20k | nfa->states[i][cur_edge].arg.i = MVM_coerce_simple_intify(tc, |
436 | 5.20k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
437 | 5.20k | break; |
438 | 5.20k | case MVM_NFA_EDGE_CHARLIST: |
439 | 5.20k | case MVM_NFA_EDGE_CHARLIST_NEG: |
440 | 5.20k | MVM_ASSIGN_REF(tc, &(nfa_obj->header), |
441 | 5.20k | nfa->states[i][cur_edge].arg.s, |
442 | 5.20k | MVM_repr_get_str(tc, MVM_repr_at_pos_o(tc, edge_info, j + 1))); |
443 | 5.20k | break; |
444 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_I: |
445 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
446 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
447 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_IM: |
448 | 5.20k | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
449 | 5.20k | /* That is not about uppercase/lowercase here, but lower and upper bounds |
450 | 5.20k | of our range. */ |
451 | 5.20k | case MVM_NFA_EDGE_CHARRANGE: |
452 | 5.20k | case MVM_NFA_EDGE_CHARRANGE_NEG: |
453 | 5.20k | case MVM_NFA_EDGE_CHARRANGE_M: |
454 | 5.20k | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
455 | 5.20k | MVMObject *arg = MVM_repr_at_pos_o(tc, edge_info, j + 1); |
456 | 5.20k | nfa->states[i][cur_edge].arg.uclc.lc = MVM_coerce_simple_intify(tc, |
457 | 5.20k | MVM_repr_at_pos_o(tc, arg, 0)); |
458 | 5.20k | nfa->states[i][cur_edge].arg.uclc.uc = MVM_coerce_simple_intify(tc, |
459 | 5.20k | MVM_repr_at_pos_o(tc, arg, 1)); |
460 | 5.20k | break; |
461 | 5.20k | } |
462 | 5.20k | } |
463 | 5.20k | |
464 | 5.20k | cur_edge++; |
465 | 5.20k | } |
466 | 5.20k | } |
467 | 5.20k | }); |
468 | 5.20k | |
469 | 5.20k | sort_states_and_add_synth_cp_node(tc, nfa); |
470 | 5.20k | |
471 | 5.20k | return nfa_obj; |
472 | 5.20k | } |
473 | | |
474 | | /* Does a run of the NFA. Produces a list of integers indicating the |
475 | | * chosen ordering. */ |
476 | 6.52M | static MVMint32 in_done(MVMuint32 *done, MVMuint32 numdone, MVMuint32 st) { |
477 | 6.52M | MVMuint32 i = 0; |
478 | 26.7M | for (i = 0; i < numdone; i++) |
479 | 20.2M | if (done[i] == st) |
480 | 12.3k | return 1; |
481 | 6.51M | return 0; |
482 | 6.52M | } |
483 | | |
484 | 713k | static MVMint64 * nqp_nfa_run(MVMThreadContext *tc, MVMNFABody *nfa, MVMString *target, MVMint64 offset, MVMint64 *total_fates_out) { |
485 | 713k | MVMint64 eos = MVM_string_graphs(tc, target); |
486 | 713k | MVMuint32 gen = 1; |
487 | 713k | MVMint64 numcur = 0; |
488 | 713k | MVMint64 numnext = 0; |
489 | 713k | MVMint64 numdone = 0; |
490 | 713k | MVMuint32 *done, *curst, *nextst; |
491 | 713k | MVMint64 *fates, *longlit; |
492 | 713k | MVMint64 i, fate_arr_len, num_states, total_fates, prev_fates, usedlonglit; |
493 | 713k | MVMint64 orig_offset = offset; |
494 | 713k | /* We used a cached grapheme iterator since we often request the same |
495 | 713k | * grapheme multiple times, most common after that is requesting the next |
496 | 713k | * grapheme. */ |
497 | 713k | MVMGraphemeIter_cached gic; |
498 | 713k | int nfadeb = tc->instance->nfa_debug_enabled; |
499 | 713k | |
500 | 713k | /* Obtain or (re)allocate "done states", "current states" and "next |
501 | 713k | * states" arrays. */ |
502 | 713k | num_states = nfa->num_states; |
503 | 713k | if (tc->nfa_alloc_states < num_states) { |
504 | 287 | size_t alloc = (num_states + 1) * sizeof(MVMuint32); |
505 | 287 | tc->nfa_done = (MVMuint32 *)MVM_realloc(tc->nfa_done, alloc); |
506 | 287 | tc->nfa_curst = (MVMuint32 *)MVM_realloc(tc->nfa_curst, alloc); |
507 | 287 | tc->nfa_nextst = (MVMuint32 *)MVM_realloc(tc->nfa_nextst, alloc); |
508 | 287 | tc->nfa_alloc_states = num_states; |
509 | 287 | } |
510 | 713k | done = tc->nfa_done; |
511 | 713k | curst = tc->nfa_curst; |
512 | 713k | nextst = tc->nfa_nextst; |
513 | 713k | |
514 | 713k | /* Allocate fates array. */ |
515 | 713k | fate_arr_len = 1 + MVM_repr_elems(tc, nfa->fates); |
516 | 713k | if (tc->nfa_fates_len < fate_arr_len) { |
517 | 587 | tc->nfa_fates = (MVMint64 *)MVM_realloc(tc->nfa_fates, sizeof(MVMint64) * fate_arr_len); |
518 | 587 | tc->nfa_fates_len = fate_arr_len; |
519 | 587 | } |
520 | 713k | fates = tc->nfa_fates; |
521 | 713k | total_fates = 0; |
522 | 713k | if (MVM_UNLIKELY(nfadeb)) fprintf(stderr,"======================================\nStarting with %d fates in %d states\n", (int)fate_arr_len, (int)num_states) ; |
523 | 713k | |
524 | 713k | /* longlit will be updated on a fate whenever NFA passes through final char of a literal. */ |
525 | 713k | /* These edges are specially marked to indicate which fate they influence the fate of. */ |
526 | 713k | if (tc->nfa_longlit_len < fate_arr_len) { |
527 | 587 | tc->nfa_longlit = (MVMint64 *)MVM_realloc(tc->nfa_longlit, sizeof(MVMint64) * fate_arr_len); |
528 | 587 | tc->nfa_longlit_len = fate_arr_len; |
529 | 587 | } |
530 | 713k | longlit = tc->nfa_longlit; |
531 | 713k | usedlonglit = 0; |
532 | 713k | |
533 | 713k | nextst[numnext++] = 1; |
534 | 713k | /* In NFA could be called with a string that has 0 graphemes in it. Guard |
535 | 713k | * against this so we don't init for the empty string. */ |
536 | 713k | if (target->body.num_graphs) MVM_string_gi_cached_init(tc, &gic, target, 0); |
537 | 713k | |
538 | 2.32M | while (numnext && offset <= eos) { |
539 | 1.61M | /* Swap next and current */ |
540 | 1.61M | MVMuint32 *temp = curst; |
541 | 1.61M | curst = nextst; |
542 | 1.61M | nextst = temp; |
543 | 1.61M | numcur = numnext; |
544 | 1.61M | numnext = 0; |
545 | 1.61M | numdone = 0; |
546 | 1.61M | |
547 | 1.61M | /* Save how many fates we have before this position is considered. */ |
548 | 1.61M | prev_fates = total_fates; |
549 | 1.61M | |
550 | 1.61M | if (MVM_UNLIKELY(nfadeb)) { |
551 | 0 | if (offset < eos) { |
552 | 0 | MVMGrapheme32 cp = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
553 | 0 | fprintf(stderr,"%c with %ds target %lx offset %"PRId64"\n",cp,(int)numcur, (long)target, offset); |
554 | 0 | } |
555 | 0 | else { |
556 | 0 | fprintf(stderr,"EOS with %ds\n",(int)numcur); |
557 | 0 | } |
558 | 0 | } |
559 | 5.93M | while (numcur) { |
560 | 4.32M | MVMNFAStateInfo *edge_info; |
561 | 4.32M | MVMint64 edge_info_elems; |
562 | 4.32M | |
563 | 4.32M | MVMint64 st = curst[--numcur]; |
564 | 4.32M | if (st <= num_states) { |
565 | 4.32M | if (in_done(done, numdone, st)) |
566 | 5.89k | continue; |
567 | 4.31M | done[numdone++] = st; |
568 | 4.31M | } |
569 | 4.32M | |
570 | 4.31M | edge_info = nfa->states[st - 1]; |
571 | 4.31M | edge_info_elems = nfa->num_state_edges[st - 1]; |
572 | 4.31M | if (MVM_UNLIKELY(nfadeb)) |
573 | 0 | fprintf(stderr,"\t%"PRIi64"\t%"PRIi64"\t", st, edge_info_elems); |
574 | 15.0M | for (i = 0; i < edge_info_elems; i++) { |
575 | 10.7M | MVMint64 act = edge_info[i].act; |
576 | 10.7M | MVMint64 to = edge_info[i].to; |
577 | 10.7M | |
578 | 10.7M | /* All the special cases are under one test. */ |
579 | 10.7M | if (act <= MVM_NFA_EDGE_EPSILON) { |
580 | 4.57M | if (act < 0) { |
581 | 1.45M | /* Negative indicates a fate is encoded in the act of the codepoint edge. */ |
582 | 1.45M | /* These will redispatch to one of the _LL cases below */ |
583 | 1.45M | act &= 0xff; |
584 | 1.45M | } |
585 | 3.12M | else if (act == MVM_NFA_EDGE_FATE) { |
586 | 926k | /* Crossed a fate edge. Check if we already saw this fate, and |
587 | 926k | * if so remove the entry so we can re-add at the new token length. */ |
588 | 926k | MVMint64 arg = edge_info[i].arg.i; |
589 | 926k | MVMint64 j; |
590 | 926k | MVMint64 found_fate = 0; |
591 | 926k | if (MVM_UNLIKELY(nfadeb)) |
592 | 0 | fprintf(stderr, "fate(%016llx) ", (long long unsigned int)arg); |
593 | 1.75M | for (j = 0; j < total_fates; j++) { |
594 | 826k | if (found_fate) |
595 | 80.4k | fates[j - found_fate] = fates[j]; |
596 | 826k | if ((fates[j] & 0xffffff) == arg) { |
597 | 572k | found_fate++; |
598 | 572k | if (j < prev_fates) |
599 | 560k | prev_fates--; |
600 | 572k | } |
601 | 826k | } |
602 | 926k | total_fates -= found_fate; |
603 | 926k | if (arg < usedlonglit) |
604 | 155k | arg -= longlit[arg] << 24; |
605 | 926k | if (MVM_UNLIKELY(++total_fates > fate_arr_len)) { |
606 | 0 | /* should never happen if nfa->fates is correct and dedup above works right */ |
607 | 0 | fprintf(stderr, "oops adding %016llx to\n", (long long unsigned int)arg); |
608 | 0 | for (j = 0; j < total_fates - 1; j++) { |
609 | 0 | fprintf(stderr, " %016llx\n", (long long unsigned int)fates[j]); |
610 | 0 | } |
611 | 0 | fate_arr_len = total_fates + 10; |
612 | 0 | tc->nfa_fates = (MVMint64 *)MVM_realloc(tc->nfa_fates, |
613 | 0 | sizeof(MVMint64) * fate_arr_len); |
614 | 0 | tc->nfa_fates_len = fate_arr_len; |
615 | 0 | fates = tc->nfa_fates; |
616 | 0 | } |
617 | 926k | /* a small insertion sort */ |
618 | 926k | j = total_fates - 1; |
619 | 982k | while (--j >= prev_fates && fates[j] < arg) { |
620 | 56.4k | fates[j + 1] = fates[j]; |
621 | 56.4k | } |
622 | 926k | fates[++j] = arg; |
623 | 926k | continue; |
624 | 926k | } |
625 | 2.20M | else if (act == MVM_NFA_EDGE_EPSILON && to <= num_states && |
626 | 2.20M | !in_done(done, numdone, to)) { |
627 | 2.19M | if (to) |
628 | 2.19M | curst[numcur++] = to; |
629 | 0 | else if (MVM_UNLIKELY(nfadeb)) /* XXX should turn into a "can't happen" after rebootstrap */ |
630 | 0 | fprintf(stderr, " oops, ignoring epsilon to 0\n"); |
631 | 2.19M | continue; |
632 | 2.19M | } |
633 | 4.57M | } |
634 | 10.7M | |
635 | 7.64M | if (eos <= offset) { |
636 | 24.0k | /* Can't match, so drop state. */ |
637 | 24.0k | continue; |
638 | 24.0k | } |
639 | 7.62M | else { |
640 | 7.62M | switch (act) { |
641 | 1.45M | case MVM_NFA_EDGE_CODEPOINT_LL: { |
642 | 1.45M | const MVMGrapheme32 arg = edge_info[i].arg.g; |
643 | 1.45M | if (MVM_string_gi_cached_get_grapheme(tc, &gic, offset) == arg) { |
644 | 66.9k | MVMint64 fate = (edge_info[i].act >> 8) & 0xfffff; |
645 | 66.9k | nextst[numnext++] = to; |
646 | 388k | while (usedlonglit <= fate) |
647 | 321k | longlit[usedlonglit++] = 0; |
648 | 66.9k | longlit[fate] = offset - orig_offset + 1; |
649 | 66.9k | if (MVM_UNLIKELY(nfadeb)) |
650 | 0 | fprintf(stderr, "%d->%d ", (int)i, (int)to); |
651 | 66.9k | } |
652 | 1.45M | continue; |
653 | 1.45M | } |
654 | 2.40M | case MVM_NFA_EDGE_CODEPOINT: { |
655 | 2.40M | const MVMGrapheme32 arg = edge_info[i].arg.g; |
656 | 2.40M | if (MVM_string_gi_cached_get_grapheme(tc, &gic, offset) == arg) { |
657 | 312k | nextst[numnext++] = to; |
658 | 312k | if (MVM_UNLIKELY(nfadeb)) |
659 | 0 | fprintf(stderr, "%d->%d ", (int)i, (int)to); |
660 | 312k | } |
661 | 2.40M | continue; |
662 | 1.45M | } |
663 | 0 | case MVM_NFA_EDGE_CODEPOINT_NEG: { |
664 | 0 | const MVMGrapheme32 arg = edge_info[i].arg.g; |
665 | 0 | if (MVM_string_gi_cached_get_grapheme(tc, &gic, offset) != arg) |
666 | 0 | nextst[numnext++] = to; |
667 | 0 | continue; |
668 | 1.45M | } |
669 | 1.09M | case MVM_NFA_EDGE_CHARCLASS: { |
670 | 1.09M | const MVMint64 arg = edge_info[i].arg.i; |
671 | 1.09M | if (MVM_string_grapheme_is_cclass(tc, arg, MVM_string_gi_cached_get_grapheme(tc, &gic, offset))) |
672 | 565k | nextst[numnext++] = to; |
673 | 1.09M | continue; |
674 | 1.45M | } |
675 | 110k | case MVM_NFA_EDGE_CHARCLASS_NEG: { |
676 | 110k | const MVMint64 arg = edge_info[i].arg.i; |
677 | 110k | if (!MVM_string_grapheme_is_cclass(tc, arg, MVM_string_gi_cached_get_grapheme(tc, &gic, offset))) |
678 | 106k | nextst[numnext++] = to; |
679 | 110k | continue; |
680 | 1.45M | } |
681 | 2.15M | case MVM_NFA_EDGE_CHARLIST: { |
682 | 2.15M | MVMString *arg = edge_info[i].arg.s; |
683 | 2.15M | MVMGrapheme32 cp = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
684 | 2.15M | if (MVM_string_index_of_grapheme(tc, arg, cp) >= 0) |
685 | 295k | nextst[numnext++] = to; |
686 | 2.15M | continue; |
687 | 1.45M | } |
688 | 2.91k | case MVM_NFA_EDGE_CHARLIST_NEG: { |
689 | 2.91k | MVMString *arg = edge_info[i].arg.s; |
690 | 2.91k | const MVMGrapheme32 cp = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
691 | 2.91k | if (MVM_string_index_of_grapheme(tc, arg, cp) < 0) |
692 | 2.32k | nextst[numnext++] = to; |
693 | 2.91k | continue; |
694 | 1.45M | } |
695 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: { |
696 | 0 | const MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
697 | 0 | const MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
698 | 0 | const MVMGrapheme32 ord = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
699 | 0 | if (ord == lc_arg || ord == uc_arg) { |
700 | 0 | MVMint64 fate = (edge_info[i].act >> 8) & 0xfffff; |
701 | 0 | nextst[numnext++] = to; |
702 | 0 | while (usedlonglit <= fate) |
703 | 0 | longlit[usedlonglit++] = 0; |
704 | 0 | longlit[fate] = offset - orig_offset + 1; |
705 | 0 | } |
706 | 0 | continue; |
707 | 1.45M | } |
708 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: { |
709 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
710 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
711 | 0 | MVMGrapheme32 ord = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
712 | 0 | if (ord == lc_arg || ord == uc_arg) |
713 | 0 | nextst[numnext++] = to; |
714 | 0 | continue; |
715 | 1.45M | } |
716 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: { |
717 | 0 | const MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
718 | 0 | const MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
719 | 0 | const MVMGrapheme32 ord = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
720 | 0 | if (ord != lc_arg && ord != uc_arg) |
721 | 0 | nextst[numnext++] = to; |
722 | 0 | continue; |
723 | 1.45M | } |
724 | 12.3k | case MVM_NFA_EDGE_CHARRANGE: { |
725 | 12.3k | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
726 | 12.3k | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
727 | 12.3k | MVMGrapheme32 ord = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
728 | 12.3k | if (ord >= lc_arg && ord <= uc_arg) |
729 | 1.37k | nextst[numnext++] = to; |
730 | 12.3k | continue; |
731 | 1.45M | } |
732 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: { |
733 | 0 | const MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
734 | 0 | const MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
735 | 0 | const MVMGrapheme32 ord = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
736 | 0 | if (ord < lc_arg || uc_arg < ord) |
737 | 0 | nextst[numnext++] = to; |
738 | 0 | continue; |
739 | 1.45M | } |
740 | 0 | case MVM_NFA_EDGE_SUBRULE: |
741 | 0 | if (MVM_UNLIKELY(nfadeb)) |
742 | 0 | fprintf(stderr, "IGNORING SUBRULE\n"); |
743 | 0 | continue; |
744 | 0 | case MVM_NFA_EDGE_CODEPOINT_M: |
745 | 0 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
746 | 0 | MVMNormalizer norm; |
747 | 0 | MVMint32 ready; |
748 | 0 | MVMGrapheme32 ga = edge_info[i].arg.g; |
749 | 0 | MVMGrapheme32 gb = MVM_string_ord_basechar_at(tc, target, offset); |
750 | 0 |
|
751 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
752 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, ga, &ga); |
753 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
754 | 0 | if (!ready) |
755 | 0 | ga = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
756 | 0 |
|
757 | 0 | if (((act == MVM_NFA_EDGE_CODEPOINT_M) && (ga == gb)) |
758 | 0 | || ((act == MVM_NFA_EDGE_CODEPOINT_M_NEG) && (ga != gb))) |
759 | 0 | nextst[numnext++] = to; |
760 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
761 | 0 | continue; |
762 | 0 | } |
763 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
764 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: { |
765 | 0 | MVMNormalizer norm; |
766 | 0 | MVMint32 ready; |
767 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
768 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
769 | 0 | const MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
770 | 0 |
|
771 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
772 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, uc_arg, &uc_arg); |
773 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
774 | 0 | if (!ready) |
775 | 0 | uc_arg = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
776 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
777 | 0 |
|
778 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
779 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, lc_arg, &lc_arg); |
780 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
781 | 0 | if (!ready) |
782 | 0 | lc_arg = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
783 | 0 |
|
784 | 0 | if (((act == MVM_NFA_EDGE_CODEPOINT_IM) && (ord == lc_arg || ord == uc_arg)) |
785 | 0 | || ((act == MVM_NFA_EDGE_CODEPOINT_IM_NEG) && (ord != lc_arg && ord != uc_arg))) |
786 | 0 | nextst[numnext++] = to; |
787 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
788 | 0 | continue; |
789 | 0 | } |
790 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: { |
791 | 0 | const MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
792 | 0 | const MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
793 | 0 | const MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
794 | 0 | if (ord >= lc_arg && ord <= uc_arg) |
795 | 0 | nextst[numnext++] = to; |
796 | 0 | continue; |
797 | 0 | } |
798 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
799 | 0 | const MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
800 | 0 | const MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
801 | 0 | const MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
802 | 0 | if (ord < lc_arg || uc_arg < ord) |
803 | 0 | nextst[numnext++] = to; |
804 | 0 | continue; |
805 | 0 | } |
806 | 380k | case MVM_NFA_EDGE_SYNTH_CP_COUNT: { |
807 | 380k | /* Binary search the edges ahead for the grapheme. */ |
808 | 380k | const MVMGrapheme32 search = MVM_string_gi_cached_get_grapheme(tc, &gic, offset); |
809 | 380k | const MVMint64 num_possibilities = edge_info[i].arg.i; |
810 | 380k | const MVMint64 end = i + num_possibilities; |
811 | 380k | MVMint64 l = i + 1; |
812 | 380k | MVMint64 r = end; |
813 | 380k | MVMint64 found = -1; |
814 | 1.61M | while (l <= r) { |
815 | 1.27M | const MVMint64 m = l + (r - l) / 2; |
816 | 1.27M | const MVMGrapheme32 test = edge_info[m].arg.g; |
817 | 1.27M | if (test == search) { |
818 | 39.8k | /* We found it, but important we get the first edge |
819 | 39.8k | * that matches. */ |
820 | 39.8k | found = m; |
821 | 55.7k | while (found > i + 1 && edge_info[found - 1].arg.g == search) |
822 | 15.8k | found--; |
823 | 39.8k | break; |
824 | 39.8k | } |
825 | 1.23M | if (test < search) |
826 | 675k | l = m + 1; |
827 | 1.23M | else |
828 | 562k | r = m - 1; |
829 | 1.23M | } |
830 | 380k | if (found == -1) { |
831 | 340k | /* Binary search failed to find a match, so just skip all |
832 | 340k | * the nodes. */ |
833 | 340k | i += num_possibilities; |
834 | 340k | } |
835 | 39.8k | else { |
836 | 39.8k | /* Add all states that match. */ |
837 | 100k | while (found <= end && edge_info[found].arg.g == search) { |
838 | 60.4k | to = edge_info[found].to; |
839 | 60.4k | if (edge_info[found].act == MVM_NFA_EDGE_CODEPOINT) { |
840 | 60.4k | nextst[numnext++] = to; |
841 | 60.4k | if (MVM_UNLIKELY(nfadeb)) |
842 | 0 | fprintf(stderr, "%d->%d ", (int)found, (int)to); |
843 | 60.4k | } |
844 | 0 | else { |
845 | 0 | const MVMint64 fate = (edge_info[found].act >> 8) & 0xfffff; |
846 | 0 | nextst[numnext++] = to; |
847 | 0 | while (usedlonglit <= fate) |
848 | 0 | longlit[usedlonglit++] = 0; |
849 | 0 | longlit[fate] = offset - orig_offset + 1; |
850 | 0 | if (MVM_UNLIKELY(nfadeb)) |
851 | 0 | fprintf(stderr, "%d->%d ", (int)found, (int)to); |
852 | 0 | } |
853 | 60.4k | found++; |
854 | 60.4k | } |
855 | 39.8k | i += num_possibilities; |
856 | 39.8k | } |
857 | 380k | break; |
858 | 0 | } |
859 | 7.62M | } |
860 | 7.62M | } |
861 | 7.64M | } |
862 | 4.31M | if (MVM_UNLIKELY(nfadeb)) fprintf(stderr,"\n"); |
863 | 4.31M | } |
864 | 1.61M | |
865 | 1.61M | /* Move to next character and generation. */ |
866 | 1.61M | offset++; |
867 | 1.61M | gen++; |
868 | 1.61M | } |
869 | 713k | /* strip any literal lengths, leaving only fates */ |
870 | 713k | if (usedlonglit || nfadeb) { |
871 | 35.6k | if (MVM_UNLIKELY(nfadeb)) fprintf(stderr,"Final\n"); |
872 | 85.0k | for (i = 0; i < total_fates; i++) { |
873 | 49.3k | if (MVM_UNLIKELY(nfadeb)) fprintf(stderr, " %08llx\n", (long long unsigned int)fates[i]); |
874 | 49.3k | fates[i] &= 0xffffff; |
875 | 49.3k | } |
876 | 35.6k | } |
877 | 713k | |
878 | 713k | *total_fates_out = total_fates; |
879 | 713k | return fates; |
880 | 713k | } |
881 | | |
882 | | /* Takes an NFA, a target string in and an offset. Runs the NFA and returns |
883 | | * the order to try the fates in. */ |
884 | 322k | MVMObject * MVM_nfa_run_proto(MVMThreadContext *tc, MVMObject *nfa, MVMString *target, MVMint64 offset) { |
885 | 322k | /* Run the NFA. */ |
886 | 322k | MVMint64 total_fates, i; |
887 | 322k | MVMint64 *fates = nqp_nfa_run(tc, (MVMNFABody *)OBJECT_BODY(nfa), target, offset, &total_fates); |
888 | 322k | |
889 | 322k | /* Copy results into an integer array. */ |
890 | 322k | MVMObject *fateres = MVM_repr_alloc_init(tc, tc->instance->boot_types.BOOTIntArray); |
891 | 423k | for (i = 0; i < total_fates; i++) |
892 | 100k | MVM_repr_bind_pos_i(tc, fateres, i, fates[i]); |
893 | 322k | |
894 | 322k | return fateres; |
895 | 322k | } |
896 | | |
897 | | /* Takes an NFA, target string and offset. Runs the NFA, and uses the output |
898 | | * to update the bstack with backtracking points to try the alternation |
899 | | * branches in the correct order. The current capture stack is needed for its |
900 | | * height. */ |
901 | | void MVM_nfa_run_alt(MVMThreadContext *tc, MVMObject *nfa, MVMString *target, |
902 | 390k | MVMint64 offset, MVMObject *bstack, MVMObject *cstack, MVMObject *labels) { |
903 | 390k | /* Run the NFA. */ |
904 | 390k | MVMint64 total_fates, i; |
905 | 390k | MVMint64 *fates = nqp_nfa_run(tc, (MVMNFABody *)OBJECT_BODY(nfa), target, offset, &total_fates); |
906 | 390k | |
907 | 390k | /* Push the results onto the bstack. */ |
908 | 390k | MVMint64 caps = cstack && IS_CONCRETE(cstack) |
909 | 16.2k | ? MVM_repr_elems(tc, cstack) |
910 | 374k | : 0; |
911 | 643k | for (i = 0; i < total_fates; i++) { |
912 | 253k | MVM_repr_push_i(tc, bstack, MVM_repr_at_pos_i(tc, labels, fates[i])); |
913 | 253k | MVM_repr_push_i(tc, bstack, offset); |
914 | 253k | MVM_repr_push_i(tc, bstack, 0); |
915 | 253k | MVM_repr_push_i(tc, bstack, caps); |
916 | 253k | } |
917 | 390k | } |