/home/travis/build/MoarVM/MoarVM/src/6model/reprs/NFA.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
3 | | /* This representation's function pointer table. */ |
4 | | static const MVMREPROps NFA_this_repr; |
5 | | |
6 | | /* Creates a new type object of this representation, and associates it with |
7 | | * the given HOW. */ |
8 | 2 | static MVMObject * type_object_for(MVMThreadContext *tc, MVMObject *HOW) { |
9 | 2 | MVMSTable *st = MVM_gc_allocate_stable(tc, &NFA_this_repr, HOW); |
10 | 2 | |
11 | 2 | MVMROOT(tc, st, { |
12 | 2 | MVMObject *obj = MVM_gc_allocate_type_object(tc, st); |
13 | 2 | MVM_ASSIGN_REF(tc, &(st->header), st->WHAT, obj); |
14 | 2 | st->size = sizeof(MVMNFA); |
15 | 2 | }); |
16 | 2 | |
17 | 2 | return st->WHAT; |
18 | 2 | } |
19 | | |
20 | | /* Copies the body of one object to another. */ |
21 | 0 | static void copy_to(MVMThreadContext *tc, MVMSTable *st, void *src, MVMObject *dest_root, void *dest) { |
22 | 0 | MVM_exception_throw_adhoc(tc, "Cannot copy object with representation NFA"); |
23 | 0 | } |
24 | | |
25 | | /* Called by the VM to mark any GCable items. */ |
26 | 2.49k | static void gc_mark(MVMThreadContext *tc, MVMSTable *st, void *data, MVMGCWorklist *worklist) { |
27 | 2.49k | MVMNFABody *lb = (MVMNFABody *)data; |
28 | 2.49k | MVMint64 i, j; |
29 | 2.49k | |
30 | 2.49k | MVM_gc_worklist_add(tc, worklist, &lb->fates); |
31 | 2.49k | |
32 | 80.4k | for (i = 0; i < lb->num_states; i++) { |
33 | 77.9k | MVMint64 edges = lb->num_state_edges[i]; |
34 | 208k | for (j = 0; j < edges; j++) { |
35 | 130k | switch (lb->states[i][j].act) { |
36 | 15.0k | case MVM_NFA_EDGE_CHARLIST: |
37 | 15.0k | case MVM_NFA_EDGE_CHARLIST_NEG: |
38 | 15.0k | MVM_gc_worklist_add(tc, worklist, &lb->states[i][j].arg.s); |
39 | 130k | } |
40 | 130k | } |
41 | 77.9k | } |
42 | 2.49k | } |
43 | | |
44 | | /* Called by the VM in order to free memory associated with this object. */ |
45 | 0 | static void gc_free(MVMThreadContext *tc, MVMObject *obj) { |
46 | 0 | MVMNFA *nfa = (MVMNFA *)obj; |
47 | 0 | MVMint64 i; |
48 | 0 | for (i = 0; i < nfa->body.num_states; i++) |
49 | 0 | if (nfa->body.num_state_edges[i]) |
50 | 0 | MVM_free(nfa->body.states[i]); |
51 | 0 | MVM_free(nfa->body.states); |
52 | 0 | MVM_free(nfa->body.num_state_edges); |
53 | 0 | } |
54 | | |
55 | | |
56 | | static const MVMStorageSpec storage_spec = { |
57 | | MVM_STORAGE_SPEC_REFERENCE, /* inlineable */ |
58 | | 0, /* bits */ |
59 | | 0, /* align */ |
60 | | MVM_STORAGE_SPEC_BP_NONE, /* boxed_primitive */ |
61 | | 0, /* can_box */ |
62 | | 0, /* is_unsigned */ |
63 | | }; |
64 | | |
65 | | |
66 | | /* Gets the storage specification for this representation. */ |
67 | 0 | static const MVMStorageSpec * get_storage_spec(MVMThreadContext *tc, MVMSTable *st) { |
68 | 0 | return &storage_spec; |
69 | 0 | } |
70 | | |
71 | | /* Serializes the data. */ |
72 | 1 | static void serialize(MVMThreadContext *tc, MVMSTable *st, void *data, MVMSerializationWriter *writer) { |
73 | 1 | MVMNFABody *body = (MVMNFABody *)data; |
74 | 1 | MVMint64 i, j; |
75 | 1 | |
76 | 1 | /* Write fates. */ |
77 | 1 | MVM_serialization_write_ref(tc, writer, body->fates); |
78 | 1 | |
79 | 1 | /* Write number of states. */ |
80 | 1 | MVM_serialization_write_int(tc, writer, body->num_states); |
81 | 1 | |
82 | 1 | /* Write state edge list counts. */ |
83 | 5 | for (i = 0; i < body->num_states; i++) |
84 | 4 | MVM_serialization_write_int(tc, writer, body->num_state_edges[i]); |
85 | 1 | |
86 | 1 | /* Write state graph. */ |
87 | 5 | for (i = 0; i < body->num_states; i++) { |
88 | 8 | for (j = 0; j < body->num_state_edges[i]; j++) { |
89 | 4 | MVM_serialization_write_int(tc, writer, body->states[i][j].act); |
90 | 4 | MVM_serialization_write_int(tc, writer, body->states[i][j].to); |
91 | 4 | switch (body->states[i][j].act & 0xff) { |
92 | 1 | case MVM_NFA_EDGE_FATE: |
93 | 1 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.i); |
94 | 1 | break; |
95 | 3 | case MVM_NFA_EDGE_CODEPOINT: |
96 | 3 | case MVM_NFA_EDGE_CODEPOINT_LL: |
97 | 3 | case MVM_NFA_EDGE_CODEPOINT_NEG: |
98 | 3 | case MVM_NFA_EDGE_CODEPOINT_M: |
99 | 3 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
100 | 3 | MVMGrapheme32 g = body->states[i][j].arg.g; |
101 | 3 | if (g >= 0) { |
102 | 3 | /* Non-synthetic. */ |
103 | 3 | MVM_serialization_write_int(tc, writer, g); |
104 | 3 | } |
105 | 0 | else { |
106 | 0 | /* Synthetic. Write the number of codepoints negated, |
107 | 0 | * and then each of the codepoints. */ |
108 | 0 | MVMNFGSynthetic *si = MVM_nfg_get_synthetic_info(tc, g); |
109 | 0 | MVMint32 k; |
110 | 0 | MVM_serialization_write_int(tc, writer, -(si->num_combs + 1)); |
111 | 0 | MVM_serialization_write_int(tc, writer, si->base); |
112 | 0 | for (k = 0; k < si->num_combs; k++) |
113 | 0 | MVM_serialization_write_int(tc, writer, si->combs[k]); |
114 | 0 | } |
115 | 3 | break; |
116 | 3 | } |
117 | 0 | case MVM_NFA_EDGE_CHARCLASS: |
118 | 0 | case MVM_NFA_EDGE_CHARCLASS_NEG: |
119 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.i); |
120 | 0 | break; |
121 | 0 | case MVM_NFA_EDGE_CHARLIST: |
122 | 0 | case MVM_NFA_EDGE_CHARLIST_NEG: |
123 | 0 | MVM_serialization_write_str(tc, writer, body->states[i][j].arg.s); |
124 | 0 | break; |
125 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: |
126 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
127 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
128 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
129 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
130 | 0 | case MVM_NFA_EDGE_CHARRANGE: |
131 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: |
132 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: |
133 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
134 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.uclc.lc); |
135 | 0 | MVM_serialization_write_int(tc, writer, body->states[i][j].arg.uclc.uc); |
136 | 0 | break; |
137 | 0 | } |
138 | 4 | } |
139 | 4 | } |
140 | 4 | } |
141 | 1 | } |
142 | | |
143 | | /* Deserializes the data. */ |
144 | 1 | static void deserialize(MVMThreadContext *tc, MVMSTable *st, MVMObject *root, void *data, MVMSerializationReader *reader) { |
145 | 1 | MVMNFABody *body = (MVMNFABody *)data; |
146 | 1 | MVMint64 i, j; |
147 | 1 | |
148 | 1 | /* Read fates. */ |
149 | 1 | body->fates = MVM_serialization_read_ref(tc, reader); |
150 | 1 | |
151 | 1 | /* Read number of states. */ |
152 | 1 | body->num_states = MVM_serialization_read_int(tc, reader); |
153 | 1 | |
154 | 1 | if (body->num_states > 0) { |
155 | 1 | /* Read state edge list counts. */ |
156 | 1 | body->num_state_edges = MVM_malloc(body->num_states * sizeof(MVMint64)); |
157 | 5 | for (i = 0; i < body->num_states; i++) |
158 | 4 | body->num_state_edges[i] = MVM_serialization_read_int(tc, reader); |
159 | 1 | |
160 | 1 | /* Read state graph. */ |
161 | 1 | body->states = MVM_malloc(body->num_states * sizeof(MVMNFAStateInfo *)); |
162 | 5 | for (i = 0; i < body->num_states; i++) { |
163 | 4 | MVMint64 edges = body->num_state_edges[i]; |
164 | 4 | if (edges > 0) |
165 | 4 | body->states[i] = MVM_malloc(edges * sizeof(MVMNFAStateInfo)); |
166 | 8 | for (j = 0; j < edges; j++) { |
167 | 4 | body->states[i][j].act = MVM_serialization_read_int(tc, reader); |
168 | 4 | body->states[i][j].to = MVM_serialization_read_int(tc, reader); |
169 | 4 | switch (body->states[i][j].act & 0xff) { |
170 | 1 | case MVM_NFA_EDGE_FATE: |
171 | 1 | body->states[i][j].arg.i = MVM_serialization_read_int(tc, reader); |
172 | 1 | break; |
173 | 3 | case MVM_NFA_EDGE_CODEPOINT: |
174 | 3 | case MVM_NFA_EDGE_CODEPOINT_LL: |
175 | 3 | case MVM_NFA_EDGE_CODEPOINT_NEG: |
176 | 3 | case MVM_NFA_EDGE_CODEPOINT_M: |
177 | 3 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
178 | 3 | MVMint64 cp_or_synth_count = MVM_serialization_read_int(tc, reader); |
179 | 3 | if (cp_or_synth_count >= 0) { |
180 | 3 | body->states[i][j].arg.g = (MVMGrapheme32)cp_or_synth_count; |
181 | 3 | } |
182 | 0 | else { |
183 | 0 | MVMint32 num_codes = -cp_or_synth_count; |
184 | 0 | MVMCodepoint *codes = MVM_malloc(num_codes * sizeof(MVMCodepoint)); |
185 | 0 | MVMint32 k; |
186 | 0 | for (k = 0; k < num_codes; k++) |
187 | 0 | codes[k] = (MVMCodepoint)MVM_serialization_read_int(tc, reader); |
188 | 0 | body->states[i][j].arg.g = MVM_nfg_codes_to_grapheme(tc, codes, num_codes); |
189 | 0 | MVM_free(codes); |
190 | 0 | } |
191 | 3 | break; |
192 | 3 | } |
193 | 0 | case MVM_NFA_EDGE_CHARCLASS: |
194 | 0 | case MVM_NFA_EDGE_CHARCLASS_NEG: |
195 | 0 | body->states[i][j].arg.i = MVM_serialization_read_int(tc, reader); |
196 | 0 | break; |
197 | 0 | case MVM_NFA_EDGE_CHARLIST: |
198 | 0 | case MVM_NFA_EDGE_CHARLIST_NEG: |
199 | 0 | MVM_ASSIGN_REF(tc, &(root->header), body->states[i][j].arg.s, MVM_serialization_read_str(tc, reader)); |
200 | 0 | break; |
201 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: |
202 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
203 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
204 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
205 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
206 | 0 | case MVM_NFA_EDGE_CHARRANGE: |
207 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: |
208 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: |
209 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
210 | 0 | body->states[i][j].arg.uclc.lc = MVM_serialization_read_int(tc, reader); |
211 | 0 | body->states[i][j].arg.uclc.uc = MVM_serialization_read_int(tc, reader); |
212 | 0 | break; |
213 | 0 | } |
214 | 4 | } |
215 | 4 | } |
216 | 4 | } |
217 | 1 | } |
218 | 1 | } |
219 | | |
220 | | /* Compose the representation. */ |
221 | 2 | static void compose(MVMThreadContext *tc, MVMSTable *st, MVMObject *info) { |
222 | 2 | /* Nothing to do for this REPR. */ |
223 | 2 | } |
224 | | |
225 | | /* Set the size of the STable. */ |
226 | 130 | static void deserialize_stable_size(MVMThreadContext *tc, MVMSTable *st, MVMSerializationReader *reader) { |
227 | 130 | st->size = sizeof(MVMNFA); |
228 | 130 | } |
229 | | |
230 | | /* Calculates the non-GC-managed memory we hold on to. */ |
231 | 882 | static MVMuint64 unmanaged_size(MVMThreadContext *tc, MVMSTable *st, void *data) { |
232 | 882 | MVMNFABody *body = (MVMNFABody *)data; |
233 | 882 | MVMuint64 total; |
234 | 882 | MVMint64 i; |
235 | 882 | |
236 | 882 | total = body->num_states * sizeof(MVMint64); /* for num_state_edges */ |
237 | 882 | total += body->num_states * sizeof(MVMNFAStateInfo *); /* for states level 1 */ |
238 | 28.2k | for (i = 0; i < body->num_states; i++) |
239 | 27.3k | total += body->num_state_edges[i] * sizeof(MVMNFAStateInfo); |
240 | 882 | |
241 | 882 | return total; |
242 | 882 | } |
243 | | |
244 | | /* Initializes the representation. */ |
245 | 130 | const MVMREPROps * MVMNFA_initialize(MVMThreadContext *tc) { |
246 | 130 | return &NFA_this_repr; |
247 | 130 | } |
248 | | |
249 | | static const MVMREPROps NFA_this_repr = { |
250 | | type_object_for, |
251 | | MVM_gc_allocate_object, |
252 | | NULL, /* initialize */ |
253 | | copy_to, |
254 | | MVM_REPR_DEFAULT_ATTR_FUNCS, |
255 | | MVM_REPR_DEFAULT_BOX_FUNCS, |
256 | | MVM_REPR_DEFAULT_POS_FUNCS, |
257 | | MVM_REPR_DEFAULT_ASS_FUNCS, |
258 | | MVM_REPR_DEFAULT_ELEMS, |
259 | | get_storage_spec, |
260 | | NULL, /* change_type */ |
261 | | serialize, |
262 | | deserialize, |
263 | | NULL, /* serialize_repr_data */ |
264 | | NULL, /* deserialize_repr_data */ |
265 | | deserialize_stable_size, |
266 | | gc_mark, |
267 | | gc_free, |
268 | | NULL, /* gc_cleanup */ |
269 | | NULL, /* gc_mark_repr_data */ |
270 | | NULL, /* gc_free_repr_data */ |
271 | | compose, |
272 | | NULL, /* spesh */ |
273 | | "NFA", /* name */ |
274 | | MVM_REPR_ID_NFA, |
275 | | unmanaged_size, |
276 | | NULL, /* describe_refs */ |
277 | | }; |
278 | | |
279 | | /* We may be provided a grapheme as a codepoint for non-synthetics, or as a |
280 | | * 1-char string for synthetics. */ |
281 | 111k | static MVMGrapheme32 get_grapheme(MVMThreadContext *tc, MVMObject *obj) { |
282 | 111k | /* Handle null and non-concrete case. */ |
283 | 111k | if (MVM_is_null(tc, obj) || !IS_CONCRETE(obj)) { |
284 | 0 | MVM_exception_throw_adhoc(tc, |
285 | 0 | "NFA must be provided with a concrete string or integer for graphemes"); |
286 | 0 | } |
287 | 111k | |
288 | 111k | /* Otherwise, guess something appropriate. */ |
289 | 111k | else { |
290 | 111k | const MVMStorageSpec *ss = REPR(obj)->get_storage_spec(tc, STABLE(obj)); |
291 | 111k | if (ss->can_box & MVM_STORAGE_SPEC_CAN_BOX_INT) |
292 | 111k | return REPR(obj)->box_funcs.get_int(tc, STABLE(obj), obj, OBJECT_BODY(obj)); |
293 | 0 | else if (ss->can_box & MVM_STORAGE_SPEC_CAN_BOX_STR) |
294 | 0 | return MVM_string_get_grapheme_at(tc, |
295 | 0 | REPR(obj)->box_funcs.get_str(tc, STABLE(obj), obj, OBJECT_BODY(obj)), |
296 | 0 | 0); |
297 | 0 | else |
298 | 0 | MVM_exception_throw_adhoc(tc, |
299 | 0 | "NFA must be provided with a string or integer for graphemes"); |
300 | 111k | } |
301 | 111k | } |
302 | | |
303 | 4.56k | MVMObject * MVM_nfa_from_statelist(MVMThreadContext *tc, MVMObject *states, MVMObject *nfa_type) { |
304 | 4.56k | MVMObject *nfa_obj; |
305 | 4.56k | MVMNFABody *nfa; |
306 | 4.56k | MVMint64 i, j, num_states; |
307 | 4.56k | |
308 | 4.56k | MVMROOT(tc, states, { |
309 | 4.56k | MVMROOT(tc, nfa_type, { |
310 | 4.56k | /* Create NFA object. */ |
311 | 4.56k | nfa_obj = MVM_repr_alloc_init(tc, nfa_type); |
312 | 4.56k | nfa = (MVMNFABody *)OBJECT_BODY(nfa_obj); |
313 | 4.56k | |
314 | 4.56k | /* The first state entry is the fates list. */ |
315 | 4.56k | nfa->fates = MVM_repr_at_pos_o(tc, states, 0); |
316 | 4.56k | |
317 | 4.56k | /* Go over the rest and convert to the NFA. */ |
318 | 4.56k | num_states = MVM_repr_elems(tc, states) - 1; |
319 | 4.56k | nfa->num_states = num_states; |
320 | 4.56k | if (num_states > 0) { |
321 | 4.56k | nfa->num_state_edges = MVM_calloc(num_states, sizeof(MVMint64)); |
322 | 4.56k | nfa->states = MVM_calloc(num_states, sizeof(MVMNFAStateInfo *)); |
323 | 4.56k | } |
324 | 4.56k | for (i = 0; i < num_states; i++) { |
325 | 4.56k | MVMObject *edge_info = MVM_repr_at_pos_o(tc, states, i + 1); |
326 | 4.56k | MVMint64 elems = MVM_repr_elems(tc, edge_info); |
327 | 4.56k | MVMint64 edges = elems / 3; |
328 | 4.56k | MVMint64 cur_edge = 0; |
329 | 4.56k | |
330 | 4.56k | nfa->num_state_edges[i] = edges; |
331 | 4.56k | if (edges > 0) |
332 | 4.56k | nfa->states[i] = MVM_malloc(edges * sizeof(MVMNFAStateInfo)); |
333 | 4.56k | |
334 | 4.56k | for (j = 0; j < elems; j += 3) { |
335 | 4.56k | MVMint64 act = MVM_coerce_simple_intify(tc, |
336 | 4.56k | MVM_repr_at_pos_o(tc, edge_info, j)); |
337 | 4.56k | MVMint64 to = MVM_coerce_simple_intify(tc, |
338 | 4.56k | MVM_repr_at_pos_o(tc, edge_info, j + 2)); |
339 | 4.56k | if (to <= 0 && act != MVM_NFA_EDGE_FATE) |
340 | 4.56k | MVM_exception_throw_adhoc(tc, "Invalid to edge %"PRId64" in NFA statelist", to); |
341 | 4.56k | |
342 | 4.56k | nfa->states[i][cur_edge].act = act; |
343 | 4.56k | nfa->states[i][cur_edge].to = to; |
344 | 4.56k | |
345 | 4.56k | switch (act & 0xff) { |
346 | 4.56k | case MVM_NFA_EDGE_FATE: |
347 | 4.56k | nfa->states[i][cur_edge].arg.i = MVM_coerce_simple_intify(tc, |
348 | 4.56k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
349 | 4.56k | break; |
350 | 4.56k | case MVM_NFA_EDGE_CODEPOINT: |
351 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_LL: |
352 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_NEG: |
353 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_M: |
354 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_M_NEG: |
355 | 4.56k | nfa->states[i][cur_edge].arg.g = get_grapheme(tc, |
356 | 4.56k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
357 | 4.56k | break; |
358 | 4.56k | case MVM_NFA_EDGE_CHARCLASS: |
359 | 4.56k | case MVM_NFA_EDGE_CHARCLASS_NEG: |
360 | 4.56k | nfa->states[i][cur_edge].arg.i = MVM_coerce_simple_intify(tc, |
361 | 4.56k | MVM_repr_at_pos_o(tc, edge_info, j + 1)); |
362 | 4.56k | break; |
363 | 4.56k | case MVM_NFA_EDGE_CHARLIST: |
364 | 4.56k | case MVM_NFA_EDGE_CHARLIST_NEG: |
365 | 4.56k | MVM_ASSIGN_REF(tc, &(nfa_obj->header), |
366 | 4.56k | nfa->states[i][cur_edge].arg.s, |
367 | 4.56k | MVM_repr_get_str(tc, MVM_repr_at_pos_o(tc, edge_info, j + 1))); |
368 | 4.56k | break; |
369 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_I: |
370 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_I_LL: |
371 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_I_NEG: |
372 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_IM: |
373 | 4.56k | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: |
374 | 4.56k | /* That is not about uppercase/lowercase here, but lower and upper bounds |
375 | 4.56k | of our range. */ |
376 | 4.56k | case MVM_NFA_EDGE_CHARRANGE: |
377 | 4.56k | case MVM_NFA_EDGE_CHARRANGE_NEG: |
378 | 4.56k | case MVM_NFA_EDGE_CHARRANGE_M: |
379 | 4.56k | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
380 | 4.56k | MVMObject *arg = MVM_repr_at_pos_o(tc, edge_info, j + 1); |
381 | 4.56k | nfa->states[i][cur_edge].arg.uclc.lc = MVM_coerce_simple_intify(tc, |
382 | 4.56k | MVM_repr_at_pos_o(tc, arg, 0)); |
383 | 4.56k | nfa->states[i][cur_edge].arg.uclc.uc = MVM_coerce_simple_intify(tc, |
384 | 4.56k | MVM_repr_at_pos_o(tc, arg, 1)); |
385 | 4.56k | break; |
386 | 4.56k | } |
387 | 4.56k | } |
388 | 4.56k | |
389 | 4.56k | cur_edge++; |
390 | 4.56k | } |
391 | 4.56k | } |
392 | 4.56k | }); |
393 | 4.56k | }); |
394 | 4.56k | |
395 | 4.56k | return nfa_obj; |
396 | 4.56k | } |
397 | | |
398 | | /* Does a run of the NFA. Produces a list of integers indicating the |
399 | | * chosen ordering. */ |
400 | 577k | static MVMint64 * nqp_nfa_run(MVMThreadContext *tc, MVMNFABody *nfa, MVMString *target, MVMint64 offset, MVMint64 *total_fates_out) { |
401 | 577k | MVMint64 eos = MVM_string_graphs(tc, target); |
402 | 577k | MVMint64 gen = 1; |
403 | 577k | MVMint64 numcur = 0; |
404 | 577k | MVMint64 numnext = 0; |
405 | 577k | MVMint64 *done, *fates, *curst, *nextst, *longlit; |
406 | 577k | MVMint64 i, fate_arr_len, num_states, total_fates, prev_fates, usedlonglit; |
407 | 577k | MVMint64 orig_offset = offset; |
408 | 577k | int nfadeb = tc->instance->nfa_debug_enabled; |
409 | 577k | |
410 | 577k | /* Obtain or (re)allocate "done states", "current states" and "next |
411 | 577k | * states" arrays. */ |
412 | 577k | num_states = nfa->num_states; |
413 | 577k | if (tc->nfa_alloc_states < num_states) { |
414 | 260 | size_t alloc = (num_states + 1) * sizeof(MVMint64); |
415 | 260 | tc->nfa_done = (MVMint64 *)MVM_realloc(tc->nfa_done, alloc); |
416 | 260 | tc->nfa_curst = (MVMint64 *)MVM_realloc(tc->nfa_curst, alloc); |
417 | 260 | tc->nfa_nextst = (MVMint64 *)MVM_realloc(tc->nfa_nextst, alloc); |
418 | 260 | tc->nfa_alloc_states = num_states; |
419 | 260 | } |
420 | 577k | done = tc->nfa_done; |
421 | 577k | curst = tc->nfa_curst; |
422 | 577k | nextst = tc->nfa_nextst; |
423 | 577k | memset(done, 0, (num_states + 1) * sizeof(MVMint64)); |
424 | 577k | |
425 | 577k | /* Allocate fates array. */ |
426 | 577k | fate_arr_len = 1 + MVM_repr_elems(tc, nfa->fates); |
427 | 577k | if (tc->nfa_fates_len < fate_arr_len) { |
428 | 531 | tc->nfa_fates = (MVMint64 *)MVM_realloc(tc->nfa_fates, sizeof(MVMint64) * fate_arr_len); |
429 | 531 | tc->nfa_fates_len = fate_arr_len; |
430 | 531 | } |
431 | 577k | fates = tc->nfa_fates; |
432 | 577k | total_fates = 0; |
433 | 577k | if (nfadeb) fprintf(stderr,"======================================\nStarting with %d fates in %d states\n", (int)fate_arr_len, (int)num_states) ; |
434 | 577k | |
435 | 577k | /* longlit will be updated on a fate whenever NFA passes through final char of a literal. */ |
436 | 577k | /* These edges are specially marked to indicate which fate they influence the fate of. */ |
437 | 577k | if (tc->nfa_longlit_len < fate_arr_len) { |
438 | 531 | tc->nfa_longlit = (MVMint64 *)MVM_realloc(tc->nfa_longlit, sizeof(MVMint64) * fate_arr_len); |
439 | 531 | tc->nfa_longlit_len = fate_arr_len; |
440 | 531 | } |
441 | 577k | longlit = tc->nfa_longlit; |
442 | 577k | usedlonglit = 0; |
443 | 577k | |
444 | 577k | nextst[numnext++] = 1; |
445 | 1.70M | while (numnext && offset <= eos) { |
446 | 1.12M | /* Swap next and current */ |
447 | 1.12M | MVMint64 *temp = curst; |
448 | 1.12M | curst = nextst; |
449 | 1.12M | nextst = temp; |
450 | 1.12M | numcur = numnext; |
451 | 1.12M | numnext = 0; |
452 | 1.12M | |
453 | 1.12M | /* Save how many fates we have before this position is considered. */ |
454 | 1.12M | prev_fates = total_fates; |
455 | 1.12M | |
456 | 1.12M | if (nfadeb) { |
457 | 0 | if (offset < eos) { |
458 | 0 | MVMGrapheme32 cp = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
459 | 0 | fprintf(stderr,"%c with %ds target %lx offset %"PRId64"\n",cp,(int)numcur, (long)target, offset); |
460 | 0 | } |
461 | 0 | else { |
462 | 0 | fprintf(stderr,"EOS with %ds\n",(int)numcur); |
463 | 0 | } |
464 | 0 | } |
465 | 4.41M | while (numcur) { |
466 | 3.29M | MVMNFAStateInfo *edge_info; |
467 | 3.29M | MVMint64 edge_info_elems; |
468 | 3.29M | |
469 | 3.29M | MVMint64 st = curst[--numcur]; |
470 | 3.29M | if (st <= num_states) { |
471 | 3.29M | if (done[st] == gen) |
472 | 5.63k | continue; |
473 | 3.28M | done[st] = gen; |
474 | 3.28M | } |
475 | 3.29M | |
476 | 3.28M | edge_info = nfa->states[st - 1]; |
477 | 3.28M | edge_info_elems = nfa->num_state_edges[st - 1]; |
478 | 3.28M | if (nfadeb) |
479 | 0 | fprintf(stderr,"\t%d\t%d\t",(int)st, (int)edge_info_elems); |
480 | 14.5M | for (i = 0; i < edge_info_elems; i++) { |
481 | 11.3M | MVMint64 act = edge_info[i].act; |
482 | 11.3M | MVMint64 to = edge_info[i].to; |
483 | 11.3M | |
484 | 11.3M | /* All the special cases are under one test. */ |
485 | 11.3M | if (act <= MVM_NFA_EDGE_EPSILON) { |
486 | 3.58M | if (act < 0) { |
487 | 1.15M | /* Negative indicates a fate is encoded in the act of the codepoint edge. */ |
488 | 1.15M | /* These will redispatch to one of the _LL cases below */ |
489 | 1.15M | act &= 0xff; |
490 | 1.15M | } |
491 | 2.42M | else if (act == MVM_NFA_EDGE_FATE) { |
492 | 653k | /* Crossed a fate edge. Check if we already saw this fate, and |
493 | 653k | * if so remove the entry so we can re-add at the new token length. */ |
494 | 653k | MVMint64 arg = edge_info[i].arg.i; |
495 | 653k | MVMint64 j; |
496 | 653k | MVMint64 found_fate = 0; |
497 | 653k | if (nfadeb) |
498 | 0 | fprintf(stderr, "fate(%016llx) ", (long long unsigned int)arg); |
499 | 1.18M | for (j = 0; j < total_fates; j++) { |
500 | 530k | if (found_fate) |
501 | 56.8k | fates[j - found_fate] = fates[j]; |
502 | 530k | if ((fates[j] & 0xffffff) == arg) { |
503 | 358k | found_fate++; |
504 | 358k | if (j < prev_fates) |
505 | 354k | prev_fates--; |
506 | 358k | } |
507 | 530k | } |
508 | 653k | total_fates -= found_fate; |
509 | 653k | if (arg < usedlonglit) |
510 | 117k | arg -= longlit[arg] << 24; |
511 | 653k | if (++total_fates > fate_arr_len) { |
512 | 0 | /* should never happen if nfa->fates is correct and dedup above works right */ |
513 | 0 | fprintf(stderr, "oops adding %016llx to\n", (long long unsigned int)arg); |
514 | 0 | for (j = 0; j < total_fates - 1; j++) { |
515 | 0 | fprintf(stderr, " %016llx\n", (long long unsigned int)fates[j]); |
516 | 0 | } |
517 | 0 | fate_arr_len = total_fates + 10; |
518 | 0 | tc->nfa_fates = (MVMint64 *)MVM_realloc(tc->nfa_fates, |
519 | 0 | sizeof(MVMint64) * fate_arr_len); |
520 | 0 | tc->nfa_fates_len = fate_arr_len; |
521 | 0 | fates = tc->nfa_fates; |
522 | 0 | } |
523 | 653k | /* a small insertion sort */ |
524 | 653k | j = total_fates - 1; |
525 | 695k | while (--j >= prev_fates && fates[j] < arg) { |
526 | 42.4k | fates[j + 1] = fates[j]; |
527 | 42.4k | } |
528 | 653k | fates[++j] = arg; |
529 | 653k | continue; |
530 | 653k | } |
531 | 1.77M | else if (act == MVM_NFA_EDGE_EPSILON && to <= num_states && done[to] != gen) { |
532 | 1.76M | if (to) |
533 | 1.76M | curst[numcur++] = to; |
534 | 0 | else if (nfadeb) /* XXX should turn into a "can't happen" after rebootstrap */ |
535 | 0 | fprintf(stderr, " oops, ignoring epsilon to 0\n"); |
536 | 1.76M | continue; |
537 | 1.76M | } |
538 | 3.58M | } |
539 | 11.3M | |
540 | 8.89M | if (offset >= eos) { |
541 | 23.2k | /* Can't match, so drop state. */ |
542 | 23.2k | continue; |
543 | 23.2k | } |
544 | 8.86M | else { |
545 | 8.86M | switch (act) { |
546 | 1.15M | case MVM_NFA_EDGE_CODEPOINT_LL: { |
547 | 1.15M | MVMGrapheme32 arg = edge_info[i].arg.g; |
548 | 1.15M | if (MVM_string_get_grapheme_at_nocheck(tc, target, offset) == arg) { |
549 | 55.3k | MVMint64 fate = (edge_info[i].act >> 8) & 0xfffff; |
550 | 55.3k | nextst[numnext++] = to; |
551 | 314k | while (usedlonglit <= fate) |
552 | 258k | longlit[usedlonglit++] = 0; |
553 | 55.3k | longlit[fate] = offset - orig_offset + 1; |
554 | 55.3k | if (nfadeb) |
555 | 0 | fprintf(stderr, "%d->%d ", (int)i, (int)to); |
556 | 55.3k | } |
557 | 1.15M | continue; |
558 | 1.15M | } |
559 | 5.04M | case MVM_NFA_EDGE_CODEPOINT: { |
560 | 5.04M | MVMGrapheme32 arg = edge_info[i].arg.g; |
561 | 5.04M | if (MVM_string_get_grapheme_at_nocheck(tc, target, offset) == arg) { |
562 | 139k | nextst[numnext++] = to; |
563 | 139k | if (nfadeb) |
564 | 0 | fprintf(stderr, "%d->%d ", (int)i, (int)to); |
565 | 139k | } |
566 | 5.04M | continue; |
567 | 1.15M | } |
568 | 0 | case MVM_NFA_EDGE_CODEPOINT_NEG: { |
569 | 0 | MVMGrapheme32 arg = edge_info[i].arg.g; |
570 | 0 | if (MVM_string_get_grapheme_at_nocheck(tc, target, offset) != arg) |
571 | 0 | nextst[numnext++] = to; |
572 | 0 | continue; |
573 | 1.15M | } |
574 | 864k | case MVM_NFA_EDGE_CHARCLASS: { |
575 | 864k | MVMint64 arg = edge_info[i].arg.i; |
576 | 864k | if (MVM_string_is_cclass(tc, arg, target, offset)) |
577 | 435k | nextst[numnext++] = to; |
578 | 864k | continue; |
579 | 1.15M | } |
580 | 89.3k | case MVM_NFA_EDGE_CHARCLASS_NEG: { |
581 | 89.3k | MVMint64 arg = edge_info[i].arg.i; |
582 | 89.3k | if (!MVM_string_is_cclass(tc, arg, target, offset)) |
583 | 85.9k | nextst[numnext++] = to; |
584 | 89.3k | continue; |
585 | 1.15M | } |
586 | 1.70M | case MVM_NFA_EDGE_CHARLIST: { |
587 | 1.70M | MVMString *arg = edge_info[i].arg.s; |
588 | 1.70M | MVMGrapheme32 cp = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
589 | 1.70M | if (MVM_string_index_of_grapheme(tc, arg, cp) >= 0) |
590 | 227k | nextst[numnext++] = to; |
591 | 1.70M | continue; |
592 | 1.15M | } |
593 | 2.60k | case MVM_NFA_EDGE_CHARLIST_NEG: { |
594 | 2.60k | MVMString *arg = edge_info[i].arg.s; |
595 | 2.60k | MVMGrapheme32 cp = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
596 | 2.60k | if (MVM_string_index_of_grapheme(tc, arg, cp) < 0) |
597 | 2.06k | nextst[numnext++] = to; |
598 | 2.60k | continue; |
599 | 1.15M | } |
600 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_LL: { |
601 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
602 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
603 | 0 | MVMGrapheme32 ord = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
604 | 0 | if (ord == lc_arg || ord == uc_arg) { |
605 | 0 | MVMint64 fate = (edge_info[i].act >> 8) & 0xfffff; |
606 | 0 | nextst[numnext++] = to; |
607 | 0 | while (usedlonglit <= fate) |
608 | 0 | longlit[usedlonglit++] = 0; |
609 | 0 | longlit[fate] = offset - orig_offset + 1; |
610 | 0 | } |
611 | 0 | continue; |
612 | 1.15M | } |
613 | 0 | case MVM_NFA_EDGE_CODEPOINT_I: { |
614 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
615 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
616 | 0 | MVMGrapheme32 ord = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
617 | 0 | if (ord == lc_arg || ord == uc_arg) |
618 | 0 | nextst[numnext++] = to; |
619 | 0 | continue; |
620 | 1.15M | } |
621 | 0 | case MVM_NFA_EDGE_CODEPOINT_I_NEG: { |
622 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
623 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
624 | 0 | MVMGrapheme32 ord = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
625 | 0 | if (ord != lc_arg && ord != uc_arg) |
626 | 0 | nextst[numnext++] = to; |
627 | 0 | continue; |
628 | 1.15M | } |
629 | 6.63k | case MVM_NFA_EDGE_CHARRANGE: { |
630 | 6.63k | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
631 | 6.63k | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
632 | 6.63k | MVMGrapheme32 ord = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
633 | 6.63k | if (ord >= lc_arg && ord <= uc_arg) |
634 | 977 | nextst[numnext++] = to; |
635 | 6.63k | continue; |
636 | 1.15M | } |
637 | 0 | case MVM_NFA_EDGE_CHARRANGE_NEG: { |
638 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
639 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
640 | 0 | MVMGrapheme32 ord = MVM_string_get_grapheme_at_nocheck(tc, target, offset); |
641 | 0 | if (ord < lc_arg || ord > uc_arg) |
642 | 0 | nextst[numnext++] = to; |
643 | 0 | continue; |
644 | 1.15M | } |
645 | 0 | case MVM_NFA_EDGE_SUBRULE: |
646 | 0 | if (nfadeb) |
647 | 0 | fprintf(stderr, "IGNORING SUBRULE\n"); |
648 | 0 | continue; |
649 | 0 | case MVM_NFA_EDGE_CODEPOINT_M: |
650 | 0 | case MVM_NFA_EDGE_CODEPOINT_M_NEG: { |
651 | 0 | MVMNormalizer norm; |
652 | 0 | MVMint32 ready; |
653 | 0 | MVMGrapheme32 ga = edge_info[i].arg.g; |
654 | 0 | MVMGrapheme32 gb = MVM_string_ord_basechar_at(tc, target, offset); |
655 | 0 |
|
656 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
657 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, ga, &ga); |
658 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
659 | 0 | if (!ready) |
660 | 0 | ga = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
661 | 0 |
|
662 | 0 | if (((act == MVM_NFA_EDGE_CODEPOINT_M) && (ga == gb)) |
663 | 0 | || ((act == MVM_NFA_EDGE_CODEPOINT_M_NEG) && (ga != gb))) |
664 | 0 | nextst[numnext++] = to; |
665 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
666 | 0 | continue; |
667 | 0 | } |
668 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM: |
669 | 0 | case MVM_NFA_EDGE_CODEPOINT_IM_NEG: { |
670 | 0 | MVMNormalizer norm; |
671 | 0 | MVMint32 ready; |
672 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
673 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
674 | 0 | MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
675 | 0 |
|
676 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
677 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, uc_arg, &uc_arg); |
678 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
679 | 0 | if (!ready) |
680 | 0 | uc_arg = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
681 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
682 | 0 |
|
683 | 0 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD); |
684 | 0 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, lc_arg, &lc_arg); |
685 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
686 | 0 | if (!ready) |
687 | 0 | lc_arg = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
688 | 0 |
|
689 | 0 | if (((act == MVM_NFA_EDGE_CODEPOINT_IM) && (ord == lc_arg || ord == uc_arg)) |
690 | 0 | || ((act == MVM_NFA_EDGE_CODEPOINT_IM_NEG) && (ord != lc_arg && ord != uc_arg))) |
691 | 0 | nextst[numnext++] = to; |
692 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
693 | 0 | continue; |
694 | 0 | } |
695 | 0 | case MVM_NFA_EDGE_CHARRANGE_M: { |
696 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
697 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
698 | 0 | MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
699 | 0 | if (ord >= lc_arg && ord <= uc_arg) |
700 | 0 | nextst[numnext++] = to; |
701 | 0 | continue; |
702 | 0 | } |
703 | 0 | case MVM_NFA_EDGE_CHARRANGE_M_NEG: { |
704 | 0 | MVMGrapheme32 uc_arg = edge_info[i].arg.uclc.uc; |
705 | 0 | MVMGrapheme32 lc_arg = edge_info[i].arg.uclc.lc; |
706 | 0 | MVMGrapheme32 ord = MVM_string_ord_basechar_at(tc, target, offset); |
707 | 0 | if (ord < lc_arg || ord > uc_arg) |
708 | 0 | nextst[numnext++] = to; |
709 | 0 | continue; |
710 | 0 | } |
711 | 8.86M | } |
712 | 8.86M | } |
713 | 8.89M | } |
714 | 3.28M | if (nfadeb) fprintf(stderr,"\n"); |
715 | 3.28M | } |
716 | 1.12M | |
717 | 1.12M | /* Move to next character and generation. */ |
718 | 1.12M | offset++; |
719 | 1.12M | gen++; |
720 | 1.12M | } |
721 | 577k | /* strip any literal lengths, leaving only fates */ |
722 | 577k | if (usedlonglit || nfadeb) { |
723 | 29.2k | if (nfadeb) fprintf(stderr,"Final\n"); |
724 | 69.6k | for (i = 0; i < total_fates; i++) { |
725 | 40.4k | if (nfadeb) fprintf(stderr, " %08llx\n", (long long unsigned int)fates[i]); |
726 | 40.4k | fates[i] &= 0xffffff; |
727 | 40.4k | } |
728 | 29.2k | } |
729 | 577k | |
730 | 577k | *total_fates_out = total_fates; |
731 | 577k | return fates; |
732 | 577k | } |
733 | | |
734 | | /* Takes an NFA, a target string in and an offset. Runs the NFA and returns |
735 | | * the order to try the fates in. */ |
736 | 255k | MVMObject * MVM_nfa_run_proto(MVMThreadContext *tc, MVMObject *nfa, MVMString *target, MVMint64 offset) { |
737 | 255k | /* Run the NFA. */ |
738 | 255k | MVMint64 total_fates, i; |
739 | 255k | MVMint64 *fates = nqp_nfa_run(tc, (MVMNFABody *)OBJECT_BODY(nfa), target, offset, &total_fates); |
740 | 255k | |
741 | 255k | /* Copy results into an integer array. */ |
742 | 255k | MVMObject *fateres = MVM_repr_alloc_init(tc, tc->instance->boot_types.BOOTIntArray); |
743 | 337k | for (i = 0; i < total_fates; i++) |
744 | 81.7k | MVM_repr_bind_pos_i(tc, fateres, i, fates[i]); |
745 | 255k | |
746 | 255k | return fateres; |
747 | 255k | } |
748 | | |
749 | | /* Takes an NFA, target string and offset. Runs the NFA, and uses the output |
750 | | * to update the bstack with backtracking points to try the alternation |
751 | | * branches in the correct order. The current capture stack is needed for its |
752 | | * height. */ |
753 | | void MVM_nfa_run_alt(MVMThreadContext *tc, MVMObject *nfa, MVMString *target, |
754 | 321k | MVMint64 offset, MVMObject *bstack, MVMObject *cstack, MVMObject *labels) { |
755 | 321k | /* Run the NFA. */ |
756 | 321k | MVMint64 total_fates, i; |
757 | 321k | MVMint64 *fates = nqp_nfa_run(tc, (MVMNFABody *)OBJECT_BODY(nfa), target, offset, &total_fates); |
758 | 321k | |
759 | 321k | /* Push the results onto the bstack. */ |
760 | 321k | MVMint64 caps = cstack && IS_CONCRETE(cstack) |
761 | 13.3k | ? MVM_repr_elems(tc, cstack) |
762 | 308k | : 0; |
763 | 533k | for (i = 0; i < total_fates; i++) { |
764 | 212k | MVM_repr_push_i(tc, bstack, MVM_repr_at_pos_i(tc, labels, fates[i])); |
765 | 212k | MVM_repr_push_i(tc, bstack, offset); |
766 | 212k | MVM_repr_push_i(tc, bstack, 0); |
767 | 212k | MVM_repr_push_i(tc, bstack, caps); |
768 | 212k | } |
769 | 321k | } |