/home/travis/build/MoarVM/MoarVM/src/strings/parse_num.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | #include <math.h> |
3 | | |
/* Sentinel stored in the current code point slot once the end of the string
 * has been reached. A space is used because it can be harmlessly added to
 * the end of a number. */

#define END_OF_NUM ' '
8 | 11.8k | static int is_whitespace(MVMThreadContext *tc, MVMCodepoint cp) { |
9 | 11.8k | if (cp <= '~') { |
10 | 11.8k | if (cp == ' ' || (cp <= 13 && cp >= 9)) |
11 | 5.92k | return 1; |
12 | 11.8k | else |
13 | 5.90k | return 0; |
14 | 11.8k | } |
15 | 13 | return MVM_unicode_codepoint_has_property_value(tc, cp, MVM_UNICODE_PROPERTY_WHITE_SPACE, 1); |
16 | 11.8k | } |
17 | | |
18 | 56.6k | static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) { |
19 | 56.6k | if (cp >= '0' && cp <= '9') return cp - '0'; /* fast-path for ASCII 0..9 */ |
20 | 12.9k | else if (cp >= 'a' && cp <= 'z') return cp - 'a' + 10; |
21 | 9.33k | else if (cp >= 'A' && cp <= 'Z') return cp - 'A' + 10; |
22 | 9.33k | else if (cp >= 0xFF21 && cp <= 0xFF3A) return cp - 0xFF21 + 10; /* uppercase fullwidth */ |
23 | 9.33k | else if (cp >= 0xFF41 && cp <= 0xFF5A) return cp - 0xFF41 + 10; /* lowercase fullwidth */ |
24 | 9.33k | else if (cp > 0 && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE) |
25 | 9.33k | == MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL) { |
26 | 44 | /* as of Unicode 9.0.0, characters with the 'de' Numeric Type (and are |
27 | 44 | * thus also of General Category Nd, since 4.0.0) are contiguous |
28 | 44 | * sequences of 10 chars whose Numeric Values ascend from 0 through 9. |
29 | 44 | */ |
30 | 44 | |
31 | 44 | /* the string returned for NUMERIC_VALUE_NUMERATOR contains an integer |
32 | 44 | * value. We can use numerator because they all are from 0-9 and have |
33 | 44 | * denominator of 1 */ |
34 | 44 | return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR)); |
35 | 44 | } |
36 | 9.29k | return -1; |
37 | 56.6k | } |
38 | | |
39 | 68.2k | int static get_cp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { |
40 | 68.2k | if (!MVM_string_ci_has_more(tc, ci)) { |
41 | 11.8k | *cp = END_OF_NUM; // FIXME pick a safe value |
42 | 11.8k | return 1; |
43 | 11.8k | } |
44 | 56.4k | else { |
45 | 56.4k | *cp = MVM_string_ci_get_codepoint(tc, ci); |
46 | 56.4k | return 0; |
47 | 56.4k | } |
48 | 68.2k | } |
49 | | |
50 | 0 | static void parse_error(MVMThreadContext *tc, MVMString *s, const char* reason) { |
51 | 0 | char* got = MVM_string_utf8_c8_encode_C_string(tc, s); |
52 | 0 | char *waste[] = { got, NULL }; |
53 | 0 | MVM_exception_throw_adhoc_free(tc, waste, "Can't convert '%s' to num: %s", got, reason); |
54 | 0 | } |
55 | | |
56 | 11.8k | static void skip_whitespace(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { |
57 | 11.8k | while (is_whitespace(tc, *cp)) { |
58 | 5.92k | if (get_cp(tc, ci, cp)) return; |
59 | 5.92k | } |
60 | 11.8k | } |
61 | | |
62 | 9.47k | static int parse_sign(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { |
63 | 9.47k | // Handle any leading +/-/− sign |
64 | 5.79k | int has_minus = (*cp == '-' || *cp == 8722); // '-', '−' |
65 | 9.47k | |
66 | 9.47k | if (has_minus || *cp == '+') { // '-', '−', '+' |
67 | 4.46k | get_cp(tc, ci, cp); |
68 | 4.46k | } |
69 | 9.47k | |
70 | 5.79k | return (has_minus ? -1 : 1); |
71 | 9.47k | } |
72 | | |
73 | 0 | static double parse_decimal_integer(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s) { |
74 | 0 | int ends_with_underscore = 0; |
75 | 0 | double value = 0; |
76 | 0 | int digit; |
77 | 0 | if (*cp == '_') parse_error(tc, s, "number can't be start with _"); |
78 | 0 | while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { |
79 | 0 | ends_with_underscore = *cp == '_'; |
80 | 0 | if (*cp != '_') { |
81 | 0 | if (digit >= 10) parse_error(tc, s, "expecting comma seprated decimal numbers after :$radix[]"); |
82 | 0 | value = value * 10 + digit; |
83 | 0 | } |
84 | 0 | get_cp(tc, ci, cp); |
85 | 0 | } |
86 | 0 | if (ends_with_underscore) parse_error(tc, s, "a number can't end in underscore"); |
87 | 0 | return value; |
88 | 0 | } |
89 | | |
90 | 5.91k | static double parse_int_frac_exp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s, double radix, int leading_zero) { |
91 | 5.91k | /* |
92 | 5.91k | * What we do here is extract the digits from the original string, |
93 | 5.91k | * effectively stripping off underscores and converting fancy Unicode |
94 | 5.91k | * digits to regular ones. We then ASCII-fy those digits and stuff |
95 | 5.91k | * them into digits_buf (along with double-ish things like the dot |
96 | 5.91k | * and 'e'). At the end we give the resultant string to strtod() to |
97 | 5.91k | * do all the dirty work for us, so we don't have to worry about |
98 | 5.91k | * handling denormals or picking closest representable double |
99 | 5.91k | */ |
100 | 5.91k | int digits = 0; |
101 | 5.91k | int frac_digits = 0; |
102 | 5.91k | int digit; |
103 | 5.91k | int ends_with_underscore = 0; |
104 | 5.91k | char *digits_buf = (char *)MVM_malloc(1 + MVM_string_graphs(tc, s)); |
105 | 5.91k | char *digits_buf_tail = digits_buf; |
106 | 5.91k | double result; |
107 | 5.91k | |
108 | 5.91k | if (*cp == '_') |
109 | 0 | parse_error(tc, s, "number can't start with _"); |
110 | 5.91k | |
111 | 5.91k | if (*cp != '.') { |
112 | 19.1k | while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { |
113 | 13.8k | ends_with_underscore = *cp == '_'; |
114 | 13.8k | if (*cp != '_') { |
115 | 13.8k | if (digit >= radix) break; |
116 | 13.6k | *digits_buf_tail++ = '0' + digit; |
117 | 13.6k | digits++; |
118 | 13.6k | } |
119 | 13.6k | get_cp(tc, ci, cp); |
120 | 13.6k | } |
121 | 5.45k | if (ends_with_underscore) |
122 | 0 | parse_error(tc, s, "a number can't end in underscore"); |
123 | 5.45k | } |
124 | 5.91k | |
125 | 5.91k | |
126 | 5.91k | if (*cp == '.') { |
127 | 3.84k | *digits_buf_tail++ = '.'; |
128 | 3.84k | get_cp(tc, ci, cp); |
129 | 3.84k | if (*cp == '_') |
130 | 0 | parse_error(tc, s, "radix point can't be followed by _"); |
131 | 27.0k | while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { |
132 | 26.5k | ends_with_underscore = *cp == '_'; |
133 | 26.5k | if (*cp != '_') { |
134 | 26.5k | if (digit >= radix) break; |
135 | 23.1k | *digits_buf_tail++ = '0' + digit; |
136 | 23.1k | frac_digits++; |
137 | 23.1k | } |
138 | 23.1k | get_cp(tc, ci, cp); |
139 | 23.1k | } |
140 | 3.84k | if (frac_digits == 0) |
141 | 0 | parse_error(tc, s, |
142 | 0 | "radix point must be followed by one or more valid digits"); |
143 | 3.84k | if (ends_with_underscore) |
144 | 0 | parse_error(tc, s, "a number can't end in underscore"); |
145 | 3.84k | } |
146 | 5.91k | |
147 | 5.91k | if (digits == 0 && frac_digits == 0 && !leading_zero) |
148 | 0 | parse_error(tc, s, "expecting a number"); |
149 | 5.91k | |
150 | 5.91k | if (*cp == 'E' || *cp == 'e') { |
151 | 3.56k | int e_digits = 0; |
152 | 3.56k | |
153 | 3.56k | *digits_buf_tail++ = 'e'; |
154 | 3.56k | get_cp(tc, ci, cp); |
155 | 3.56k | |
156 | 3.56k | if (parse_sign(tc, ci, cp) == -1) |
157 | 1.86k | *digits_buf_tail++ = '-'; |
158 | 3.56k | if (*cp == '_') |
159 | 0 | parse_error(tc, s, "'e' or 'E' can't be followed by _"); |
160 | 10.5k | while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { |
161 | 6.95k | if (*cp != '_') { |
162 | 6.95k | if (digit >= radix) break; |
163 | 6.95k | *digits_buf_tail++ = '0' + digit; |
164 | 6.95k | e_digits++; |
165 | 6.95k | } |
166 | 6.95k | get_cp(tc, ci, cp); |
167 | 6.95k | } |
168 | 3.56k | if (e_digits == 0) |
169 | 0 | parse_error(tc, s, |
170 | 0 | "'e' or 'E' must be followed by one or more valid digits"); |
171 | 3.56k | } |
172 | 5.91k | |
173 | 5.91k | *digits_buf_tail = '\0'; |
174 | 5.91k | result = strtod(digits_buf, NULL); |
175 | 5.91k | MVM_free(digits_buf); |
176 | 5.91k | return result; |
177 | 5.91k | } |
178 | | |
179 | 11.8k | static int match_word(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, char word[3], MVMString *s) { |
180 | 11.8k | if (*cp == word[0]) { |
181 | 4 | get_cp(tc, ci, cp); |
182 | 4 | if (*cp == word[1]) { |
183 | 4 | get_cp(tc, ci, cp); |
184 | 4 | if (*cp == word[2]) { |
185 | 4 | get_cp(tc, ci, cp); |
186 | 4 | return 1; |
187 | 4 | } |
188 | 0 | else { |
189 | 0 | parse_error(tc, s, "that's not a number"); |
190 | 0 | } |
191 | 4 | } |
192 | 0 | else { |
193 | 0 | parse_error(tc, s, "that's not a number"); |
194 | 0 | } |
195 | 4 | } |
196 | 11.8k | return 0; |
197 | 11.8k | } |
198 | | |
199 | | |
200 | 5.91k | static double parse_simple_number(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) { |
201 | 5.91k | double sign; |
202 | 5.91k | // Handle NaN here, to make later parsing simpler |
203 | 5.91k | |
204 | 5.91k | if (match_word(tc, ci, cp, "NaN", s)) { |
205 | 1 | return MVM_num_nan(tc); |
206 | 1 | } |
207 | 5.91k | |
208 | 5.91k | sign = parse_sign(tc, ci, cp); |
209 | 5.91k | |
210 | 5.91k | if (match_word(tc, ci, cp, "Inf", s)) { |
211 | 3 | return sign * MVM_num_posinf(tc); |
212 | 3 | } |
213 | 5.91k | else if (*cp == ':') { |
214 | 0 | int radix; |
215 | 0 | double body; |
216 | 0 | get_cp(tc, ci, cp); |
217 | 0 | radix = (int) parse_int_frac_exp(tc, ci, cp, s, 10, 0); |
218 | 0 | if (*cp == '<') { |
219 | 0 | get_cp(tc, ci, cp); |
220 | 0 | body = parse_int_frac_exp(tc, ci, cp, s, radix, 0); |
221 | 0 | if (*cp == '>') { |
222 | 0 | get_cp(tc, ci, cp); |
223 | 0 | return sign * body; |
224 | 0 | } |
225 | 0 | else { |
226 | 0 | parse_error(tc, s, "malformed ':radix<>' style radix number, expecting '>' after the body"); |
227 | 0 | } |
228 | 0 | } |
229 | 0 | else if (*cp == 171) { // « |
230 | 0 | get_cp(tc, ci, cp); |
231 | 0 | body = parse_int_frac_exp(tc, ci, cp, s, radix, 0); |
232 | 0 | if (*cp == 187) { // » |
233 | 0 | get_cp(tc, ci, cp); |
234 | 0 | return sign * body; |
235 | 0 | } |
236 | 0 | else { |
237 | 0 | parse_error(tc, s, "malformed ':radix«»' style radix number, expecting '>' after the body"); |
238 | 0 | } |
239 | 0 | } |
240 | 0 | else if (*cp == '[') { // « |
241 | 0 | double result = 0; |
242 | 0 | get_cp(tc, ci, cp); |
243 | 0 | while (*cp != ']' && MVM_string_ci_has_more(tc, ci)) { |
244 | 0 | double digit = parse_decimal_integer(tc, ci, cp, s); |
245 | 0 | result = result * radix + digit; |
246 | 0 | if (*cp == ',') { |
247 | 0 | get_cp(tc, ci, cp); |
248 | 0 | } |
249 | 0 | } |
250 | 0 | if (*cp == ']') { // » |
251 | 0 | get_cp(tc, ci, cp); |
252 | 0 | return sign * result; |
253 | 0 | } |
254 | 0 | else { |
255 | 0 | parse_error(tc, s, "malformed ':radix[]' style radix number, expecting ']' after the body"); |
256 | 0 | } |
257 | 0 | } |
258 | 0 | } |
259 | 5.91k | else if (*cp == '0') { |
260 | 698 | int radix = 0; |
261 | 698 | |
262 | 698 | get_cp(tc, ci, cp); |
263 | 698 | if (*cp == 'b') radix = 2; |
264 | 698 | else if (*cp == 'o') radix = 8; |
265 | 698 | else if (*cp == 'd') radix = 10; |
266 | 698 | else if (*cp == 'x') radix = 16; |
267 | 698 | |
268 | 698 | if (radix) { |
269 | 0 | get_cp(tc, ci, cp); |
270 | 0 | if (*cp == '_') get_cp(tc, ci, cp); |
271 | 0 | return sign * parse_int_frac_exp(tc, ci, cp, s, radix, 1); |
272 | 698 | } else { |
273 | 698 | return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 1); |
274 | 698 | } |
275 | 698 | } |
276 | 5.21k | else { |
277 | 5.21k | return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 0); |
278 | 5.21k | } |
279 | 5.91k | } |
280 | | |
281 | 5.91k | static double parse_real(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) { |
282 | 5.91k | double result = parse_simple_number(tc, ci, cp, s); |
283 | 5.91k | double denom; |
284 | 5.91k | |
285 | 5.91k | // Check for '/' indicating Rat denominator |
286 | 5.91k | if (*cp == '/') { |
287 | 0 | get_cp(tc, ci, cp); |
288 | 0 | denom = parse_simple_number(tc, ci, cp, s); |
289 | 0 | result = result / denom; |
290 | 0 | } |
291 | 5.91k | return result; |
292 | 5.91k | } |
293 | | |
294 | 5.91k | MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) { |
295 | 5.91k | MVMCodepointIter ci; |
296 | 5.91k | MVMCodepoint cp; |
297 | 5.91k | MVMnum64 n = 123; |
298 | 5.91k | MVM_string_ci_init(tc, &ci, s, 0, 0); |
299 | 5.91k | |
300 | 5.91k | if (get_cp(tc, &ci, &cp)) return 0; |
301 | 5.91k | |
302 | 5.91k | skip_whitespace(tc, &ci, &cp); |
303 | 5.91k | |
304 | 5.91k | // Do we have only whitespace |
305 | 5.91k | if (!MVM_string_ci_has_more(tc, &ci) && cp == END_OF_NUM) { |
306 | 0 | return 0; |
307 | 0 | } |
308 | 5.91k | |
309 | 5.91k | n = parse_real(tc, &ci, &cp, s); |
310 | 5.91k | |
311 | 5.91k | skip_whitespace(tc, &ci, &cp); |
312 | 5.91k | |
313 | 5.91k | if (MVM_string_ci_has_more(tc, &ci) || cp != END_OF_NUM) { |
314 | 0 | parse_error(tc, s, "trailing characters"); |
315 | 0 | } |
316 | 5.91k | |
317 | 5.91k | return n; |
318 | 5.91k | } |