Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/parse_num.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
#include <math.h>
3
4
/* We put a ' ' into the current code point buffer when we reach the end of the string,
5
 *  as it's something that can be harmlessly added to the end of a number */
6
7
18.9k
/* Sentinel code point that get_cp() stores once the iterator has no more input. */
#define END_OF_NUM ' '
8
11.8k
/* Decide whether `cp` is whitespace for the purposes of number parsing.
 *
 * Code points up to '~' are answered locally: only ' ' and the range 9..13
 * count. Everything above '~' is answered by querying the Unicode
 * White_Space property.
 *
 * Returns non-zero for whitespace, 0 otherwise. */
static int is_whitespace(MVMThreadContext *tc, MVMCodepoint cp) {
    if (cp > '~')
        return MVM_unicode_codepoint_has_property_value(tc, cp, MVM_UNICODE_PROPERTY_WHITE_SPACE, 1);

    return (cp == ' ' || (9 <= cp && cp <= 13)) ? 1 : 0;
}
17
18
56.6k
/* Map a code point to its digit value, or -1 if it is not a digit.
 *
 * '0'..'9' yield 0..9. ASCII letters and fullwidth Latin letters (either
 * case) yield 10..35. Any other positive code point whose Unicode Numeric
 * Type is Decimal yields the integer held in its numeric value numerator. */
static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) {
    /* Fast paths: ASCII digits and letters. */
    if ('0' <= cp && cp <= '9')
        return cp - '0';
    if ('a' <= cp && cp <= 'z')
        return 10 + (cp - 'a');
    if ('A' <= cp && cp <= 'Z')
        return 10 + (cp - 'A');

    /* Fullwidth uppercase, then fullwidth lowercase. */
    if (0xFF21 <= cp && cp <= 0xFF3A)
        return 10 + (cp - 0xFF21);
    if (0xFF41 <= cp && cp <= 0xFF5A)
        return 10 + (cp - 0xFF41);

    /* Only positive code points are looked up in the Unicode database. */
    if (cp <= 0)
        return -1;
    if (MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE)
            != MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL)
        return -1;

    /* As of Unicode 9.0.0, characters with Numeric Type 'de' (which are also
     * General Category Nd since 4.0.0) come in contiguous runs of ten whose
     * values ascend 0..9. Their denominator is always 1, so the numerator
     * string alone gives the digit value. */
    return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR));
}
38
39
68.2k
/* Fetch the next code point from `ci` into `*cp`.
 *
 * Returns 0 when a code point was read. When the iterator has nothing left,
 * stores END_OF_NUM in `*cp` and returns 1.
 *
 * The storage-class specifier is written first (`static int`); placing it
 * after the type specifier is an obsolescent form in ISO C. */
static int get_cp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) {
    if (MVM_string_ci_has_more(tc, ci)) {
        *cp = MVM_string_ci_get_codepoint(tc, ci);
        return 0;
    }

    *cp = END_OF_NUM; /* FIXME pick a safe value */
    return 1;
}
49
50
0
/* Report a numeric parse failure for string `s`.
 *
 * Encodes `s` as a C string and passes it, together with `reason`, to
 * MVM_exception_throw_adhoc_free(). The encoded buffer is also handed over in
 * a NULL-terminated pointer list (presumably so the throw routine releases
 * it - confirm against its declaration). */
static void parse_error(MVMThreadContext *tc, MVMString *s, const char* reason) {
    char *encoded   = MVM_string_utf8_c8_encode_C_string(tc, s);
    char *to_free[] = { encoded, NULL };

    MVM_exception_throw_adhoc_free(tc, to_free, "Can't convert '%s' to num: %s", encoded, reason);
}
55
56
11.8k
/* Advance past whitespace, starting with the code point already in `*cp`.
 *
 * Stops at the first non-whitespace code point, or as soon as get_cp()
 * reports that the input is exhausted. */
static void skip_whitespace(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) {
    for (;;) {
        if (!is_whitespace(tc, *cp))
            break;
        if (get_cp(tc, ci, cp))
            break;
    }
}
61
62
9.47k
/* Consume an optional leading sign at `*cp`.
 *
 * '-' and code point 8722 (U+2212, the minus sign) give -1; '+' or no sign
 * gives 1. When a sign is present the iterator is advanced by one code
 * point. */
static int parse_sign(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) {
    int sign = 1;

    switch (*cp) {
        case '-':
        case 8722: /* U+2212 */
            sign = -1;
            /* fallthrough */
        case '+':
            get_cp(tc, ci, cp);
            break;
        default:
            break;
    }

    return sign;
}
72
73
0
/* Parse a run of decimal digits (underscores allowed between digits) starting
 * at `*cp`, and return its value as a double.
 *
 * Used for the individual elements of a ':radix[...]' list. Reports a parse
 * error (via parse_error) when the run starts or ends with '_' or when a
 * letter digit (value >= 10) is encountered. Returns 0 without consuming
 * anything if `*cp` is not a digit.
 *
 * `s` is the whole string being parsed; it is only used for error reporting. */
static double parse_decimal_integer(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s) {
    int ends_with_underscore = 0;
    double value = 0;
    int digit = -1;

    if (*cp == '_')
        parse_error(tc, s, "number can't start with _");

    /* `digit` is only assigned (and only read) for non-underscore code points. */
    while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) {
        ends_with_underscore = *cp == '_';
        if (*cp != '_') {
            if (digit >= 10)
                parse_error(tc, s, "expecting comma separated decimal numbers after :$radix[]");
            value = value * 10 + digit;
        }
        get_cp(tc, ci, cp);
    }

    if (ends_with_underscore)
        parse_error(tc, s, "a number can't end in underscore");

    return value;
}
89
90
5.91k
/* Parse "<integer>[.<fraction>][e[sign]<exponent>]" starting at `*cp`.
 *
 * The digits are extracted from the original string, underscores are
 * stripped, and Unicode digits are converted to ASCII. The result (plus '.',
 * 'e' and '-' where present) is collected in a temporary buffer and handed to
 * strtod(), which takes care of denormals and of picking the closest
 * representable double.
 *
 * Parameters:
 *   s            - the whole string being parsed (buffer sizing and errors).
 *   radix        - digits with a value >= radix end the current digit run.
 *   leading_zero - non-zero when the caller already consumed a leading '0',
 *                  in which case finding no digits at all is not an error.
 *
 * Returns the parsed value. All failures are reported through parse_error();
 * the temporary buffer is released first so it does not leak when the
 * exception is thrown.
 *
 * NOTE(review): the buffer is always interpreted by strtod() as decimal, and
 * digit values >= 10 are stored as '0' + digit (which is not an ASCII digit),
 * so radices other than 10 look mis-handled - confirm the intended behavior. */
static double parse_int_frac_exp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s, double radix, int leading_zero) {
    int digits = 0;
    int frac_digits = 0;
    int digit = -1;
    int ends_with_underscore = 0;
    const char *error = NULL;
    /* At most one byte is written per code point consumed, plus the NUL. */
    char *digits_buf = (char *)MVM_malloc(1 + MVM_string_graphs(tc, s));
    char *digits_buf_tail = digits_buf;
    double result;

    if (*cp == '_') {
        error = "number can't start with _";
        goto fail;
    }

    /* Integer part. */
    if (*cp != '.') {
        while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) {
            ends_with_underscore = *cp == '_';
            if (*cp != '_') {
                if (digit >= radix) break;
                *digits_buf_tail++ = '0' + digit;
                digits++;
            }
            get_cp(tc, ci, cp);
        }
        if (ends_with_underscore) {
            error = "a number can't end in underscore";
            goto fail;
        }
    }

    /* Fractional part. */
    if (*cp == '.') {
        *digits_buf_tail++ = '.';
        get_cp(tc, ci, cp);
        if (*cp == '_') {
            error = "radix point can't be followed by _";
            goto fail;
        }
        while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) {
            ends_with_underscore = *cp == '_';
            if (*cp != '_') {
                if (digit >= radix) break;
                *digits_buf_tail++ = '0' + digit;
                frac_digits++;
            }
            get_cp(tc, ci, cp);
        }
        if (frac_digits == 0) {
            error = "radix point must be followed by one or more valid digits";
            goto fail;
        }
        if (ends_with_underscore) {
            error = "a number can't end in underscore";
            goto fail;
        }
    }

    if (digits == 0 && frac_digits == 0 && !leading_zero) {
        error = "expecting a number";
        goto fail;
    }

    /* Exponent. */
    if (*cp == 'E' || *cp == 'e') {
        int e_digits = 0;

        *digits_buf_tail++ = 'e';
        get_cp(tc, ci, cp);

        if (parse_sign(tc, ci, cp) == -1)
            *digits_buf_tail++ = '-';
        if (*cp == '_') {
            error = "'e' or 'E' can't be followed by _";
            goto fail;
        }
        while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) {
            if (*cp != '_') {
                if (digit >= radix) break;
                *digits_buf_tail++ = '0' + digit;
                e_digits++;
            }
            get_cp(tc, ci, cp);
        }
        if (e_digits == 0) {
            error = "'e' or 'E' must be followed by one or more valid digits";
            goto fail;
        }
    }

    *digits_buf_tail = '\0';
    result = strtod(digits_buf, NULL);
    MVM_free(digits_buf);
    return result;

fail:
    /* Release the scratch buffer before throwing; the code in this file
     * relies on parse_error() not returning. */
    MVM_free(digits_buf);
    parse_error(tc, s, error);
    return 0; /* not reached */
}
178
179
11.8k
/* Try to match the three-character word `word` at the current position.
 *
 * If `*cp` differs from word[0], nothing is consumed and 0 is returned.
 * Once the first character has matched, the remaining two must match as
 * well; otherwise a parse error is reported. On a full match all three code
 * points are consumed and 1 is returned. */
static int match_word(MVMThreadContext *tc,  MVMCodepointIter *ci, MVMCodepoint *cp, char word[3], MVMString *s) {
    int i;

    if (*cp != word[0])
        return 0;
    get_cp(tc, ci, cp);

    for (i = 1; i < 3; i++) {
        if (*cp != word[i]) {
            parse_error(tc, s, "that's not a number");
            return 0;
        }
        get_cp(tc, ci, cp);
    }

    return 1;
}
198
199
200
5.91k
/* Parse one number (no '/' denominator) starting at `*cp`.
 *
 * Accepted forms, as implemented below:
 *   - "NaN"
 *   - optional sign followed by "Inf"
 *   - optional sign followed by ":R<body>", ":R«body»" or ":R[d,d,...]"
 *     where R is the radix
 *   - optional sign followed by "0b", "0o", "0d" or "0x" and a body
 *   - optional sign followed by a plain integer/fraction/exponent number
 *
 * `s` is the whole string being parsed and is used for error reporting.
 * Malformed input is reported through parse_error(). */
static double parse_simple_number(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) {
    double sign;

    /* Handle NaN here, to make later parsing simpler */
    if (match_word(tc, ci, cp, "NaN", s)) {
        return MVM_num_nan(tc);
    }

    sign = parse_sign(tc, ci, cp);

    if (match_word(tc, ci, cp, "Inf", s)) {
        return sign * MVM_num_posinf(tc);
    }
    else if (*cp == ':') {
        int radix;
        double body;
        get_cp(tc, ci, cp);
        radix = (int) parse_int_frac_exp(tc, ci, cp, s, 10, 0);
        if (*cp == '<') {
            get_cp(tc, ci, cp);
            body = parse_int_frac_exp(tc, ci, cp, s, radix, 0);
            if (*cp == '>') {
                get_cp(tc, ci, cp);
                return sign * body;
            }
            else {
                parse_error(tc, s, "malformed ':radix<>' style radix number, expecting '>' after the body");
            }
        }
        else if (*cp == 171) { /* U+00AB, opening guillemet */
            get_cp(tc, ci, cp);
            body = parse_int_frac_exp(tc, ci, cp, s, radix, 0);
            if (*cp == 187) { /* U+00BB, closing guillemet */
                get_cp(tc, ci, cp);
                return sign * body;
            }
            else {
                parse_error(tc, s, "malformed ':radix«»' style radix number, expecting '»' after the body");
            }
        }
        else if (*cp == '[') {
            double result = 0;
            get_cp(tc, ci, cp);
            while (*cp != ']' && MVM_string_ci_has_more(tc, ci)) {
                double digit = parse_decimal_integer(tc, ci, cp, s);
                result = result * radix + digit;
                if (*cp == ',') {
                    get_cp(tc, ci, cp);
                }
                else if (*cp != ']') {
                    /* Neither a separator nor the closing bracket: stop here
                     * so the error below is raised. Without this the loop
                     * would spin forever, since parse_decimal_integer()
                     * consumes nothing when it sees a non-digit. */
                    break;
                }
            }
            if (*cp == ']') {
                get_cp(tc, ci, cp);
                return sign * result;
            }
            else {
                parse_error(tc, s, "malformed ':radix[]' style radix number, expecting ']' after the body");
            }
        }
        else {
            /* A radix with no bracketed body used to fall off the end of the
             * function and return an indeterminate value. */
            parse_error(tc, s, "malformed radix number, expecting '<', '«' or '[' after the radix");
        }
    }
    else if (*cp == '0') {
        int radix = 0;

        get_cp(tc, ci, cp);
        if (*cp == 'b') radix = 2;
        else if (*cp == 'o') radix = 8;
        else if (*cp == 'd') radix = 10;
        else if (*cp == 'x') radix = 16;

        if (radix) {
            get_cp(tc, ci, cp);
            /* A single '_' is allowed right after the radix prefix. */
            if (*cp == '_') get_cp(tc, ci, cp);
            return sign * parse_int_frac_exp(tc, ci, cp, s, radix, 1);
        } else {
            return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 1);
        }
    }
    else {
        return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 0);
    }

    /* Only the error paths above get here, and the code in this file relies
     * on parse_error() not returning. The explicit return keeps every path
     * of this non-void function well defined. */
    return 0;
}
280
281
5.91k
/* Parse a number that may carry a '/'-separated denominator.
 *
 * Parses one simple number; if it is immediately followed by '/', a second
 * simple number is parsed and the quotient of the two is returned. */
static double parse_real(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) {
    double numerator = parse_simple_number(tc, ci, cp, s);
    double denominator;

    /* No '/' means there is no Rat denominator to read. */
    if (*cp != '/')
        return numerator;

    get_cp(tc, ci, cp);
    denominator = parse_simple_number(tc, ci, cp, s);
    return numerator / denominator;
}
293
294
5.91k
/* Convert the string `s` to a number.
 *
 * Leading and trailing whitespace is ignored. An empty string and a string
 * consisting only of whitespace both give 0. Anything left over after the
 * number (other than whitespace) is reported as a parse error with the
 * reason "trailing characters". */
MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) {
    MVMCodepointIter ci;
    MVMCodepoint cp;
    MVMnum64 value;

    MVM_string_ci_init(tc, &ci, s, 0, 0);

    /* Nothing to read at all: the empty string. */
    if (get_cp(tc, &ci, &cp))
        return 0;

    skip_whitespace(tc, &ci, &cp);

    /* Input exhausted while skipping: the string was only whitespace. */
    if (!MVM_string_ci_has_more(tc, &ci) && cp == END_OF_NUM)
        return 0;

    value = parse_real(tc, &ci, &cp, s);

    skip_whitespace(tc, &ci, &cp);

    /* After trailing whitespace the input must be fully consumed. */
    if (MVM_string_ci_has_more(tc, &ci) || cp != END_OF_NUM)
        parse_error(tc, s, "trailing characters");

    return value;
}