Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/normalize.h
Line
Count
Source (jump to first uncovered line)
1
/* Normalization modes. Numbers picked so that:
2
 *  - The LSB tells us whether to do canonical or compatibility decomposition
3
 *  - The second bit tells us whether to do canonical composition
4
 *  - The third bit tells us to go a step further and create synthetic codes
5
 *    for graphemes.
6
 */
7
typedef enum {
    /* No flag bits set. */
    MVM_NORMALIZE_NFD   = 0x0,
    /* Bit 0: tested by MVM_NORMALIZE_COMPAT_DECOMP. */
    MVM_NORMALIZE_NFKD  = 0x1,
    /* Bit 1: tested by MVM_NORMALIZE_COMPOSE. */
    MVM_NORMALIZE_NFC   = 0x2,
    /* Bits 0 and 1: compatibility decomposition plus composition. */
    MVM_NORMALIZE_NFKC  = MVM_NORMALIZE_NFC | MVM_NORMALIZE_NFKD,
    /* Bits 1 and 2: composition plus the grapheme bit tested by
     * MVM_NORMALIZE_GRAPHEME. */
    MVM_NORMALIZE_NFG   = 0x4 | MVM_NORMALIZE_NFC
} MVMNormalization;
14
15
/* Ways of checking various properties of the normalization form. */
16
/* Non-zero when bit 0 (compatibility decomposition) is set in form. The
 * argument is parenthesized so expression arguments expand correctly. */
#define MVM_NORMALIZE_COMPAT_DECOMP(form) ((form) & 1)
17
/* Non-zero when bit 1 (composition) is set in form. The argument is
 * parenthesized so expression arguments expand correctly. */
#define MVM_NORMALIZE_COMPOSE(form)       ((form) & 2)
18
/* Non-zero when bit 2 (synthetic grapheme creation) is set in form. The
 * argument is parenthesized so expression arguments expand correctly. */
#define MVM_NORMALIZE_GRAPHEME(form)      ((form) & 4)
19
20
/* First codepoint where we have to actually do a real check and maybe some
21
 * work when normalizing. */
22
/* One threshold per normalization form; NFKD and NFKC share the same value.
 * NOTE(review): presumably these seed MVMNormalizer.first_significant during
 * normalizer setup - confirm against the implementation file. */
#define MVM_NORMALIZE_FIRST_SIG_NFD     0x00C0
22
#define MVM_NORMALIZE_FIRST_SIG_NFC     0x0300
23
#define MVM_NORMALIZE_FIRST_SIG_NFKD    0x00A0
24
#define MVM_NORMALIZE_FIRST_SIG_NFKC    0x00A0
26
27
/* First codepoint with a non-zero canonical combining class. */
28
#define MVM_NORMALIZE_FIRST_NONZERO_CCC 0x300 /* same value as MVM_NORMALIZE_FIRST_SIG_NFC */
29
30
/* Streaming Unicode normalizer structure. */
31
struct MVMNormalizer {
32
    /* What form of normalization are we doing? The inline fast paths test
     * this with the MVM_NORMALIZE_COMPOSE / MVM_NORMALIZE_GRAPHEME macros. */
33
    MVMNormalization form;
34
35
    /* Current buffer of codepoints we're working to normalize. */
36
    MVMCodepoint *buffer;
37
38
    /* Size of the normalization buffer (presumably counted in codepoints,
     * not bytes - TODO confirm against the implementation). */
39
    MVMint32 buffer_size;
40
41
    /* Start offset in the buffer where we're still processing. This is also
     * the index the get_codepoint / get_grapheme accessors read from and
     * then advance. */
42
    MVMint32 buffer_start;
43
44
    /* End offset in the buffer, and where to add the next thing to process.
     * The normalizer counts as empty when this equals buffer_start. */
45
    MVMint32 buffer_end;
46
47
    /* End offset in the buffer for things we've normalized and so can return.
     * The number of items ready to fetch is buffer_norm_end - buffer_start. */
48
    MVMint32 buffer_norm_end;
49
50
    /* The first significant codepoint in this normalization form that we may
51
     * have to do something with. If we see two things beneath the limit in a
52
     * row then we know the first one below it is good to spit out. */
53
    MVMCodepoint first_significant;
54
55
    /* The quickcheck property for the normalization form in question. */
56
    MVMint32 quick_check_property;
57
58
    /* If we should translate the \r\n grapheme to \n (only applicable when
59
     * normalizing to NFG). */
60
    MVMint32 translate_newlines;
61
};
62
63
/* Guts-y functions, called by the API level ones below. */
64
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);
65
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);
66
67
/* Takes a codepoint to process for normalization as the "in" parameter. If we
68
 * are able to produce one or more normalized codepoints right off, then we
69
 * put it into the location pointed to by "out", and return the number of
70
 * codepoints now available including the one we just passed out. If we can't
71
 * produce a normalized codepoint right now, we return a 0. */
72
0
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    /* \r under NFG is special-cased everywhere below, since it may go on to
     * form the \r\n grapheme. */
    const int nfg_cr = MVM_NORMALIZE_GRAPHEME(n->form) && in == 0x0D;

    /* Codepoints below 0x20, in 0x7F..0x9F, and 0xAD terminate normalization:
     * everything seen so far can be emitted without taking them into
     * account. */
    const int terminator = in < 0x20 || (in >= 0x7F && in <= 0x9F) || in == 0xAD;

    if (terminator && !nfg_cr)
        return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);

    /* Fast paths: the incoming codepoint is below the first one that needs
     * real work in the target form. */
    if (in < n->first_significant) {
        if (MVM_NORMALIZE_COMPOSE(n->form)) {
            /* Composing: when exactly one codepoint is buffered and it is
             * also below the limit, hand it out and buffer the new one in
             * its place. Not applicable to \r under NFG. */
            if (!nfg_cr
                    && n->buffer_end - n->buffer_start == 1
                    && n->buffer[n->buffer_start] < n->first_significant) {
                *out = n->buffer[n->buffer_start];
                n->buffer[n->buffer_start] = in;
                return 1;
            }
        }
        else if (n->buffer_start == n->buffer_end) {
            /* Decomposing with nothing buffered: pass the codepoint straight
             * through. */
            *out = in;
            return 1;
        }
    }

    /* No fast path applied; do the full processing. */
    return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);
}
112
113
/* Grapheme version of the above. Note that this exists mostly for API clarity
114
 * rather than adding any semantics; the normalizer must be configured to
115
 * produce NFG to get synthetics out. */
116
0
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint_to_grapheme(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMGrapheme32 *out) {
    /* The grapheme output slot is handed to the codepoint-level function
     * below, which is only sound while both types have the same size. */
    assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));
    /* Cast to MVMCodepoint *, the pointer type the callee actually takes. */
    return MVM_unicode_normalizer_process_codepoint(tc, n, in, (MVMCodepoint *)out);
}
120
121
/* Push a number of codepoints into the "to normalize" buffer. */
122
void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints);
123
124
/* Get the number of codepoints/graphemes ready to fetch. */
125
0
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_available(MVMThreadContext *tc, MVMNormalizer *n) {
    /* Items ready to fetch lie between buffer_start and buffer_norm_end. */
    const MVMint32 ready = n->buffer_norm_end - n->buffer_start;
    return ready;
}
128
129
/* Check whether the normalizer's buffer is empty (nothing buffered at all). */
130
0
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_empty(MVMThreadContext *tc, MVMNormalizer *n) {
    /* Returns 1 when the buffer holds nothing at all (start and end offsets
     * coincide), 0 otherwise. */
    return n->buffer_start == n->buffer_end ? 1 : 0;
}
133
134
/* Indicate that we've reached the end of the input stream. Any codepoints
135
 * left to normalize now can be. */
136
void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n);
137
138
/* Get a normalized codepoint; should only ever be called if there are some
139
 * known to be available, either because
 * MVM_unicode_normalizer_process_codepoint returned a
140
 * value greater than 1, or MVM_unicode_normalizer_available returned a
 * non-zero value. */
141
0
MVM_STATIC_INLINE MVMCodepoint MVM_unicode_normalizer_get_codepoint(MVMThreadContext *tc, MVMNormalizer *n) {
    MVMCodepoint result;

    /* Nothing normalized is waiting: the caller broke the contract. */
    if (n->buffer_start == n->buffer_norm_end) {
        MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get codepoint");
    }

    /* Take the next ready codepoint and step past it. */
    result = n->buffer[n->buffer_start];
    n->buffer_start++;
    return result;
}
146
147
/* Grapheme version of the above. Note that this exists mostly for API clarity
148
 * rather than adding any semantics; the normalizer must be configured to
149
 * produce NFG to get synthetics out. */
150
0
MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_get_grapheme(MVMThreadContext *tc, MVMNormalizer *n) {
    MVMGrapheme32 result;

    /* Buffer entries are stored as MVMCodepoint and returned as
     * MVMGrapheme32, so the two types must be the same size. */
    assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));

    /* Nothing normalized is waiting: the caller broke the contract. */
    if (n->buffer_start == n->buffer_norm_end) {
        MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get grapheme");
    }

    /* Take the next ready entry and step past it. */
    result = (MVMGrapheme32)n->buffer[n->buffer_start];
    n->buffer_start++;
    return result;
}
156
157
/* Setup and teardown of the MVMNormalizer struct. */
158
MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in);
159
void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization norm);
160
void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n);
161
void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n);
162
163
/* High-level normalize implementation, working from an input array of
164
 * codepoints and producing an output array of codepoints. */
165
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form);
166
167
/* High-level function to produce an NFG string from an input array of
168
 * codepoints. */
169
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc,const MVMObject *codes);
170
MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count);
171
172
/* High-level function to produce an array of codepoints from a string. */
173
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out);
174
175
/* faster atoi function */
176
0
/* Converts a leading run of ASCII decimal digits to an integer.
 *
 * Parsing stops at the first character that is not '0'..'9', so trailing
 * junk is ignored rather than folded into the result. No sign or leading
 * whitespace is accepted. Returns 0 for NULL, for an empty string, and for a
 * string that does not start with a digit. Overflow is not checked: the
 * caller must pass a value that fits in MVMint32. */
MVM_STATIC_INLINE MVMint32 fast_atoi( const char * dec_str ) {
    MVMint32 value = 0;
    if (!dec_str)
        return 0;
    while (*dec_str >= '0' && *dec_str <= '9') {
        value = value * 10 + (*dec_str - '0');
        dec_str++;
    }
    return value;
}