/home/travis/build/MoarVM/MoarVM/src/strings/normalize.h
Line | Count | Source (jump to first uncovered line) |
/* Normalization modes. The numeric values are chosen so that each bit can be
 * tested on its own:
 *   - bit 0 (value 1): use compatibility rather than canonical decomposition
 *   - bit 1 (value 2): compose after decomposing
 *   - bit 2 (value 4): go one step further and create synthetic codes for
 *                      graphemes
 */
typedef enum {
    MVM_NORMALIZE_NFD  = 0x0, /* no bits set */
    MVM_NORMALIZE_NFKD = 0x1, /* compatibility bit */
    MVM_NORMALIZE_NFC  = 0x2, /* compose bit */
    MVM_NORMALIZE_NFKC = 0x3, /* compatibility + compose bits */
    MVM_NORMALIZE_NFG  = 0x6  /* compose + grapheme bits */
} MVMNormalization;
14 | | |
/* Ways of checking various properties of the normalization form. Each macro
 * yields the raw masked bit (non-zero when the property holds, 0 otherwise).
 * The argument is parenthesized so that an expression such as `a | b` is
 * masked as a whole rather than being split by operator precedence. */
#define MVM_NORMALIZE_COMPAT_DECOMP(form) ((form) & 1)
#define MVM_NORMALIZE_COMPOSE(form)       ((form) & 2)
#define MVM_NORMALIZE_GRAPHEME(form)      ((form) & 4)
19 | | |
/* Per-form threshold: the lowest codepoint for which normalizing to that form
 * may require a real check and possibly some work. */
#define MVM_NORMALIZE_FIRST_SIG_NFD     0x00C0
#define MVM_NORMALIZE_FIRST_SIG_NFC     0x0300
#define MVM_NORMALIZE_FIRST_SIG_NFKD    0x00A0
#define MVM_NORMALIZE_FIRST_SIG_NFKC    0x00A0

/* Lowest codepoint whose canonical combining class is non-zero. */
#define MVM_NORMALIZE_FIRST_NONZERO_CCC 0x0300
29 | | |
30 | | /* Streaming Unicode normalizer structure. */ |
31 | | struct MVMNormalizer { |
32 | | /* What form of normalization are we doing? */ |
33 | | MVMNormalization form; |
34 | | |
35 | | /* Current buffer of codepoints we're working to normalize. */ |
36 | | MVMCodepoint *buffer; |
37 | | |
38 | | /* Size of the normalization buffer. */ |
39 | | MVMint32 buffer_size; |
40 | | |
41 | | /* Start offset in the buffer where we're still processing. */ |
42 | | MVMint32 buffer_start; |
43 | | |
44 | | /* End offset in the buffer, and where to add the next thing to process. */ |
45 | | MVMint32 buffer_end; |
46 | | |
47 | | /* End offset in the buffer for things we've normalized and so can return. */ |
48 | | MVMint32 buffer_norm_end; |
49 | | |
50 | | /* The first significant codepoint in this normalization form that we may |
51 | | * have to do something with. If we see two things beneath the limit in a |
52 | | * row then we know the first one below it is good to spit out. */ |
53 | | MVMCodepoint first_significant; |
54 | | |
55 | | /* The quickcheck property for the normalization form in question. */ |
56 | | MVMint32 quick_check_property; |
57 | | |
58 | | /* If we should translate the \r\n grapheme to \n (only applicable when |
59 | | * normalizing to NFG). */ |
60 | | MVMint32 translate_newlines; |
61 | | }; |
62 | | |
63 | | /* Guts-y functions, called by the API level ones below. */ |
64 | | MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out); |
65 | | MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out); |
66 | | |
67 | | /* Takes a codepoint to process for normalization as the "in" parameter. If we |
68 | | * are able to produce one or more normalized codepoints right off, then we |
69 | | * put it into the location pointed to by "out", and return the number of |
70 | | * codepoints now available including the one we just passed out. If we can't |
71 | | * produce a normalized codepoint right now, we return a 0. */ |
72 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) { |
73 | 0 | /* Control characters in the Latin-1 range are normalization terminators - |
74 | 0 | * that is, we know we can spit out whatever codepoints we have seen so |
75 | 0 | * far in normalized form without having to consider them into the |
76 | 0 | * normalization process. The exception is if we're computing NFG, and |
77 | 0 | * we got \r, which can form a grapheme in the case of \r\n. */ |
78 | 0 | if (in < 0x20 || (in >= 0x7F && in <= 0x9F) || in == 0xAD) |
79 | 0 | if (!(MVM_NORMALIZE_GRAPHEME(n->form) && in == 0x0D)) |
80 | 0 | return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out); |
81 | 0 |
|
82 | 0 | /* Fast-paths apply when the codepoint to consider is too low to have any |
83 | 0 | * interesting properties in the target normalization form. */ |
84 | 0 | if (in < n->first_significant) { |
85 | 0 | if (MVM_NORMALIZE_COMPOSE(n->form)) { |
86 | 0 | /* For the composition fast path we always have to know that we've |
87 | 0 | * seen two codepoints in a row that are below those needing a full |
88 | 0 | * check. Then we can spit out the first one. Exception: we are |
89 | 0 | * normalizing to graphemes and see \r. */ |
90 | 0 | if (!(MVM_NORMALIZE_GRAPHEME(n->form) && in == 0x0D)) { |
91 | 0 | if (n->buffer_end - n->buffer_start == 1) { |
92 | 0 | if (n->buffer[n->buffer_start] < n->first_significant) { |
93 | 0 | *out = n->buffer[n->buffer_start]; |
94 | 0 | n->buffer[n->buffer_start] = in; |
95 | 0 | return 1; |
96 | 0 | } |
97 | 0 | } |
98 | 0 | } |
99 | 0 | } |
100 | 0 | else { |
101 | 0 | /* For decomposition fast-path, the buffer should be empty. In |
102 | 0 | * that case, we just hand back what we got. */ |
103 | 0 | if (n->buffer_start == n->buffer_end) { |
104 | 0 | *out = in; |
105 | 0 | return 1; |
106 | 0 | } |
107 | 0 | } |
108 | 0 | } |
109 | 0 | /* Fall back to slow path. */ |
110 | 0 | return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out); |
111 | 0 | } |
112 | | |
113 | | /* Grapheme version of the above. Note that this exists mostly for API clarity |
114 | | * rather than adding any semantics; the normalizer must be configured to |
115 | | * produce NFG to get synthetics out. */ |
116 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint_to_grapheme(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMGrapheme32 *out) { |
117 | 0 | assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32)); |
118 | 0 | return MVM_unicode_normalizer_process_codepoint(tc, n, in, (MVMGrapheme32 *)out); |
119 | 0 | } |
120 | | |
121 | | /* Push a number of codepoints into the "to normalize" buffer. */ |
122 | | void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints); |
123 | | |
124 | | /* Get the number of codepoints/graphemes ready to fetch. */ |
125 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_available(MVMThreadContext *tc, MVMNormalizer *n) { |
126 | 0 | return n->buffer_norm_end - n->buffer_start; |
127 | 0 | } |
128 | | |
129 | | /* Get the number of codepoints/graphemes ready to fetch. */ |
130 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_empty(MVMThreadContext *tc, MVMNormalizer *n) { |
131 | 0 | return n->buffer_end == n->buffer_start; |
132 | 0 | } |
133 | | |
134 | | /* Indicate that we've reached the end of the input stream. Any codepoints |
135 | | * left to normalize now can be. */ |
136 | | void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n); |
137 | | |
138 | | /* Get a normalized codepoint; should only ever be called if there are some |
139 | | * known to be available, either because normalize_to_codepoint returned a |
140 | | * value greater than 1, or normalize_available returned a non-zero value. */ |
141 | 0 | MVM_STATIC_INLINE MVMCodepoint MVM_unicode_normalizer_get_codepoint(MVMThreadContext *tc, MVMNormalizer *n) { |
142 | 0 | if (n->buffer_norm_end == n->buffer_start) |
143 | 0 | MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get codepoint"); |
144 | 0 | return n->buffer[n->buffer_start++]; |
145 | 0 | } |
146 | | |
147 | | /* Grapheme version of the above. Note that this exists mostly for API clarity |
148 | | * rather than adding any semantics; the normalizer must be configured to |
149 | | * produce NFG to get synthetics out. */ |
150 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_get_grapheme(MVMThreadContext *tc, MVMNormalizer *n) { |
151 | 0 | assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32)); |
152 | 0 | if (n->buffer_norm_end == n->buffer_start) |
153 | 0 | MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get grapheme"); |
154 | 0 | return (MVMGrapheme32)n->buffer[n->buffer_start++]; |
155 | 0 | } |
156 | | |
157 | | /* Setup and teardown of the MVMNormalizer struct. */ |
158 | | MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in); |
159 | | void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization norm); |
160 | | void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n); |
161 | | void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n); |
162 | | |
163 | | /* High-level normalize implementation, working from an input array of |
164 | | * codepoints and producing an output array of codepoints. */ |
165 | | void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form); |
166 | | |
167 | | /* High-level function to produce an NFG string from an input array of |
168 | | * codepoints. */ |
169 | | MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc,const MVMObject *codes); |
170 | | MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count); |
171 | | |
172 | | /* High-level function to produce an array of codepoints from a string. */ |
173 | | void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out); |
174 | | |
/* Faster atoi-like conversion of a run of ASCII decimal digits.
 *
 * Parses the digits at the start of "dec_str" and returns their value.
 * Parsing stops at the first character that is not '0'..'9' (including the
 * terminating NUL), so trailing non-digit text is ignored rather than being
 * folded into the result. An empty string, or one that does not start with a
 * digit, yields 0.
 *
 * Unlike atoi, no leading whitespace or sign is accepted, and no overflow
 * detection is performed: the caller must pass a value that fits in an
 * MVMint32. "dec_str" must not be NULL. */
MVM_STATIC_INLINE MVMint32 fast_atoi( const char * dec_str ) {
    MVMint32 value = 0;
    while (*dec_str >= '0' && *dec_str <= '9') {
        value = value * 10 + (*dec_str - '0');
        dec_str++;
    }
    return value;
}