/home/travis/build/MoarVM/MoarVM/src/strings/normalize.h
Line | Count | Source (jump to first uncovered line) |
/* Normalization modes. The numeric values are bit flags, picked so that:
 *  - The LSB (value 1) tells us whether to do compatibility (set) or
 *    canonical (clear) decomposition; see MVM_NORMALIZE_COMPAT_DECOMP.
 *  - The second bit (value 2) tells us whether to do canonical composition
 *    after decomposing; see MVM_NORMALIZE_COMPOSE.
 *  - The third bit (value 4) tells us to go a step further and create
 *    synthetic codes for graphemes; see MVM_NORMALIZE_GRAPHEME.
 */
typedef enum {
    MVM_NORMALIZE_NFD  = 0, /* canonical decomposition only */
    MVM_NORMALIZE_NFKD = 1, /* compatibility decomposition only */
    MVM_NORMALIZE_NFC  = 2, /* canonical decomposition, then composition */
    MVM_NORMALIZE_NFKC = 3, /* compatibility decomposition, then composition */
    MVM_NORMALIZE_NFG  = 6  /* as NFC, plus synthetics for graphemes */
} MVMNormalization;
14 | | |
/* Ways of checking various properties of the normalization form. Each macro
 * evaluates to the tested bit itself (so 1, 2 or 4, not necessarily 1) when
 * the property holds, and to 0 otherwise. The argument is parenthesized so
 * that a compound expression such as "a | b" is masked as a whole. */
#define MVM_NORMALIZE_COMPAT_DECOMP(form) ((form) & 1)
#define MVM_NORMALIZE_COMPOSE(form)       ((form) & 2)
#define MVM_NORMALIZE_GRAPHEME(form)      ((form) & 4)
19 | | |
/* Per normalization form: the lowest codepoint for which normalizing needs
 * a real check and possibly some work. Anything below it can be handled
 * without the full check. */
#define MVM_NORMALIZE_FIRST_SIG_NFD     0x00C0
#define MVM_NORMALIZE_FIRST_SIG_NFC     0x0300
#define MVM_NORMALIZE_FIRST_SIG_NFKD    0x00A0
#define MVM_NORMALIZE_FIRST_SIG_NFKC    0x00A0

/* Lowest codepoint whose canonical combining class is non-zero. */
#define MVM_NORMALIZE_FIRST_NONZERO_CCC 0x0300
29 | | |
30 | | /* Streaming Unicode normalizer structure. */ |
31 | | struct MVMNormalizer { |
32 | | /* What form of normalization are we doing? */ |
33 | | MVMNormalization form; |
34 | | |
35 | | /* Current buffer of codepoints we're working to normalize. */ |
36 | | MVMCodepoint *buffer; |
37 | | |
38 | | /* Size of the normalization buffer. */ |
39 | | MVMint32 buffer_size; |
40 | | |
41 | | /* Start offset in the buffer where we're still processing. */ |
42 | | MVMint32 buffer_start; |
43 | | |
44 | | /* End offset in the buffer, and where to add the next thing to process. */ |
45 | | MVMint32 buffer_end; |
46 | | |
47 | | /* End offset in the buffer for things we've normalized and so can return. */ |
48 | | MVMint32 buffer_norm_end; |
49 | | |
50 | | /* The first significant codepoint in this normalization form that we may |
51 | | * have to do something with. If we see two things beneath the limit in a |
52 | | * row then we know the first one below it is good to spit out. */ |
53 | | MVMCodepoint first_significant; |
54 | | |
55 | | /* The quickcheck property for the normalization form in question. */ |
56 | | MVMint32 quick_check_property; |
57 | | |
58 | | /* If we should translate the \r\n grapheme to \n (only applicable when |
59 | | * normalizing to NFG). */ |
60 | | MVMint32 translate_newlines; |
61 | | |
62 | | MVMint32 prepend_buffer; |
63 | | |
64 | | MVMint32 regional_indicator; |
65 | | |
66 | | }; |
67 | | |
68 | | /* Guts-y functions, called by the API level ones below. */ |
69 | | MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out); |
70 | | MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out); |
71 | | |
72 | | /* Takes a codepoint to process for normalization as the "in" parameter. If we |
73 | | * are able to produce one or more normalized codepoints right off, then we |
74 | | * put it into the location pointed to by "out", and return the number of |
75 | | * codepoints now available including the one we just passed out. If we can't |
76 | | * produce a normalized codepoint right now, we return a 0. */ |
77 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) { |
78 | 0 | /* Control characters in the Latin-1 range are normalization terminators - |
79 | 0 | * that is, we know we can spit out whatever codepoints we have seen so |
80 | 0 | * far in normalized form without having to consider them into the |
81 | 0 | * normalization process. The exception is if we're computing NFG, and |
82 | 0 | * we got \r, which can form a grapheme in the case of \r\n. */ |
83 | 0 | if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD) { |
84 | 0 | /* For utf8-c8 synthetic graphemes. May be able to be removed after |
85 | 0 | * changing and further testing of the TODO marked below. */ |
86 | 0 | if (MVM_UNLIKELY(in < 0)) { |
87 | 0 | if (MVM_LIKELY(MVM_nfg_get_synthetic_info(tc, in)->is_utf8_c8)) |
88 | 0 | return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out); |
89 | 0 | MVM_exception_throw_adhoc(tc, "Internal error: encountered non-utf8-c8 synthetic during normalization"); |
90 | 0 | } |
91 | 0 | /* If in isn't \r */ |
92 | 0 | if (in != 0x0D || !MVM_NORMALIZE_GRAPHEME(n->form)) |
93 | 0 | return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out); |
94 | 0 | } |
95 | 0 |
|
96 | 0 | /* Fast-paths apply when the codepoint to consider is too low to have any |
97 | 0 | * interesting properties in the target normalization form AND |
98 | 0 | * it doesn't follow a prepend character */ |
99 | 0 | if (in < n->first_significant && !n->prepend_buffer) { |
100 | 0 | if (MVM_LIKELY(MVM_NORMALIZE_COMPOSE(n->form))) { |
101 | 0 | /* For the composition fast path we always have to know that we've |
102 | 0 | * seen two codepoints in a row that are below those needing a full |
103 | 0 | * check. Then we can spit out the first one. Exception: we are |
104 | 0 | * normalizing to graphemes and see \r. */ |
105 | 0 | if (MVM_LIKELY(in != 0x0D || !MVM_NORMALIZE_GRAPHEME(n->form))) { |
106 | 0 | if (n->buffer_end - n->buffer_start == 1) { |
107 | 0 | if (n->buffer[n->buffer_start] < n->first_significant) { |
108 | 0 | *out = n->buffer[n->buffer_start]; |
109 | 0 | n->buffer[n->buffer_start] = in; |
110 | 0 | return 1; |
111 | 0 | } |
112 | 0 | } |
113 | 0 | } |
114 | 0 | } |
115 | 0 | else { |
116 | 0 | /* For decomposition fast-path, the buffer should be empty. In |
117 | 0 | * that case, we just hand back what we got. */ |
118 | 0 | if (n->buffer_start == n->buffer_end) { |
119 | 0 | *out = in; |
120 | 0 | return 1; |
121 | 0 | } |
122 | 0 | } |
123 | 0 | } |
124 | 0 | /* Fall back to slow path. */ |
125 | 0 | return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out); |
126 | 0 | } |
127 | | |
128 | | /* Grapheme version of the above. Note that this exists mostly for API clarity |
129 | | * rather than adding any semantics; the normalizer must be configured to |
130 | | * produce NFG to get synthetics out. */ |
131 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint_to_grapheme(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMGrapheme32 *out) { |
132 | 0 | assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32)); |
133 | 0 | return MVM_unicode_normalizer_process_codepoint(tc, n, in, (MVMGrapheme32 *)out); |
134 | 0 | } |
135 | | |
136 | | /* Push a number of codepoints into the "to normalize" buffer. */ |
137 | | void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints); |
138 | | |
139 | | /* Get the number of codepoints/graphemes ready to fetch. */ |
140 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_available(MVMThreadContext *tc, MVMNormalizer *n) { |
141 | 0 | return n->buffer_norm_end - n->buffer_start; |
142 | 0 | } |
143 | | |
144 | | /* Get the number of codepoints/graphemes ready to fetch. */ |
145 | 0 | MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_empty(MVMThreadContext *tc, MVMNormalizer *n) { |
146 | 0 | return n->buffer_end == n->buffer_start; |
147 | 0 | } |
148 | | |
149 | | /* Indicate that we've reached the end of the input stream. Any codepoints |
150 | | * left to normalize now can be. */ |
151 | | void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n); |
152 | | |
153 | | /* Get a normalized codepoint; should only ever be called if there are some |
154 | | * known to be available, either because normalize_to_codepoint returned a |
155 | | * value greater than 1, or normalize_available returned a non-zero value. */ |
156 | 0 | MVM_STATIC_INLINE MVMCodepoint MVM_unicode_normalizer_get_codepoint(MVMThreadContext *tc, MVMNormalizer *n) { |
157 | 0 | if (n->buffer_norm_end == n->buffer_start) |
158 | 0 | MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get codepoint"); |
159 | 0 | return n->buffer[n->buffer_start++]; |
160 | 0 | } |
161 | | |
162 | | /* Grapheme version of the above. Note that this exists mostly for API clarity |
163 | | * rather than adding any semantics; the normalizer must be configured to |
164 | | * produce NFG to get synthetics out. */ |
165 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_get_grapheme(MVMThreadContext *tc, MVMNormalizer *n) { |
166 | 0 | assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32)); |
167 | 0 | if (n->buffer_norm_end == n->buffer_start) |
168 | 0 | MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get grapheme"); |
169 | 0 | return (MVMGrapheme32)n->buffer[n->buffer_start++]; |
170 | 0 | } |
171 | | |
172 | | /* Setup and teardown of the MVMNormalizer struct. */ |
173 | | MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in); |
174 | | void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization norm); |
175 | | void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n); |
176 | | void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n); |
177 | | |
178 | | /* High-level normalize implementation, working from an input array of |
179 | | * codepoints and producing an output array of codepoints. */ |
180 | | void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form); |
181 | | |
182 | | /* High-level function to produces an NFG string from an input array of |
183 | | * codepoints. */ |
184 | | MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc,const MVMObject *codes); |
185 | | MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count); |
186 | | |
187 | | /* High-level function to produce an array of codepoints from a string. */ |
188 | | void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out); |
189 | | |
190 | | /* faster atoi function */ |
191 | 0 | MVM_STATIC_INLINE MVMint32 fast_atoi( const char * dec_str ) { |
192 | 0 | MVMint32 value = 0; |
193 | 0 | while( *dec_str ) { |
194 | 0 | value = value*10 + (*dec_str++ - '0'); |
195 | 0 | } |
196 | 0 | return value; |
197 | 0 | } |
198 | | MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp); |
199 | | MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm); |
200 | | MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp); |
201 | | MVMint32 MVM_string_is_control_full(MVMThreadContext *tc, MVMCodepoint in); |
202 | | /* Function for choosing the appropriate line-ending grapheme depending on if |
203 | | * newline translation is enabled. */ |
204 | 0 | MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_translated_crlf(MVMThreadContext *tc, MVMNormalizer *n) { |
205 | 0 | return n->translate_newlines |
206 | 0 | ? '\n' |
207 | 0 | : MVM_nfg_crlf_grapheme(tc); |
208 | 0 | } |