/home/travis/build/MoarVM/MoarVM/src/strings/normalize.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
/* Individual codepoints that the grapheme segmentation code in this file
 * checks for by value. Listed in codepoint order. */
#define UNI_CP_ZERO_WIDTH_NON_JOINER 0x200C
#define UNI_CP_ZERO_WIDTH_JOINER     0x200D
#define UNI_CP_FEMALE_SIGN           0x2640
#define UNI_CP_MALE_SIGN             0x2642
6 | | |
7 | | /* Maps outside-world normalization form codes to our internal set, validating |
8 | | * that we got something valid. */ |
9 | 3 | MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in) { |
10 | 3 | switch (form_in) { |
11 | 3 | case 1: return MVM_NORMALIZE_NFC; |
12 | 0 | case 2: return MVM_NORMALIZE_NFD; |
13 | 0 | case 3: return MVM_NORMALIZE_NFKC; |
14 | 0 | case 4: return MVM_NORMALIZE_NFKD; |
15 | 0 | default: MVM_exception_throw_adhoc(tc, "Invalid normalization form %d", (int)form_in); |
16 | 3 | } |
17 | 3 | } |
18 | | |
19 | | /* Takes two objects, which must be of VMArray representation and holding |
20 | | * 32-bit integers. Performs normalization to the specified form. */ |
21 | 8 | static void assert_codepoint_array(MVMThreadContext *tc, const MVMObject *arr, char *error) { |
22 | 8 | if (IS_CONCRETE(arr) && REPR(arr)->ID == MVM_REPR_ID_VMArray) { |
23 | 8 | MVMuint8 slot_type = ((MVMArrayREPRData *)STABLE(arr)->REPR_data)->slot_type; |
24 | 8 | if (slot_type == MVM_ARRAY_I32 || slot_type == MVM_ARRAY_U32) |
25 | 8 | return; |
26 | 8 | } |
27 | 0 | MVM_exception_throw_adhoc(tc, "%s", error); |
28 | 0 | } |
29 | 96 | MVM_STATIC_INLINE void maybe_grow_result(MVMCodepoint **result, MVMint64 *result_alloc, MVMint64 needed) { |
30 | 96 | if (needed >= *result_alloc) { |
31 | 8 | while (needed >= *result_alloc) |
32 | 4 | *result_alloc += 32; |
33 | 4 | *result = MVM_realloc(*result, *result_alloc * sizeof(MVMCodepoint)); |
34 | 4 | } |
35 | 96 | } |
36 | 1 | void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form) { |
37 | 1 | MVMNormalizer norm; |
38 | 1 | MVMCodepoint *input; |
39 | 1 | MVMCodepoint *result; |
40 | 1 | MVMint64 input_pos, input_codes, result_pos, result_alloc; |
41 | 1 | MVMint32 ready; |
42 | 1 | |
43 | 1 | /* Validate input/output array. */ |
44 | 1 | assert_codepoint_array(tc, in, "Normalization input must be native array of 32-bit integers"); |
45 | 1 | assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers"); |
46 | 1 | |
47 | 1 | /* Get input array; if it's empty, we're done already. */ |
48 | 1 | input = (MVMCodepoint *)((MVMArray *)in)->body.slots.u32 + ((MVMArray *)in)->body.start; |
49 | 1 | input_codes = ((MVMArray *)in)->body.elems; |
50 | 1 | if (input_codes == 0) |
51 | 0 | return; |
52 | 1 | |
53 | 1 | /* Guess output size based on input size. */ |
54 | 1 | result_alloc = input_codes; |
55 | 1 | result = MVM_malloc(result_alloc * sizeof(MVMCodepoint)); |
56 | 1 | |
57 | 1 | /* Perform normalization. */ |
58 | 1 | MVM_unicode_normalizer_init(tc, &norm, form); |
59 | 1 | input_pos = 0; |
60 | 1 | result_pos = 0; |
61 | 3 | while (input_pos < input_codes) { |
62 | 2 | MVMCodepoint cp; |
63 | 2 | ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, input[input_pos], &cp); |
64 | 2 | if (ready) { |
65 | 0 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
66 | 0 | result[result_pos++] = cp; |
67 | 0 | while (--ready > 0) |
68 | 0 | result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm); |
69 | 0 | } |
70 | 2 | input_pos++; |
71 | 2 | } |
72 | 1 | MVM_unicode_normalizer_eof(tc, &norm); |
73 | 1 | ready = MVM_unicode_normalizer_available(tc, &norm); |
74 | 1 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
75 | 3 | while (ready--) |
76 | 2 | result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm); |
77 | 1 | MVM_unicode_normalizer_cleanup(tc, &norm); |
78 | 1 | |
79 | 1 | /* Put result into array body. */ |
80 | 1 | ((MVMArray *)out)->body.slots.u32 = (MVMuint32 *) result; |
81 | 1 | ((MVMArray *)out)->body.start = 0; |
82 | 1 | ((MVMArray *)out)->body.elems = result_pos; |
83 | 1 | } |
84 | 53 | MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count) { |
85 | 53 | MVMNormalizer norm; |
86 | 53 | MVMint64 input_pos, result_pos, result_alloc; |
87 | 53 | MVMGrapheme32 *result; |
88 | 53 | MVMint32 ready; |
89 | 53 | MVMString *str; |
90 | 53 | |
91 | 53 | if (cp_count == 0) |
92 | 0 | return tc->instance->str_consts.empty; |
93 | 53 | |
94 | 53 | /* Guess output size based on cp_v size. */ |
95 | 53 | result_alloc = cp_count; |
96 | 53 | result = MVM_malloc(result_alloc * sizeof(MVMCodepoint)); |
97 | 53 | |
98 | 53 | /* Perform normalization at grapheme level. */ |
99 | 53 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
100 | 53 | input_pos = 0; |
101 | 53 | result_pos = 0; |
102 | 170 | while (input_pos < cp_count) { |
103 | 117 | MVMGrapheme32 g; |
104 | 117 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp_v[input_pos], &g); |
105 | 117 | if (ready) { |
106 | 38 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
107 | 38 | result[result_pos++] = g; |
108 | 39 | while (--ready > 0) |
109 | 1 | result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
110 | 38 | } |
111 | 117 | input_pos++; |
112 | 117 | } |
113 | 53 | MVM_unicode_normalizer_eof(tc, &norm); |
114 | 53 | ready = MVM_unicode_normalizer_available(tc, &norm); |
115 | 53 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
116 | 73 | while (ready--) |
117 | 20 | result[result_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
118 | 53 | MVM_unicode_normalizer_cleanup(tc, &norm); |
119 | 53 | |
120 | 53 | /* Produce an MVMString of the result. */ |
121 | 53 | str = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString); |
122 | 53 | str->body.storage.blob_32 = result; |
123 | 53 | str->body.storage_type = MVM_STRING_GRAPHEME_32; |
124 | 53 | str->body.num_graphs = result_pos; |
125 | 53 | return str; |
126 | 53 | } |
127 | | |
128 | | /* Takes an object, which must be of VMArray representation and holding |
129 | | * 32-bit integers. Treats them as Unicode codepoints, normalizes them at |
130 | | * Grapheme level, and returns the resulting NFG string. */ |
131 | 4 | MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc, const MVMObject *codes) { |
132 | 4 | MVMCodepoint *input; |
133 | 4 | MVMint64 input_codes; |
134 | 4 | |
135 | 4 | assert_codepoint_array(tc, codes, "Code points to string input must be native array of 32-bit integers"); |
136 | 4 | |
137 | 4 | input = (MVMCodepoint *)((MVMArray *)codes)->body.slots.u32 + ((MVMArray *)codes)->body.start; |
138 | 4 | input_codes = ((MVMArray *)codes)->body.elems; |
139 | 4 | return MVM_unicode_codepoints_c_array_to_nfg_string(tc, input, input_codes); |
140 | 4 | } |
141 | | |
142 | | /* Takes an NFG string and populates the array out, which must be a 32-bit |
143 | | * integer array, with codepoints normalized according to the specified |
144 | | * normalization form. */ |
145 | 2 | void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) { |
146 | 2 | MVMCodepoint *result; |
147 | 2 | MVMint64 result_pos, result_alloc; |
148 | 2 | MVMCodepointIter ci; |
149 | 2 | |
150 | 2 | /* Validate output array and set up result storage. */ |
151 | 2 | assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers"); |
152 | 2 | result_alloc = s->body.num_graphs; |
153 | 2 | result = MVM_malloc(result_alloc * sizeof(MVMCodepoint)); |
154 | 2 | result_pos = 0; |
155 | 2 | |
156 | 2 | /* Create codepoint iterator. */ |
157 | 2 | MVM_string_ci_init(tc, &ci, s, 0, 0); |
158 | 2 | |
159 | 2 | /* If we want NFC, just iterate, since NFG is constructed out of NFC. */ |
160 | 2 | if (form == MVM_NORMALIZE_NFC) { |
161 | 6 | while (MVM_string_ci_has_more(tc, &ci)) { |
162 | 4 | maybe_grow_result(&result, &result_alloc, result_pos + 1); |
163 | 4 | result[result_pos++] = MVM_string_ci_get_codepoint(tc, &ci); |
164 | 4 | } |
165 | 2 | } |
166 | 2 | |
167 | 2 | /* Otherwise, need to feed it through a normalizer. */ |
168 | 0 | else { |
169 | 0 | MVMNormalizer norm; |
170 | 0 | MVMint32 ready; |
171 | 0 | MVM_unicode_normalizer_init(tc, &norm, form); |
172 | 0 | while (MVM_string_ci_has_more(tc, &ci)) { |
173 | 0 | MVMCodepoint cp; |
174 | 0 | ready = MVM_unicode_normalizer_process_codepoint(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &cp); |
175 | 0 | if (ready) { |
176 | 0 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
177 | 0 | result[result_pos++] = cp; |
178 | 0 | while (--ready > 0) |
179 | 0 | result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm); |
180 | 0 | } |
181 | 0 | } |
182 | 0 | MVM_unicode_normalizer_eof(tc, &norm); |
183 | 0 | ready = MVM_unicode_normalizer_available(tc, &norm); |
184 | 0 | maybe_grow_result(&result, &result_alloc, result_pos + ready); |
185 | 0 | while (ready--) |
186 | 0 | result[result_pos++] = MVM_unicode_normalizer_get_codepoint(tc, &norm); |
187 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
188 | 0 | } |
189 | 2 | |
190 | 2 | /* Put result into array body. */ |
191 | 2 | ((MVMArray *)out)->body.slots.u32 = (MVMuint32 *)result; |
192 | 2 | ((MVMArray *)out)->body.start = 0; |
193 | 2 | ((MVMArray *)out)->body.elems = result_pos; |
194 | 2 | } |
195 | | |
196 | | /* Initialize the MVMNormalizer pointed to to perform the specified kind of |
197 | | * normalization. */ |
198 | 56.6k | void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization form) { |
199 | 56.6k | n->form = form; |
200 | 56.6k | n->buffer_size = 32; |
201 | 56.6k | n->buffer = MVM_malloc(n->buffer_size * sizeof(MVMCodepoint)); |
202 | 56.6k | n->buffer_start = 0; |
203 | 56.6k | n->buffer_end = 0; |
204 | 56.6k | n->buffer_norm_end = 0; |
205 | 56.6k | n->translate_newlines = 0; |
206 | 56.6k | n->prepend_buffer = 0; |
207 | 56.6k | n->regional_indicator = 0; |
208 | 56.6k | switch (n->form) { |
209 | 2.77k | case MVM_NORMALIZE_NFD: |
210 | 2.77k | n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFD; |
211 | 2.77k | n->quick_check_property = MVM_UNICODE_PROPERTY_NFD_QC; |
212 | 2.77k | break; |
213 | 0 | case MVM_NORMALIZE_NFKD: |
214 | 0 | n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKD; |
215 | 0 | n->quick_check_property = MVM_UNICODE_PROPERTY_NFKD_QC; |
216 | 0 | break; |
217 | 1 | case MVM_NORMALIZE_NFC: |
218 | 1 | n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC; |
219 | 1 | n->quick_check_property = MVM_UNICODE_PROPERTY_NFC_QC; |
220 | 1 | break; |
221 | 0 | case MVM_NORMALIZE_NFKC: |
222 | 0 | n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKC; |
223 | 0 | n->quick_check_property = MVM_UNICODE_PROPERTY_NFKC_QC; |
224 | 0 | break; |
225 | 53.9k | case MVM_NORMALIZE_NFG: |
226 | 53.9k | n->quick_check_property = MVM_UNICODE_PROPERTY_NFG_QC; |
227 | 53.9k | n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC; |
228 | 53.9k | break; |
229 | 0 | default: |
230 | 0 | abort(); |
231 | 56.6k | } |
232 | 56.6k | } |
233 | | |
234 | | /* Enable translation of newlines from \r\n to \n. */ |
235 | 647 | void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n) { |
236 | 647 | n->translate_newlines = 1; |
237 | 647 | } |
238 | | |
239 | | /* Cleanup an MVMNormalization once we're done normalizing. */ |
240 | 56.1k | void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n) { |
241 | 56.1k | free(n->buffer); |
242 | 56.1k | } |
243 | | |
244 | | /* Adds a codepoint into the buffer, making sure there's space. */ |
245 | 52.6k | static void add_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) { |
246 | 52.6k | if (n->buffer_end == n->buffer_size) { |
247 | 73 | if (n->buffer_start != 0) { |
248 | 73 | MVMint32 shuffle = n->buffer_start; |
249 | 73 | MVMint32 to_move = n->buffer_end - n->buffer_start; |
250 | 73 | memmove(n->buffer, n->buffer + n->buffer_start, to_move * sizeof(MVMCodepoint)); |
251 | 73 | n->buffer_start = 0; |
252 | 73 | n->buffer_end -= shuffle; |
253 | 73 | n->buffer_norm_end -= shuffle; |
254 | 73 | } |
255 | 0 | else { |
256 | 0 | n->buffer_size *= 2; |
257 | 0 | n->buffer = MVM_realloc(n->buffer, n->buffer_size * sizeof(MVMCodepoint)); |
258 | 0 | } |
259 | 73 | } |
260 | 52.6k | n->buffer[n->buffer_end++] = cp; |
261 | 52.6k | } |
262 | | |
/* Hangul-related constants from Unicode spec 3.12, following the naming
 * convention from the spec. */
static const int SBase  = 0xAC00;
static const int LBase  = 0x1100;
static const int VBase  = 0x1161;
static const int TBase  = 0x11A7;
static const int LCount = 19;
static const int VCount = 21;
static const int TCount = 28;
static const int NCount = 588;   /* VCount * TCount */
static const int SCount = 11172; /* LCount * NCount */
271 | | |
272 | | /* Decomposes a Hangul codepoint and add it into the buffer. */ |
273 | 0 | static void decomp_hangul_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint s) { |
274 | 0 | /* Algorithm from Unicode spec 3.12, following naming convention from spec. */ |
275 | 0 | int SIndex = s - SBase; |
276 | 0 | if (SIndex < 0 || SIndex >= SCount) { |
277 | 0 | add_codepoint_to_buffer(tc, n, s); |
278 | 0 | } |
279 | 0 | else { |
280 | 0 | int L = LBase + SIndex / NCount; |
281 | 0 | int V = VBase + (SIndex % NCount) / TCount; |
282 | 0 | int T = TBase + SIndex % TCount; |
283 | 0 | add_codepoint_to_buffer(tc, n, (MVMCodepoint)L); |
284 | 0 | add_codepoint_to_buffer(tc, n, (MVMCodepoint)V); |
285 | 0 | if (T != TBase) |
286 | 0 | add_codepoint_to_buffer(tc, n, (MVMCodepoint)T); |
287 | 0 | } |
288 | 0 | } |
289 | | |
290 | | /* Decompose the codepoint and add it into the buffer. */ |
291 | 1.87k | static void decomp_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) { |
292 | 1.87k | /* See if we actually need to decompose (can skip if the decomposition |
293 | 1.87k | * type is None, or we're only doing Canonical decomposition and it is |
294 | 1.87k | * anything except Canonical). */ |
295 | 1.87k | MVMint16 cp_DT = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE); |
296 | 1.87k | MVMint64 decompose = 1; |
297 | 1.87k | if (cp_DT == MVM_UNICODE_PVALUE_DT_NONE) |
298 | 1.75k | decompose = 0; |
299 | 119 | else if (!MVM_NORMALIZE_COMPAT_DECOMP(n->form) && cp_DT != MVM_UNICODE_PVALUE_DT_CANONICAL ) |
300 | 12 | decompose = 0; |
301 | 1.87k | if (decompose) { |
302 | 107 | /* We need to decompose. Get the decomp spec and go over the things in |
303 | 107 | * it; things without a decomp spec are presumably Hangul and need the |
304 | 107 | * algorithmic treatment. */ |
305 | 107 | char *spec = (char *)MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_DECOMP_SPEC); |
306 | 107 | if (spec && spec[0]) { |
307 | 107 | char *end = spec + strlen(spec); |
308 | 310 | while (spec < end) { |
309 | 203 | /* Parse hex character code, and then recurse to do any further |
310 | 203 | * decomposition on it; this recursion terminates when we find a |
311 | 203 | * non-decomposable thing and add it to the buffer. */ |
312 | 203 | MVMCodepoint decomp_char = (MVMCodepoint)strtol(spec, &spec, 16); |
313 | 203 | decomp_codepoint_to_buffer(tc, n, decomp_char); |
314 | 203 | } |
315 | 107 | } |
316 | 0 | else { |
317 | 0 | decomp_hangul_to_buffer(tc, n, cp); |
318 | 0 | } |
319 | 107 | } |
320 | 1.76k | else { |
321 | 1.76k | /* Don't need to decompose; add it right into the buffer. */ |
322 | 1.76k | add_codepoint_to_buffer(tc, n, cp); |
323 | 1.76k | } |
324 | 1.87k | } |
325 | | |
326 | | /* Checks if the specified character answers "yes" on the appropriate quick check. */ |
327 | 154k | static MVMint64 passes_quickcheck(MVMThreadContext *tc, const MVMNormalizer *n, MVMCodepoint cp) { |
328 | 154k | const char *pval = MVM_unicode_codepoint_get_property_cstr(tc, cp, n->quick_check_property); |
329 | 154k | return pval && pval[0] == 'Y'; |
330 | 154k | } |
331 | | |
332 | | /* Gets the CCC (actual value) but is slower as it looks up with string properties |
333 | | * Exact values are not needed for normalization. |
334 | | * Returns 0 for Not_Reordered codepoints *and* CCC 0 codepoints */ |
335 | 0 | static MVMint64 ccc_old(MVMThreadContext *tc, MVMCodepoint cp) { |
336 | 0 | if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC) { |
337 | 0 | return 0; |
338 | 0 | } |
339 | 0 | else { |
340 | 0 | const char *ccc_str = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS); |
341 | 0 | return !ccc_str || strlen(ccc_str) > 3 ? 0 : fast_atoi(ccc_str); |
342 | 0 | } |
343 | 0 | } |
344 | | /* Gets the canonical combining class for a codepoint. Does a shortcut |
345 | | * since CCC is stored as a string property, though because they are all sorted |
346 | | * numerically it is ok to get the internal integer value as stored instead of |
347 | | * the string. |
348 | | * Returns 0 for Not_Reordered codepoints *and* CCC 0 codepoints */ |
349 | 170k | MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp) { |
350 | 170k | if (cp < MVM_NORMALIZE_FIRST_NONZERO_CCC) { |
351 | 51.9k | return 0; |
352 | 51.9k | } |
353 | 118k | else { |
354 | 118k | int ccc_int = MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_CANONICAL_COMBINING_CLASS); |
355 | 118k | return ccc_int <= MVM_UNICODE_PVALUE_CCC_0 ? 0 : ccc_int - MVM_UNICODE_PVALUE_CCC_0; |
356 | 118k | } |
357 | 170k | } |
358 | | |
359 | | /* Checks if the thing we have is a control character (for the definition in |
360 | | * the Unicode Standard Annex #29). Full path. Fast path checks for controls |
361 | | * in the Latin-1 range. This works for those as well but needs a property lookup */ |
362 | 59.2k | MVMint32 MVM_string_is_control_full(MVMThreadContext *tc, MVMCodepoint in) { |
363 | 59.2k | /* U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER are excluded because |
364 | 59.2k | * they are Cf but not Control's */ |
365 | 59.2k | if (in != UNI_CP_ZERO_WIDTH_NON_JOINER && in != UNI_CP_ZERO_WIDTH_JOINER) { |
366 | 59.2k | /* Consider general property: |
367 | 59.2k | * Cc, Zl, Zp, and Cn which are also Default_Ignorable_Code_Point=True */ |
368 | 59.2k | const char *genprop = MVM_unicode_codepoint_get_property_cstr(tc, in, |
369 | 59.2k | MVM_UNICODE_PROPERTY_GENERAL_CATEGORY); |
370 | 59.2k | switch (genprop[0]) { |
371 | 3.11k | case 'Z': |
372 | 3.11k | /* Line_Separator and Paragraph_Separator are controls. */ |
373 | 2.96k | return genprop[1] == 'l' || genprop[1] == 'p'; |
374 | 310 | case 'C': |
375 | 310 | /* Control, Surrogate are controls. */ |
376 | 310 | if (genprop[1] == 'c' || genprop[1] == 's') { |
377 | 0 | return 1; |
378 | 0 | } |
379 | 310 | if (genprop[1] == 'f' ) { |
380 | 308 | /* Format can have special properties (not control) */ |
381 | 308 | return 0; |
382 | 308 | } |
383 | 310 | /* Unassigned is, but only for Default_Ignorable_Code_Point. */ |
384 | 2 | if (genprop[1] == 'n') { |
385 | 0 | return MVM_unicode_codepoint_get_property_int(tc, in, |
386 | 0 | MVM_UNICODE_PROPERTY_DEFAULT_IGNORABLE_CODE_POINT) != 0; |
387 | 0 | } |
388 | 59.2k | } |
389 | 59.2k | } |
390 | 55.8k | return 0; |
391 | 59.2k | } |
392 | | |
393 | | /* Implements the Unicode Canonical Ordering algorithm (3.11, D109). */ |
394 | 53.2k | static void canonical_sort(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) { |
395 | 53.2k | /* Yes, this is the simplest possible thing. Key thing if you decide to |
396 | 53.2k | * replace it with something more optimal: it must not re-order code |
397 | 53.2k | * points with equal CCC. */ |
398 | 53.2k | MVMint32 reordered = 1; |
399 | 106k | while (reordered) { |
400 | 53.2k | MVMint32 i = from; |
401 | 53.2k | reordered = 0; |
402 | 55.6k | while (i < to - 1) { |
403 | 2.42k | MVMint64 cccA = MVM_unicode_relative_ccc(tc, n->buffer[i]); |
404 | 2.42k | MVMint64 cccB = MVM_unicode_relative_ccc(tc, n->buffer[i + 1]); |
405 | 2.42k | if (cccA > cccB && cccB > 0) { |
406 | 3 | MVMCodepoint tmp = n->buffer[i]; |
407 | 3 | n->buffer[i] = n->buffer[i + 1]; |
408 | 3 | n->buffer[i + 1] = tmp; |
409 | 3 | reordered = 1; |
410 | 3 | } |
411 | 2.42k | i++; |
412 | 2.42k | } |
413 | 53.2k | } |
414 | 53.2k | } |
415 | | |
416 | | /* Implements the Unicode Canonical Composition algorithm (3.11, D117). */ |
417 | 50.4k | static void canonical_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) { |
418 | 50.4k | MVMint32 c_idx = from + 1; |
419 | 52.7k | while (c_idx < to) { |
420 | 2.32k | /* Search for the last non-blocked starter. */ |
421 | 2.32k | MVMint32 ss_idx = c_idx - 1; |
422 | 2.32k | MVMint32 c_ccc = MVM_unicode_relative_ccc(tc, n->buffer[c_idx]); |
423 | 2.32k | while (ss_idx >= from) { |
424 | 2.32k | /* Make sure we don't go past a code point that blocks a starter |
425 | 2.32k | * from the current character we're considering. */ |
426 | 2.32k | MVMint32 ss_ccc = MVM_unicode_relative_ccc(tc, n->buffer[ss_idx]); |
427 | 2.32k | if (ss_ccc >= c_ccc && ss_ccc != 0) |
428 | 0 | break; |
429 | 2.32k | |
430 | 2.32k | /* Have we found a starter? */ |
431 | 2.32k | if (ss_ccc == 0) { |
432 | 2.32k | /* See if there's a primary composite for the starter and the |
433 | 2.32k | * current code point under consideration. */ |
434 | 2.32k | MVMCodepoint pc = MVM_unicode_find_primary_composite(tc, n->buffer[ss_idx], n->buffer[c_idx]); |
435 | 2.32k | if (pc > 0) { |
436 | 12 | /* Replace the starter with the primary composite. */ |
437 | 12 | n->buffer[ss_idx] = pc; |
438 | 12 | |
439 | 12 | /* Move the rest of the buffer back one position. */ |
440 | 12 | memmove(n->buffer + c_idx, n->buffer + c_idx + 1, |
441 | 12 | (n->buffer_end - (c_idx + 1)) * sizeof(MVMCodepoint)); |
442 | 12 | n->buffer_end--; |
443 | 12 | |
444 | 12 | /* Sync cc_idx and to with the change. */ |
445 | 12 | c_idx--; |
446 | 12 | to--; |
447 | 12 | } |
448 | 2.32k | |
449 | 2.32k | /* Don't look back beyond this starter; covers the ccc(B) = 0 |
450 | 2.32k | * case of D105. */ |
451 | 2.32k | break; |
452 | 2.32k | } |
453 | 1 | ss_idx--; |
454 | 1 | } |
455 | 2.32k | |
456 | 2.32k | /* Move on to the next character. */ |
457 | 2.32k | c_idx++; |
458 | 2.32k | } |
459 | 50.4k | |
460 | 50.4k | /* Make another pass for the Hangul special case. (A future optimization |
461 | 50.4k | * may be to incorporate this into the above loop.) */ |
462 | 50.4k | c_idx = from; |
463 | 52.7k | while (c_idx < to - 1) { |
464 | 2.31k | /* Do we have a potential LPart? */ |
465 | 2.31k | MVMCodepoint LPart = n->buffer[c_idx]; |
466 | 2.31k | if (LPart >= LBase && LPart <= (LBase + LCount)) { |
467 | 0 | /* Yes, now see if it's followed by a VPart (always safe to look |
468 | 0 | * due to "to - 1" in loop condition above). */ |
469 | 0 | MVMCodepoint LIndex = LPart - LBase; |
470 | 0 | MVMCodepoint VPart = n->buffer[c_idx + 1]; |
471 | 0 | if (VPart >= VBase && VPart <= (VBase + VCount)) { |
472 | 0 | /* Certainly something to compose; compute that. */ |
473 | 0 | MVMCodepoint VIndex = VPart - VBase; |
474 | 0 | MVMCodepoint LVIndex = LIndex * NCount + VIndex * TCount; |
475 | 0 | MVMCodepoint s = SBase + LVIndex; |
476 | 0 | MVMint32 composed = 1; |
477 | 0 |
|
478 | 0 | /* Is there a TPart too? */ |
479 | 0 | if (c_idx < to - 2) { |
480 | 0 | MVMCodepoint TPart = n->buffer[c_idx + 2]; |
481 | 0 | if (TPart >= TBase && TPart <= (TBase + TCount)) { |
482 | 0 | /* We need to compose 3 things. */ |
483 | 0 | MVMCodepoint TIndex = TPart - TBase; |
484 | 0 | s += TIndex; |
485 | 0 | composed = 2; |
486 | 0 | } |
487 | 0 | } |
488 | 0 |
|
489 | 0 | /* Put composed codepoint into the buffer. */ |
490 | 0 | n->buffer[c_idx] = s; |
491 | 0 |
|
492 | 0 | /* Shuffle codepoints after this in the buffer back. */ |
493 | 0 | memmove(n->buffer + c_idx + 1, n->buffer + c_idx + 1 + composed, |
494 | 0 | (n->buffer_end - (c_idx + 1 + composed)) * sizeof(MVMCodepoint)); |
495 | 0 | n->buffer_end -= composed; |
496 | 0 |
|
497 | 0 | /* Sync to with updated buffer size. */ |
498 | 0 | to -= composed; |
499 | 0 | } |
500 | 0 | } |
501 | 2.31k | c_idx++; |
502 | 2.31k | } |
503 | 50.4k | } |
504 | | |
505 | | /* Performs grapheme composition (to get Normal Form Grapheme) on the range of |
506 | | * codepoints provided. This follows the algorithm in the Unicode Standard |
507 | | * Annex #29 on grapheme cluster boundaries. Note that we have already done |
508 | | * the handling of breaking around controls much earlier, so don't have to |
509 | | * consider that case. */ |
510 | 0 | static MVMint32 maybe_hangul(MVMCodepoint cp) { |
511 | 0 | return (0x1100 <= cp && cp < 0x1200) || (0xA960 <= cp && cp < 0xD7FC); |
512 | 0 | } |
513 | 0 | static MVMint32 is_grapheme_extend(MVMThreadContext *tc, MVMCodepoint cp) { |
514 | 0 | return MVM_unicode_codepoint_get_property_int(tc, cp, |
515 | 0 | MVM_UNICODE_PROPERTY_GRAPHEME_EXTEND); |
516 | 0 | } |
517 | 101k | static MVMint32 is_grapheme_prepend(MVMThreadContext *tc, MVMCodepoint cp) { |
518 | 101k | return MVM_unicode_codepoint_get_property_int(tc, cp, |
519 | 101k | MVM_UNICODE_PROPERTY_PREPENDED_CONCATENATION_MARK); |
520 | 101k | } |
521 | | /* Returns 0 if the two graphemes should be combined and returns 1 or 2 if |
522 | | * the graphemes should break. 2 is returned if more than the currenly seen |
523 | | * graphemes may be needed to determine the breaking (this is only needed if |
524 | | * we are checking two arbitrary codepoints. If we are normalizing linearly from |
525 | | * the start of the string this has no more significance than returning 1) */ |
526 | 8.87k | MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm) { |
527 | 8.87k | int GCB_a, GCB_b; |
528 | 8.87k | |
529 | 8.87k | /* Don't break between \r and \n, but otherwise break around \r. */ |
530 | 8.87k | if (a == 0x0D && b == 0x0A) |
531 | 359 | return 0; |
532 | 8.51k | if (a == 0x0D || b == 0x0D) |
533 | 176 | return 1; |
534 | 8.51k | /* For utf8-c8 graphemes. These we can't request property values and act like |
535 | 8.51k | * control's */ |
536 | 8.34k | if (a < 0 || b < 0) { |
537 | 0 | if ((a < 0 && MVM_nfg_get_synthetic_info(tc, a)->is_utf8_c8) || (b < 0 && MVM_nfg_get_synthetic_info(tc, b)->is_utf8_c8)) |
538 | 0 | return 1; |
539 | 0 |
|
540 | 0 | MVM_exception_throw_adhoc(tc, "Internal error: synthetic grapheme found when computing grapheme segmentation"); |
541 | 0 | } |
542 | 8.34k | GCB_a = MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK); |
543 | 8.34k | GCB_b = MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_GRAPHEME_CLUSTER_BREAK); |
544 | 8.34k | switch (GCB_a) { |
545 | 0 | case MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR: |
546 | 0 | if (2 <= norm->regional_indicator) { |
547 | 0 | norm->regional_indicator = 0; |
548 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR) |
549 | 0 | /* Return 2 here so is_concat_stable can know to run re_nfg */ |
550 | 0 | return 2; |
551 | 0 | } |
552 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_REGIONAL_INDICATOR) { |
553 | 0 | if (!norm->regional_indicator) |
554 | 0 | norm->regional_indicator = 2; |
555 | 0 | else |
556 | 0 | norm->regional_indicator++; |
557 | 0 | return 0; |
558 | 0 | } |
559 | 0 | break; |
560 | 0 | /* Don't break after Prepend Grapheme_Cluster_Break=Prepend */ |
561 | 0 | case MVM_UNICODE_PVALUE_GCB_PREPEND: |
562 | 0 | /* If it's a control character remember to break */ |
563 | 0 | if (MVM_string_is_control_full(tc, b )) { |
564 | 0 | return 1; |
565 | 0 | } |
566 | 0 | /* Otherwise don't break */ |
567 | 0 | return 0; |
568 | 0 | /* Don't break after ZWJ for E_Base_GAZ or Glue_After_ZWJ */ |
569 | 3 | case MVM_UNICODE_PVALUE_GCB_ZWJ: |
570 | 3 | switch (GCB_b) { |
571 | 3 | case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ: |
572 | 3 | case MVM_UNICODE_PVALUE_GCB_ZWJ: |
573 | 3 | case MVM_UNICODE_PVALUE_GCB_GLUE_AFTER_ZWJ: |
574 | 3 | return 0; |
575 | 3 | } |
576 | 0 | if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN ) |
577 | 0 | return 0; |
578 | 0 | /* Don't break after ZWJ for Emoji property characters that have |
579 | 0 | * GCB=Other. This is *not* a unicode text segmentation rule but |
580 | 0 | * is needed to not break inside Emoji sequences. As the rule to |
581 | 0 | * not break in Emoji sequences is specified by Unicode to need |
582 | 0 | * customization to perform properly. */ |
583 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_OTHER |
584 | 0 | && 127 < b /* Numbers and # have property Emoji. So make sure we're not in ASCII range */ |
585 | 0 | && MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_EMOJI) ) |
586 | 0 | return 0; |
587 | 0 | case MVM_UNICODE_PVALUE_GCB_E_MODIFIER: |
588 | 0 | if (MVM_unicode_codepoint_get_property_int(tc, b, MVM_UNICODE_PROPERTY_EMOJI_MODIFIER_BASE)) { |
589 | 0 | /* Don't break after ZWJ if it's an Emoji Sequence. |
590 | 0 | * At the moment FEMALE SIGN and MALE SIGN don't have different |
591 | 0 | * GCB properties, or any special Emoji properties (Unicode 9.0), |
592 | 0 | * so we explictly check these codepoints here */ |
593 | 0 | if ( b == UNI_CP_FEMALE_SIGN || b == UNI_CP_MALE_SIGN ) |
594 | 0 | return 0; |
595 | 0 | } |
596 | 0 | break; |
597 | 0 | case MVM_UNICODE_PVALUE_GCB_L: |
598 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_L || GCB_b == MVM_UNICODE_PVALUE_GCB_V || |
599 | 0 | GCB_b == MVM_UNICODE_PVALUE_GCB_LV || GCB_b == MVM_UNICODE_PVALUE_GCB_LVT) |
600 | 0 | return 0; |
601 | 0 | break; |
602 | 0 | case MVM_UNICODE_PVALUE_GCB_LV: |
603 | 0 | case MVM_UNICODE_PVALUE_GCB_V: |
604 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_V || GCB_b == MVM_UNICODE_PVALUE_GCB_T) |
605 | 0 | return 0; |
606 | 0 | break; |
607 | 0 | case MVM_UNICODE_PVALUE_GCB_LVT: |
608 | 0 | case MVM_UNICODE_PVALUE_GCB_T: |
609 | 0 | if (GCB_b == MVM_UNICODE_PVALUE_GCB_T) |
610 | 0 | return 0; |
611 | 0 | break; |
612 | 8.34k | } |
613 | 8.34k | switch (GCB_b) { |
614 | 8.34k | /* Don't break before extending chars */ |
615 | 16 | case MVM_UNICODE_PVALUE_GCB_EXTEND: |
616 | 16 | return 0; |
617 | 16 | /* Don't break before ZWJ */ |
618 | 5 | case MVM_UNICODE_PVALUE_GCB_ZWJ: |
619 | 5 | return 0; |
620 | 3 | case MVM_UNICODE_PVALUE_GCB_E_MODIFIER: |
621 | 3 | switch (GCB_a) { |
622 | 0 | case MVM_UNICODE_PVALUE_GCB_E_BASE_GAZ: |
623 | 0 | return 0; |
624 | 3 | case MVM_UNICODE_PVALUE_GCB_E_BASE: |
625 | 3 | return 0; |
626 | 3 | } |
627 | 0 | if (MVM_unicode_codepoint_get_property_int(tc, a, MVM_UNICODE_PROPERTY_EMOJI_MODIFIER_BASE)) { |
628 | 0 | /* Not all emoji modifiers have E_BASE or E_BASE_GAZ, some cases we need to check the |
629 | 0 | * Emoji_Modifier_Base property */ |
630 | 0 | return 0; |
631 | 0 | } |
632 | 0 | break; |
633 | 0 | /* Don't break before spacing marks. */ |
634 | 0 | case MVM_UNICODE_PVALUE_GCB_SPACINGMARK: |
635 | 0 | return 0; |
636 | 8.34k | } |
637 | 8.34k | |
638 | 8.34k | /* Otherwise break. */ |
639 | 8.31k | return 1; |
640 | 8.34k | } |
641 | 50.4k | static void grapheme_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) { /* Collapses the codepoints in n->buffer[from, to) into graphemes in place, then closes the gap this leaves in the buffer. Ranges shorter than two entries are left untouched. */ |
642 | 50.4k | if (to - from >= 2) { |
643 | 2.27k | MVMint32 starterish = from; /* first codepoint of the grapheme being collected */ |
644 | 2.27k | MVMint32 insert_pos = from; /* where the next finished grapheme is written */ |
645 | 2.27k | MVMint32 pos = from; |
646 | 6.86k | while (pos < to) { |
647 | 4.59k | MVMint32 next_pos = pos + 1; |
648 | 4.59k | if (next_pos == to || MVM_unicode_normalize_should_break(tc, n->buffer[pos], n->buffer[next_pos], n)) { |
649 | 4.21k | /* Either this is the last code point of the range, or there is a |
650 | 4.21k | * grapheme break between it and the next one; turn the sequence starterish..pos into a single grapheme. */ |
651 | 4.21k | MVMGrapheme32 g = MVM_nfg_codes_to_grapheme(tc, n->buffer + starterish, next_pos - starterish); |
652 | 4.21k | if (n->translate_newlines && g == MVM_nfg_crlf_grapheme(tc)) |
653 | 5 | g = '\n'; /* with newline translation on, the CRLF grapheme is replaced by a plain '\n' */ |
654 | 4.21k | n->buffer[insert_pos++] = g; |
655 | 4.21k | |
656 | 4.21k | /* The next code point is our new starterish (harmless if we |
657 | 4.21k | * are already at the end of the range, since the loop then ends). */ |
658 | 4.21k | starterish = next_pos; |
659 | 4.21k | } |
660 | 4.59k | pos++; |
661 | 4.59k | } |
662 | 2.27k | memmove(n->buffer + insert_pos, n->buffer + to, (n->buffer_end - to) * sizeof(MVMCodepoint)); /* slide everything from `to` onwards down so it directly follows the graphemes just written */ |
663 | 2.27k | n->buffer_end -= to - insert_pos; /* the buffer shrank by the number of slots freed */ |
664 | 2.27k | } |
665 | 50.4k | } |
666 | | |
667 | | /* Called when the very fast case of normalization fails (that is, when we get |
668 | | * any two codepoints in a row where at least one is greater than the first |
669 | | * significant codepoint identified by a quick check for the target form). We |
670 | | * may find the quick check itself is enough; if not, we have to do the real work |
671 | | * of computing the normalization. Returns 0 when nothing can be handed back yet; otherwise stores one codepoint in *out and returns how many codepoints are ready, counting that one. */ |
672 | 101k | MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *norm, MVMCodepoint in, MVMCodepoint *out) { |
673 | 101k | MVMint64 qc_in, ccc_in; |
674 | 101k | int is_prepend = is_grapheme_prepend(tc, in); |
675 | 101k | /* prepend_buffer counts down by one per call and is reset to 2 whenever is_grapheme_prepend is true for `in`, so it is non-zero both for that codepoint and for the one that follows it. */ |
676 | 101k | if (MVM_UNLIKELY(0 < norm->prepend_buffer)) |
677 | 0 | norm->prepend_buffer--; |
678 | 101k | if (MVM_UNLIKELY(is_prepend)) |
679 | 0 | norm->prepend_buffer = 2; |
680 | 101k | |
681 | 101k | /* If it's a control character (outside of the range we checked in the |
682 | 101k | * fast path) and not a prepend character, then it's a normalization terminator. */ |
683 | 101k | if (in > 0xFF && MVM_string_is_control_full(tc, in) && !is_prepend) { |
684 | 300 | return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, norm, in, out); |
685 | 300 | } |
686 | 101k | |
687 | 101k | /* Do a quickcheck on the codepoint we got in and get its CCC. */ |
688 | 101k | qc_in = passes_quickcheck(tc, norm, in); |
689 | 101k | ccc_in = MVM_unicode_relative_ccc(tc, in); |
690 | 101k | /* Fast cases when we pass quick check and what we got in has CCC = 0, |
691 | 101k | * and it neither is nor directly follows a prepend character. */ |
692 | 101k | if (qc_in && ccc_in == 0 && norm->prepend_buffer == 0) { |
693 | 100k | if (MVM_NORMALIZE_COMPOSE(norm->form)) { |
694 | 100k | /* We're composing. If we have exactly one thing in the buffer and |
695 | 100k | * it also passes the quick check, and both it and the thing in the |
696 | 100k | * buffer have a CCC of zero, we can hand back the first of the |
697 | 100k | * two - effectively replacing what's in the buffer with the new |
698 | 100k | * codepoint coming in. Note that the NFG quick-check property |
699 | 100k | * factors in grapheme extenders that don't have a CCC of zero, |
700 | 100k | * so we're safe. */ |
701 | 100k | if (norm->buffer_end - norm->buffer_start == 1) { |
702 | 52.7k | MVMCodepoint maybe_result = norm->buffer[norm->buffer_start]; |
703 | 52.7k | if (passes_quickcheck(tc, norm, maybe_result) && MVM_unicode_relative_ccc(tc, maybe_result) == 0) { |
704 | 52.7k | *out = norm->buffer[norm->buffer_start]; |
705 | 52.7k | norm->buffer[norm->buffer_start] = in; |
706 | 52.7k | return 1; |
707 | 52.7k | } |
708 | 52.7k | } |
709 | 100k | } |
710 | 0 | else { |
711 | 0 | /* We're only decomposing. There should probably be nothing in the |
712 | 0 | * buffer in this case; if so we can simply return the codepoint. */ |
713 | 0 | if (norm->buffer_start == norm->buffer_end) { |
714 | 0 | *out = in; |
715 | 0 | return 1; |
716 | 0 | } |
717 | 0 | } |
718 | 100k | } |
719 | 101k | |
720 | 101k | /* If we didn't pass quick check, or we are at or right after a prepend character... */ |
721 | 48.6k | if (!qc_in || 0 < norm->prepend_buffer) { |
722 | 1.10k | /* If we're composing, then decompose the last thing placed in the |
723 | 1.10k | * buffer, if any. We need to do this since it may have passed |
724 | 1.10k | * quickcheck, but having seen some character that does not pass then we |
725 | 1.10k | * must make sure we decomposed the prior passing one too. This is skipped when the new character is a prepend or when nothing un-normalized is in the buffer. */ |
726 | 1.10k | if (MVM_NORMALIZE_COMPOSE(norm->form) && norm->buffer_end != norm->buffer_norm_end && !is_prepend) { |
727 | 244 | MVMCodepoint decomp = norm->buffer[norm->buffer_end - 1]; |
728 | 244 | norm->buffer_end--; |
729 | 244 | decomp_codepoint_to_buffer(tc, norm, decomp); |
730 | 244 | } |
731 | 1.10k | |
732 | 1.10k | /* Decompose this new character into the buffer. We'll need to see |
733 | 1.10k | * more before we can go any further. */ |
734 | 1.10k | decomp_codepoint_to_buffer(tc, norm, in); |
735 | 1.10k | return 0; |
736 | 1.10k | } |
737 | 48.6k | |
738 | 48.6k | /* Since anything we have at this point does pass quick check, add it to |
739 | 48.6k | * the buffer directly. */ |
740 | 47.5k | add_codepoint_to_buffer(tc, norm, in); |
741 | 47.5k | |
742 | 47.5k | /* If the codepoint has a CCC that is non-zero, it's not a starter so we |
743 | 47.5k | * should see more before normalizing. */ |
744 | 47.5k | if (ccc_in > 0) |
745 | 0 | return 0; |
746 | 47.5k | |
747 | 47.5k | /* If the codepoint we just added is the only one in the buffer, it's too early |
748 | 47.5k | * to hand anything back. */ |
749 | 47.5k | if (norm->buffer_end - norm->buffer_start <= 1) |
750 | 47.3k | return 0; |
751 | 47.5k | |
752 | 47.5k | /* Perform canonical sorting on everything from the start of the not yet |
753 | 47.5k | * normalized things in the buffer, up to but excluding the quick-check |
754 | 47.5k | * passing thing we just added. */ |
755 | 189 | canonical_sort(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1); |
756 | 189 | |
757 | 189 | /* Perform canonical composition and grapheme composition if needed, over that same range. */ |
758 | 189 | if (MVM_NORMALIZE_COMPOSE(norm->form)) { |
759 | 189 | canonical_composition(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1); |
760 | 189 | if (MVM_NORMALIZE_GRAPHEME(norm->form)) |
761 | 189 | grapheme_composition(tc, norm, norm->buffer_norm_end, norm->buffer_end - 1); |
762 | 189 | } |
763 | 189 | |
764 | 189 | /* We've now normalized all except the latest, quick-check-passing |
765 | 189 | * codepoint. */ |
766 | 189 | norm->buffer_norm_end = norm->buffer_end - 1; |
767 | 189 | |
768 | 189 | /* Hand back the first buffered codepoint. The count returned is computed before buffer_start is advanced, so it includes the codepoint just handed back. */ |
769 | 189 | *out = norm->buffer[norm->buffer_start]; |
770 | 189 | return norm->buffer_norm_end - norm->buffer_start++; |
771 | 47.5k | } |
772 | | |
773 | | /* Push num_codepoints codepoints from `in`, in order, into the "to normalize" buffer, passing each one to decomp_codepoint_to_buffer. Nothing is handed back to the caller here. */ |
774 | 168 | void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints) { |
775 | 168 | MVMint32 i; |
776 | 484 | for (i = 0; i < num_codepoints; i++) |
777 | 316 | decomp_codepoint_to_buffer(tc, n, in[i]); |
778 | 168 | } |
779 | | |
780 | | /* Processes a codepoint that we regard as a "normalization terminator". These |
781 | | * never have a decomposition, and for all practical purposes will not have a |
782 | | * combiner on them. We treat them specially so that, during I/O, we don't block |
783 | | * waiting to see a codepoint after them, which causes problems for things like |
784 | | * REPLs that need to see input right after a \n. Stores one codepoint in *out and returns the number of codepoints ready, counting that one. */ |
785 | 3.38k | MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) { |
786 | 3.38k | /* Add the codepoint into the buffer as it is. */ |
787 | 3.38k | add_codepoint_to_buffer(tc, n, in); |
788 | 3.38k | |
789 | 3.38k | /* Treat this as an "eof", which really means "normalize what ya got". */ |
790 | 3.38k | MVM_unicode_normalizer_eof(tc, n); |
791 | 3.38k | |
792 | 3.38k | /* Hand back a normalized codepoint, and the number available. The count is read after |
793 | 3.38k | * one codepoint has been taken for *out, so we add 1 to compensate for it. */ |
794 | 3.38k | *out = MVM_unicode_normalizer_get_codepoint(tc, n); |
795 | 3.38k | return 1 + MVM_unicode_normalizer_available(tc, n); |
796 | 3.38k | } |
797 | | |
798 | | /* Called when we are expecting no more codepoints; the normalization-terminator path also calls it to flush the buffer. Normalizes everything from buffer_norm_end to buffer_end and then marks the whole buffer as normalized. */ |
799 | 53.0k | void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n) { |
800 | 53.0k | /* Perform canonical ordering and, if the form calls for them, canonical composition |
801 | 53.0k | * and grapheme composition on what remains. */ |
802 | 53.0k | canonical_sort(tc, n, n->buffer_norm_end, n->buffer_end); |
803 | 53.0k | if (MVM_NORMALIZE_COMPOSE(n->form)) { |
804 | 50.2k | canonical_composition(tc, n, n->buffer_norm_end, n->buffer_end); |
805 | 50.2k | if (MVM_NORMALIZE_GRAPHEME(n->form)) |
806 | 50.2k | grapheme_composition(tc, n, n->buffer_norm_end, n->buffer_end); |
807 | 50.2k | } |
808 | 53.0k | /* Reset these two so that neither value carries over into whatever is processed next. */ |
809 | 53.0k | n->prepend_buffer = 0; |
810 | 53.0k | n->regional_indicator = 0; |
811 | 53.0k | /* We've now normalized all that remains. */ |
812 | 53.0k | n->buffer_norm_end = n->buffer_end; |
813 | 53.0k | } |