/home/travis/build/MoarVM/MoarVM/src/strings/utf16.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | | |
/* Two-byte sequences compared against the start of the input in the decoder
 * to detect a byte-order mark: FF FE selects little-endian, FE FF selects
 * big-endian. */
#define BOM_UTF16LE "\xFF\xFE"
#define BOM_UTF16BE "\xFE\xFF"
5 | | |
6 | | /* mostly from YAML-LibYAML */ |
7 | | |
8 | | /* Decodes the specified number of bytes of utf16 into an NFG string, creating |
9 | | * a result of the specified type. The type must have the MVMString REPR. */ |
10 | | MVMString * MVM_string_utf16_decode(MVMThreadContext *tc, |
11 | 1 | const MVMObject *result_type, char *utf16_chars, size_t bytes) { |
12 | 1 | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
13 | 1 | size_t str_pos = 0; |
14 | 1 | MVMuint8 *utf16 = (MVMuint8 *)utf16_chars; |
15 | 1 | MVMuint8 *utf16_end; |
16 | 1 | /* set the default byte order */ |
17 | 1 | #ifdef MVM_BIGENDIAN |
18 | | int low = 1; |
19 | | int high = 0; |
20 | | #else |
21 | 1 | int low = 0; |
22 | 1 | int high = 1; |
23 | 1 | #endif |
24 | 1 | MVMNormalizer norm; |
25 | 1 | MVMint32 ready; |
26 | 1 | |
27 | 1 | if (bytes % 2) { |
28 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-16; odd number of bytes"); |
29 | 0 | } |
30 | 1 | |
31 | 1 | /* set the byte order if there's a BOM */ |
32 | 1 | if (bytes >= 2) { |
33 | 1 | if (!memcmp(utf16, BOM_UTF16LE, 2)) { |
34 | 0 | low = 0; |
35 | 0 | high = 1; |
36 | 0 | utf16 += 2; |
37 | 0 | bytes -= 2; |
38 | 0 | } |
39 | 1 | else if (!memcmp(utf16, BOM_UTF16BE, 2)) { |
40 | 0 | low = 1; |
41 | 0 | high = 0; |
42 | 0 | utf16 += 2; |
43 | 0 | bytes -= 2; |
44 | 0 | } |
45 | 1 | } |
46 | 1 | utf16_end = utf16 + bytes; |
47 | 1 | |
48 | 1 | /* possibly allocating extra space; oh well */ |
49 | 1 | result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes / 2); |
50 | 1 | |
51 | 1 | /* Need to normalize to NFG as we decode. */ |
52 | 1 | MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG); |
53 | 1 | |
54 | 2 | for (; utf16 < utf16_end; utf16 += 2) { |
55 | 1 | MVMuint32 value = (utf16[high] << 8) + utf16[low]; |
56 | 1 | MVMuint32 value2; |
57 | 1 | MVMGrapheme32 g; |
58 | 1 | |
59 | 1 | if ((value & 0xFC00) == 0xDC00) { |
60 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
61 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate"); |
62 | 0 | } |
63 | 1 | |
64 | 1 | if ((value & 0xFC00) == 0xD800) { /* high surrogate */ |
65 | 0 | utf16 += 2; |
66 | 0 | if (utf16 == utf16_end) { |
67 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
68 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair"); |
69 | 0 | } |
70 | 0 | value2 = (utf16[high] << 8) + utf16[low]; |
71 | 0 | if ((value2 & 0xFC00) != 0xDC00) { |
72 | 0 | MVM_unicode_normalizer_cleanup(tc, &norm); |
73 | 0 | MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair"); |
74 | 0 | } |
75 | 0 | value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); |
76 | 0 | } |
77 | 1 | |
78 | 1 | /* TODO: check for invalid values */ |
79 | 1 | ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, value, &g); |
80 | 1 | if (ready) { |
81 | 0 | result->body.storage.blob_32[str_pos++] = g; |
82 | 0 | while (--ready > 0) |
83 | 0 | result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
84 | 0 | } |
85 | 1 | } |
86 | 1 | |
87 | 1 | /* Get any final graphemes from the normalizer, and clean it up. */ |
88 | 1 | MVM_unicode_normalizer_eof(tc, &norm); |
89 | 1 | ready = MVM_unicode_normalizer_available(tc, &norm); |
90 | 2 | while (ready--) |
91 | 1 | result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm); |
92 | 1 | MVM_unicode_normalizer_cleanup(tc, &norm); |
93 | 1 | |
94 | 1 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
95 | 1 | result->body.num_graphs = str_pos; |
96 | 1 | |
97 | 1 | return result; |
98 | 1 | } |
99 | | |
100 | | /* Encodes the specified substring to utf16. The result string is NULL terminated, but |
101 | | * the specified size is the non-null part. (This being UTF-16, there are 2 null bytes |
102 | | * on the end.) */ |
103 | 1 | char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) { |
104 | 1 | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
105 | 1 | MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - start : length); |
106 | 1 | MVMuint16 *result; |
107 | 1 | MVMuint16 *result_pos; |
108 | 1 | MVMCodepointIter ci; |
109 | 1 | MVMuint8 *repl_bytes = NULL; |
110 | 1 | MVMuint64 repl_length = 0; |
111 | 1 | MVMint32 alloc_size; |
112 | 1 | MVMuint64 scratch_space = 0; |
113 | 1 | |
114 | 1 | /* must check start first since it's used in the length check */ |
115 | 1 | if (start < 0 || start > strgraphs) |
116 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
117 | 1 | if (start + lengthu > strgraphs) |
118 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
119 | 1 | |
120 | 1 | if (replacement) |
121 | 0 | repl_bytes = (MVMuint8 *) MVM_string_utf16_encode_substr(tc, |
122 | 0 | replacement, &repl_length, 0, -1, NULL, translate_newlines); |
123 | 1 | |
124 | 1 | alloc_size = lengthu * 2; |
125 | 1 | result = MVM_malloc(alloc_size + 2); |
126 | 1 | result_pos = result; |
127 | 1 | MVM_string_ci_init(tc, &ci, str, translate_newlines, 0); |
128 | 2 | while (MVM_string_ci_has_more(tc, &ci)) { |
129 | 1 | int bytes_needed; |
130 | 1 | MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci); |
131 | 1 | |
132 | 1 | if (value < 0x10000) { |
133 | 1 | bytes_needed = 2; |
134 | 1 | } |
135 | 0 | else if (value <= 0x1FFFFF) { |
136 | 0 | bytes_needed = 4; |
137 | 0 | } |
138 | 0 | else { |
139 | 0 | bytes_needed = repl_length; |
140 | 0 | } |
141 | 1 | |
142 | 1 | while ((alloc_size - 2 * (result_pos - result)) < bytes_needed) { |
143 | 0 | MVMuint16 *new_result; |
144 | 0 |
|
145 | 0 | alloc_size *= 2; |
146 | 0 | new_result = MVM_realloc(result, alloc_size + 2); |
147 | 0 |
|
148 | 0 | result_pos = new_result + (result_pos - result); |
149 | 0 | result = new_result; |
150 | 0 | } |
151 | 1 | |
152 | 1 | if (value < 0x10000) { |
153 | 1 | result_pos[0] = value; |
154 | 1 | result_pos++; |
155 | 1 | } |
156 | 0 | else if (value <= 0x1FFFFF) { |
157 | 0 | value -= 0x10000; |
158 | 0 | result_pos[0] = 0xD800 + (value >> 10); |
159 | 0 | result_pos[1] = 0xDC00 + (value & 0x3FF); |
160 | 0 | result_pos += 2; |
161 | 0 | } |
162 | 0 | else if (replacement) { |
163 | 0 | memcpy(result_pos, repl_bytes, repl_length); |
164 | 0 | result_pos += repl_length/2; |
165 | 0 | } |
166 | 0 | else { |
167 | 0 | MVM_free(result); |
168 | 0 | MVM_free(repl_bytes); |
169 | 0 | MVM_exception_throw_adhoc(tc, |
170 | 0 | "Error encoding UTF-16 string: could not encode codepoint %d", |
171 | 0 | value); |
172 | 0 | } |
173 | 1 | } |
174 | 1 | result_pos[0] = 0; |
175 | 1 | if (!output_size) |
176 | 0 | output_size = &scratch_space; |
177 | 1 | *output_size = (char *)result_pos - (char *)result; |
178 | 1 | result = MVM_realloc(result, *output_size); |
179 | 1 | MVM_free(repl_bytes); |
180 | 1 | return (char *)result; |
181 | 1 | } |
182 | | |
183 | | /* Encodes the whole string, double-NULL terminated. */ |
184 | 0 | char * MVM_string_utf16_encode(MVMThreadContext *tc, MVMString *str, MVMint32 translate_newlines) { |
185 | 0 | return MVM_string_utf16_encode_substr(tc, str, NULL, 0, -1, NULL, translate_newlines); |
186 | 0 | } |