Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/utf16.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
1
#define BOM_UTF16LE "\xff\xfe"
4
1
#define BOM_UTF16BE "\xfe\xff"
5
6
/* mostly from YAML-LibYAML */
7
8
/* Decodes the specified number of bytes of utf16 into an NFG string, creating
9
 * a result of the specified type. The type must have the MVMString REPR. */
10
MVMString * MVM_string_utf16_decode(MVMThreadContext *tc,
11
1
        const MVMObject *result_type, char *utf16_chars, size_t bytes) {
12
1
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
13
1
    size_t str_pos = 0;
14
1
    MVMuint8 *utf16 = (MVMuint8 *)utf16_chars;
15
1
    MVMuint8 *utf16_end;
16
1
    /* set the default byte order */
17
1
#ifdef MVM_BIGENDIAN
18
    int low = 1;
19
    int high = 0;
20
#else
21
1
    int low = 0;
22
1
    int high = 1;
23
1
#endif
24
1
    MVMNormalizer norm;
25
1
    MVMint32 ready;
26
1
27
1
    if (bytes % 2) {
28
0
        MVM_exception_throw_adhoc(tc, "Malformed UTF-16; odd number of bytes");
29
0
    }
30
1
31
1
    /* set the byte order if there's a BOM */
32
1
    if (bytes >= 2) {
33
1
        if (!memcmp(utf16, BOM_UTF16LE, 2)) {
34
0
            low = 0;
35
0
            high = 1;
36
0
            utf16 += 2;
37
0
            bytes -= 2;
38
0
        }
39
1
        else if (!memcmp(utf16, BOM_UTF16BE, 2)) {
40
0
            low = 1;
41
0
            high = 0;
42
0
            utf16 += 2;
43
0
            bytes -= 2;
44
0
        }
45
1
    }
46
1
    utf16_end = utf16 + bytes;
47
1
48
1
    /* possibly allocating extra space; oh well */
49
1
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes / 2);
50
1
51
1
    /* Need to normalize to NFG as we decode. */
52
1
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
53
1
54
2
    for (; utf16 < utf16_end; utf16 += 2) {
55
1
        MVMuint32 value = (utf16[high] << 8) + utf16[low];
56
1
        MVMuint32 value2;
57
1
        MVMGrapheme32 g;
58
1
59
1
        if ((value & 0xFC00) == 0xDC00) {
60
0
            MVM_unicode_normalizer_cleanup(tc, &norm);
61
0
            MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate");
62
0
        }
63
1
64
1
        if ((value & 0xFC00) == 0xD800) { /* high surrogate */
65
0
            utf16 += 2;
66
0
            if (utf16 == utf16_end) {
67
0
                MVM_unicode_normalizer_cleanup(tc, &norm);
68
0
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
69
0
            }
70
0
            value2 = (utf16[high] << 8) + utf16[low];
71
0
            if ((value2 & 0xFC00) != 0xDC00) {
72
0
                MVM_unicode_normalizer_cleanup(tc, &norm);
73
0
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
74
0
            }
75
0
            value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
76
0
        }
77
1
78
1
        /* TODO: check for invalid values */
79
1
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, value, &g);
80
1
        if (ready) {
81
0
            result->body.storage.blob_32[str_pos++] = g;
82
0
            while (--ready > 0)
83
0
                result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
84
0
        }
85
1
    }
86
1
87
1
    /* Get any final graphemes from the normalizer, and clean it up. */
88
1
    MVM_unicode_normalizer_eof(tc, &norm);
89
1
    ready = MVM_unicode_normalizer_available(tc, &norm);
90
2
    while (ready--)
91
1
        result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
92
1
    MVM_unicode_normalizer_cleanup(tc, &norm);
93
1
94
1
    result->body.storage_type = MVM_STRING_GRAPHEME_32;
95
1
    result->body.num_graphs   = str_pos;
96
1
97
1
    return result;
98
1
}
99
100
/* Encodes the specified substring to utf16. The result string is NULL terminated, but
101
 * the specified size is the non-null part. (This being UTF-16, there are 2 null bytes
102
 * on the end.) */
103
1
char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) {
104
1
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
105
1
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - start : length);
106
1
    MVMuint16 *result;
107
1
    MVMuint16 *result_pos;
108
1
    MVMCodepointIter ci;
109
1
    MVMuint8 *repl_bytes = NULL;
110
1
    MVMuint64 repl_length = 0;
111
1
    MVMint32 alloc_size;
112
1
    MVMuint64 scratch_space = 0;
113
1
114
1
    /* must check start first since it's used in the length check */
115
1
    if (start < 0 || start > strgraphs)
116
0
        MVM_exception_throw_adhoc(tc, "start out of range");
117
1
    if (start + lengthu > strgraphs)
118
0
        MVM_exception_throw_adhoc(tc, "length out of range");
119
1
120
1
    if (replacement)
121
0
        repl_bytes = (MVMuint8 *) MVM_string_utf16_encode_substr(tc,
122
0
            replacement, &repl_length, 0, -1, NULL, translate_newlines);
123
1
124
1
    alloc_size = lengthu * 2;
125
1
    result = MVM_malloc(alloc_size + 2);
126
1
    result_pos = result;
127
1
    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
128
2
    while (MVM_string_ci_has_more(tc, &ci)) {
129
1
        int bytes_needed;
130
1
        MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);
131
1
132
1
        if (value < 0x10000) {
133
1
            bytes_needed = 2;
134
1
        }
135
0
        else if (value <= 0x1FFFFF) {
136
0
            bytes_needed = 4;
137
0
        }
138
0
        else {
139
0
            bytes_needed = repl_length;
140
0
        }
141
1
142
1
        while ((alloc_size - 2 * (result_pos - result)) < bytes_needed) {
143
0
            MVMuint16 *new_result;
144
0
145
0
            alloc_size *= 2;
146
0
            new_result  = MVM_realloc(result, alloc_size + 2);
147
0
148
0
            result_pos = new_result + (result_pos - result);
149
0
            result     = new_result;
150
0
        }
151
1
152
1
        if (value < 0x10000) {
153
1
            result_pos[0] = value;
154
1
            result_pos++;
155
1
        }
156
0
        else if (value <= 0x1FFFFF) {
157
0
            value -= 0x10000;
158
0
            result_pos[0] = 0xD800 + (value >> 10);
159
0
            result_pos[1] = 0xDC00 + (value & 0x3FF);
160
0
            result_pos += 2;
161
0
        }
162
0
        else if (replacement) {
163
0
            memcpy(result_pos, repl_bytes, repl_length);
164
0
            result_pos += repl_length/2;
165
0
        }
166
0
        else {
167
0
            MVM_free(result);
168
0
            MVM_free(repl_bytes);
169
0
            MVM_exception_throw_adhoc(tc,
170
0
                "Error encoding UTF-16 string: could not encode codepoint %d",
171
0
                value);
172
0
        }
173
1
    }
174
1
    result_pos[0] = 0;
175
1
    if (!output_size)
176
0
        output_size = &scratch_space;
177
1
    *output_size = (char *)result_pos - (char *)result;
178
1
    result = MVM_realloc(result, *output_size);
179
1
    MVM_free(repl_bytes);
180
1
    return (char *)result;
181
1
}
182
183
/* Encodes the whole string, double-NULL terminated. */
184
0
char * MVM_string_utf16_encode(MVMThreadContext *tc, MVMString *str, MVMint32 translate_newlines) {
185
0
    return MVM_string_utf16_encode_substr(tc, str, NULL, 0, -1, NULL, translate_newlines);
186
0
}