/home/travis/build/MoarVM/MoarVM/src/strings/utf16.c

Source (jump to first uncovered line)
#include "moar.h"

#define BOM_UTF16LE "\xff\xfe"
#define BOM_UTF16BE "\xfe\xff"

/* mostly from YAML-LibYAML */

/* Converts a buffer of UTF-16 bytes into an NFG string. A fresh object is
 * allocated through the REPR of result_type (which must be the MVMString
 * REPR) and its 32-bit grapheme storage is filled in. A leading BOM, when
 * present, selects the byte order and is not decoded; without one the
 * build's default order (see MVM_BIGENDIAN below) is used. Throws an ad-hoc
 * exception for an odd byte count, for a low surrogate with no preceding high
 * surrogate, and for a high surrogate not followed by a low surrogate. */
MVMString * MVM_string_utf16_decode(MVMThreadContext *tc,
        const MVMObject *result_type, char *utf16_chars, size_t bytes) {
    MVMString     *result    = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    MVMuint8      *cursor    = (MVMuint8 *)utf16_chars;
    MVMuint8      *limit;
    MVMGrapheme32 *out;
    size_t         out_count = 0;
    /* Offsets, inside each two-byte unit, of the low- and high-order byte. */
#ifdef MVM_BIGENDIAN
    int lo_idx = 1;
    int hi_idx = 0;
#else
    int lo_idx = 0;
    int hi_idx = 1;
#endif
    MVMNormalizer norm;
    MVMint32      pending;

    if ((bytes & 1) != 0) {
        MVM_exception_throw_adhoc(tc, "Malformed UTF-16; odd number of bytes");
    }

    /* A BOM overrides the default byte order and is dropped from the input. */
    if (bytes >= 2) {
        int bom_le = memcmp(cursor, BOM_UTF16LE, 2) == 0;
        int bom_be = !bom_le && memcmp(cursor, BOM_UTF16BE, 2) == 0;
        if (bom_le || bom_be) {
            lo_idx  = bom_le ? 0 : 1;
            hi_idx  = bom_le ? 1 : 0;
            cursor += 2;
            bytes  -= 2;
        }
    }
    limit = cursor + bytes;

    /* One slot per 16-bit unit; surrogate pairs make this an over-estimate. */
    out = MVM_malloc(sizeof(MVMGrapheme32) * bytes / 2);
    result->body.storage.blob_32 = out;

    /* Graphemes are produced by feeding codepoints through an NFG normalizer. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);

    while (cursor < limit) {
        MVMuint32     unit = (cursor[hi_idx] << 8) | cursor[lo_idx];
        MVMuint32     trail;
        MVMGrapheme32 first;

        /* A low surrogate is only legal as the second half of a pair. */
        if (unit >= 0xDC00 && unit <= 0xDFFF) {
            MVM_unicode_normalizer_cleanup(tc, &norm);
            MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate");
        }

        /* High surrogate: consume the following unit and combine the two. */
        if (unit >= 0xD800 && unit <= 0xDBFF) {
            cursor += 2;
            if (cursor == limit) {
                MVM_unicode_normalizer_cleanup(tc, &norm);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
            }
            trail = (cursor[hi_idx] << 8) | cursor[lo_idx];
            if (trail < 0xDC00 || trail > 0xDFFF) {
                MVM_unicode_normalizer_cleanup(tc, &norm);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
            }
            unit = 0x10000 + (((unit & 0x3FF) << 10) | (trail & 0x3FF));
        }

        /* TODO: check for invalid values */
        pending = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, unit, &first);
        if (pending != 0) {
            /* The first ready grapheme comes back through the out-parameter;
             * any further ones are fetched from the normalizer. */
            out[out_count++] = first;
            for (pending--; pending > 0; pending--)
                out[out_count++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }

        cursor += 2;
    }

    /* Signal end of input, drain whatever is still buffered, then release
     * the normalizer. */
    MVM_unicode_normalizer_eof(tc, &norm);
    for (pending = MVM_unicode_normalizer_available(tc, &norm); pending != 0; pending--)
        out[out_count++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    MVM_unicode_normalizer_cleanup(tc, &norm);

    result->body.num_graphs   = out_count;
    result->body.storage_type = MVM_STRING_GRAPHEME_32;

    return result;
}

/* Encodes a string to UTF-16. Code units are stored through an MVMuint16
 * pointer, so they are written in the host's byte order, and no BOM is
 * emitted.
 *
 * tc                 - current thread context.
 * str                - string to encode.
 * output_size        - if non-NULL, receives the number of bytes produced,
 *                      not counting the terminator.
 * start, length      - requested range in graphemes; length == -1 means "to
 *                      the end of str". Both are range-checked and throw
 *                      "start out of range" / "length out of range".
 * replacement        - if non-NULL, its UTF-16 encoding is substituted for
 *                      any codepoint that cannot be represented; if NULL such
 *                      a codepoint raises an ad-hoc exception.
 * translate_newlines - forwarded to the codepoint iterator.
 *
 * Returns a buffer obtained from MVM_malloc/MVM_realloc that ends with one
 * zero code unit (two zero bytes) beyond the reported size. */
char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) {
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - start : length);
    MVMuint16 *result;
    MVMuint16 *result_pos;
    MVMCodepointIter ci;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length = 0;
    size_t alloc_size;
    MVMuint64 scratch_space = 0;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_utf16_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* Initial guess of one code unit per grapheme; the buffer always carries
     * two extra bytes beyond alloc_size for the terminator. The arithmetic is
     * done in size_t so that it cannot wrap a 32-bit value. */
    alloc_size = (size_t)lengthu * 2;
    result = MVM_malloc(alloc_size + 2);
    result_pos = result;

    /* NOTE(review): start and lengthu are validated and used for the size
     * guess above, but neither is visibly handed to the iterator, which is
     * initialised on str itself - confirm whether a sub-range is honoured. */
    MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
    while (MVM_string_ci_has_more(tc, &ci)) {
        MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);
        size_t used = (size_t)((char *)result_pos - (char *)result);
        size_t bytes_needed;

        /* U+10FFFF is the largest value a surrogate pair can express;
         * anything above it goes down the replacement/error path. */
        if (value < 0x10000) {
            bytes_needed = 2;
        }
        else if (value <= 0x10FFFF) {
            bytes_needed = 4;
        }
        else {
            bytes_needed = (size_t)repl_length;
        }

        if (alloc_size - used < bytes_needed) {
            /* Double until it fits. Starting from a non-zero size matters:
             * doubling zero would never make progress. */
            do {
                alloc_size = alloc_size ? alloc_size * 2 : 16;
            } while (alloc_size - used < bytes_needed);

            result = MVM_realloc(result, alloc_size + 2);
            /* Re-derive the write position from the saved offset rather than
             * from the old, now invalid, pointer. */
            result_pos = (MVMuint16 *)((char *)result + used);
        }

        if (value < 0x10000) {
            result_pos[0] = (MVMuint16)value;
            result_pos++;
        }
        else if (value <= 0x10FFFF) {
            value -= 0x10000;
            result_pos[0] = (MVMuint16)(0xD800 + (value >> 10));
            result_pos[1] = (MVMuint16)(0xDC00 + (value & 0x3FF));
            result_pos += 2;
        }
        else if (replacement) {
            memcpy(result_pos, repl_bytes, repl_length);
            result_pos += repl_length/2;
        }
        else {
            MVM_free(result);
            MVM_free(repl_bytes);
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-16 string: could not encode codepoint %d",
                value);
        }
    }

    /* Terminating zero code unit; room for it is guaranteed by the "+ 2". */
    result_pos[0] = 0;
    if (!output_size)
        output_size = &scratch_space;
    *output_size = (char *)result_pos - (char *)result;
    /* Trim to the bytes actually used, keeping the two terminator bytes. */
    result = MVM_realloc(result, *output_size + 2);
    MVM_free(repl_bytes);
    return (char *)result;
}

/* Encodes an entire string as UTF-16 by delegating to
 * MVM_string_utf16_encode_substr with start 0, length -1, no replacement
 * string and no output-size pointer. */
char * MVM_string_utf16_encode(MVMThreadContext *tc, MVMString *str, MVMint32 translate_newlines) {
    char *encoded = MVM_string_utf16_encode_substr(tc, str, NULL, 0, -1, NULL,
        translate_newlines);
    return encoded;
}

Line	Count	Source (jump to first uncovered line)
1		#include "moar.h"
2
3	1	#define BOM_UTF16LE "\xff\xfe"
4	1	#define BOM_UTF16BE "\xfe\xff"
5
6		/* mostly from YAML-LibYAML */
7
8		/* Decodes the specified number of bytes of utf16 into an NFG string, creating
9		* a result of the specified type. The type must have the MVMString REPR. */
10		MVMString * MVM_string_utf16_decode(MVMThreadContext *tc,
11	1	const MVMObject *result_type, char *utf16_chars, size_t bytes) {
12	1	MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
13	1	size_t str_pos = 0;
14	1	MVMuint8 *utf16 = (MVMuint8 *)utf16_chars;
15	1	MVMuint8 *utf16_end;
16	1	/* set the default byte order */
17	1	#ifdef MVM_BIGENDIAN
18		int low = 1;
19		int high = 0;
20		#else
21	1	int low = 0;
22	1	int high = 1;
23	1	#endif
24	1	MVMNormalizer norm;
25	1	MVMint32 ready;
26	1
27	1	if (bytes % 2) {
28	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-16; odd number of bytes");
29	0	}
30	1
31	1	/* set the byte order if there's a BOM */
32	1	if (bytes >= 2) {
33	1	if (!memcmp(utf16, BOM_UTF16LE, 2)) {
34	0	low = 0;
35	0	high = 1;
36	0	utf16 += 2;
37	0	bytes -= 2;
38	0	}
39	1	else if (!memcmp(utf16, BOM_UTF16BE, 2)) {
40	0	low = 1;
41	0	high = 0;
42	0	utf16 += 2;
43	0	bytes -= 2;
44	0	}
45	1	}
46	1	utf16_end = utf16 + bytes;
47	1
48	1	/* possibly allocating extra space; oh well */
49	1	result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes / 2);
50	1
51	1	/* Need to normalize to NFG as we decode. */
52	1	MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
53	1
54	2	for (; utf16 < utf16_end; utf16 += 2) {
55	1	MVMuint32 value = (utf16[high] << 8) + utf16[low];
56	1	MVMuint32 value2;
57	1	MVMGrapheme32 g;
58	1
59	1	if ((value & 0xFC00) == 0xDC00) {
60	0	MVM_unicode_normalizer_cleanup(tc, &norm);
61	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-16; unexpected low surrogate");
62	0	}
63	1
64	1	if ((value & 0xFC00) == 0xD800) { /* high surrogate */
65	0	utf16 += 2;
66	0	if (utf16 == utf16_end) {
67	0	MVM_unicode_normalizer_cleanup(tc, &norm);
68	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
69	0	}
70	0	value2 = (utf16[high] << 8) + utf16[low];
71	0	if ((value2 & 0xFC00) != 0xDC00) {
72	0	MVM_unicode_normalizer_cleanup(tc, &norm);
73	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-16; incomplete surrogate pair");
74	0	}
75	0	value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
76	0	}
77	1
78	1	/* TODO: check for invalid values */
79	1	ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, value, &g);
80	1	if (ready) {
81	0	result->body.storage.blob_32[str_pos++] = g;
82	0	while (--ready > 0)
83	0	result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
84	0	}
85	1	}
86	1
87	1	/* Get any final graphemes from the normalizer, and clean it up. */
88	1	MVM_unicode_normalizer_eof(tc, &norm);
89	1	ready = MVM_unicode_normalizer_available(tc, &norm);
90	2	while (ready--)
91	1	result->body.storage.blob_32[str_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
92	1	MVM_unicode_normalizer_cleanup(tc, &norm);
93	1
94	1	result->body.storage_type = MVM_STRING_GRAPHEME_32;
95	1	result->body.num_graphs = str_pos;
96	1
97	1	return result;
98	1	}
99
100		/* Encodes the specified substring to utf16. The result string is NULL terminated, but
101		* the specified size is the non-null part. (This being UTF-16, there are 2 null bytes
102		* on the end.) */
103	1	char * MVM_string_utf16_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) {
104	1	MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
105	1	MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - start : length);
106	1	MVMuint16 *result;
107	1	MVMuint16 *result_pos;
108	1	MVMCodepointIter ci;
109	1	MVMuint8 *repl_bytes = NULL;
110	1	MVMuint64 repl_length = 0;
111	1	MVMint32 alloc_size;
112	1	MVMuint64 scratch_space = 0;
113	1
114	1	/* must check start first since it's used in the length check */
115	1	if (start < 0 || start > strgraphs)
116	0	MVM_exception_throw_adhoc(tc, "start out of range");
117	1	if (start + lengthu > strgraphs)
118	0	MVM_exception_throw_adhoc(tc, "length out of range");
119	1
120	1	if (replacement)
121	0	repl_bytes = (MVMuint8 *) MVM_string_utf16_encode_substr(tc,
122	0	replacement, &repl_length, 0, -1, NULL, translate_newlines);
123	1
124	1	alloc_size = lengthu * 2;
125	1	result = MVM_malloc(alloc_size + 2);
126	1	result_pos = result;
127	1	MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
128	2	while (MVM_string_ci_has_more(tc, &ci)) {
129	1	int bytes_needed;
130	1	MVMCodepoint value = MVM_string_ci_get_codepoint(tc, &ci);
131	1
132	1	if (value < 0x10000) {
133	1	bytes_needed = 2;
134	1	}
135	0	else if (value <= 0x1FFFFF) {
136	0	bytes_needed = 4;
137	0	}
138	0	else {
139	0	bytes_needed = repl_length;
140	0	}
141	1
142	1	while ((alloc_size - 2 * (result_pos - result)) < bytes_needed) {
143	0	MVMuint16 *new_result;
144	0
145	0	alloc_size *= 2;
146	0	new_result = MVM_realloc(result, alloc_size + 2);
147	0
148	0	result_pos = new_result + (result_pos - result);
149	0	result = new_result;
150	0	}
151	1
152	1	if (value < 0x10000) {
153	1	result_pos[0] = value;
154	1	result_pos++;
155	1	}
156	0	else if (value <= 0x1FFFFF) {
157	0	value -= 0x10000;
158	0	result_pos[0] = 0xD800 + (value >> 10);
159	0	result_pos[1] = 0xDC00 + (value & 0x3FF);
160	0	result_pos += 2;
161	0	}
162	0	else if (replacement) {
163	0	memcpy(result_pos, repl_bytes, repl_length);
164	0	result_pos += repl_length/2;
165	0	}
166	0	else {
167	0	MVM_free(result);
168	0	MVM_free(repl_bytes);
169	0	MVM_exception_throw_adhoc(tc,
170	0	"Error encoding UTF-16 string: could not encode codepoint %d",
171	0	value);
172	0	}
173	1	}
174	1	result_pos[0] = 0;
175	1	if (!output_size)
176	0	output_size = &scratch_space;
177	1	*output_size = (char *)result_pos - (char *)result;
178	1	result = MVM_realloc(result, *output_size);
179	1	MVM_free(repl_bytes);
180	1	return (char *)result;
181	1	}
182
183		/* Encodes the whole string, double-NULL terminated. */
184	0	char * MVM_string_utf16_encode(MVMThreadContext *tc, MVMString *str, MVMint32 translate_newlines) {
185	0	return MVM_string_utf16_encode_substr(tc, str, NULL, 0, -1, NULL, translate_newlines);
186	0	}

Coverage Report

Created: 2018-07-03 15:31