/home/travis/build/MoarVM/MoarVM/src/strings/latin1.c

Source (jump to first uncovered line)
#include "moar.h"

/* Decodes the specified number of bytes of latin1 into an NFG string,
 * creating a result of the specified type. The type must have the MVMString
 * REPR.
 *
 *   tc          - current thread context
 *   result_type - type object used to allocate the result string
 *   latin1_c    - input bytes; only the first `bytes` bytes are read
 *   bytes       - number of input bytes to decode
 *
 * Every input byte becomes one grapheme holding the byte's value, except
 * that a "\r\n" pair is collapsed into the single grapheme returned by
 * MVM_nfg_crlf_grapheme. Output starts out in 8-bit storage and is switched
 * to 32-bit storage the first time a byte above 127 is encountered. Both
 * storages are sized for `bytes` graphemes, which is an upper bound since a
 * "\r\n" pair yields only one grapheme. */
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type,
                                     char *latin1_c, size_t bytes) {
    MVMuint8  *latin1 = (MVMuint8 *)latin1_c;
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    size_t i, k, result_graphs;

    MVMuint8 writing_32bit = 0;

    result->body.storage_type   = MVM_STRING_GRAPHEME_8;
    result->body.storage.blob_8 = MVM_malloc(sizeof(MVMint8) * bytes);

    result_graphs = 0;
    for (i = 0; i < bytes; i++) {
        if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') {
            /* NOTE(review): in 8-bit mode this assumes the CRLF grapheme
             * survives narrowing to MVMGrapheme8 - confirm. */
            if (writing_32bit)
                result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
            else
                result->body.storage.blob_8[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
            /* Skip the '\n' that was folded into the grapheme. */
            i++;
        }
        else {
            if (latin1[i] > 127 && !writing_32bit) {
                /* First byte that does not fit 8-bit storage: widen. */
                MVMGrapheme8 *old_storage = result->body.storage.blob_8;

                result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
                result->body.storage_type = MVM_STRING_GRAPHEME_32;
                writing_32bit = 1;

                /* Copy only the graphemes written so far. This can be fewer
                 * than the `i` bytes consumed when "\r\n" pairs were
                 * collapsed, and the slots past result_graphs were never
                 * initialized. */
                for (k = 0; k < result_graphs; k++)
                    result->body.storage.blob_32[k] = old_storage[k];
                MVM_free(old_storage);
            }
            if (writing_32bit)
                result->body.storage.blob_32[result_graphs++] = latin1[i];
            else
                result->body.storage.blob_8[result_graphs++] = latin1[i];
        }
    }
    result->body.num_graphs = result_graphs;

    return result;
}

/* Latin-1 decoder for a decode stream. Walks the stream's pending byte
 * buffers in order, turning each byte into a grapheme and handing filled
 * grapheme buffers to the stream, until the input runs out or a stopper is
 * reached.
 *
 *   stopper_chars - when non-NULL, stop once this many graphemes were produced
 *   seps          - separators probed after every produced grapheme
 *
 * Returns 1 when a stopper was reached (or zero chars were requested), and 0
 * otherwise, including when the stream has no byte buffers at all.
 *
 * A '\r' is held back until the following byte is seen: "\r\n" yields the
 * grapheme from MVM_unicode_normalizer_translated_crlf, anything else yields
 * a plain '\r' and the following byte is processed again. A '\r' that is the
 * final available byte is therefore neither emitted nor discarded. */
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                    const MVMint32 *stopper_chars,
                                    MVMDecodeStreamSeparators *seps) {
    MVMDecodeStreamBytes *accepted_buf = ds->bytes_head;
    MVMDecodeStreamBytes *buf;
    MVMGrapheme32 *out;
    MVMint32 out_cap;
    MVMint32 out_used = 0;
    MVMint32 emitted  = 0;
    MVMint32 accepted_pos;
    MVMint32 pending_cr = 0;
    MVMuint32 hit_stopper = 0;

    /* Nothing queued: nothing to decode. */
    if (!ds->bytes_head)
        return 0;
    accepted_pos = ds->bytes_head_pos;

    /* A request for zero chars is satisfied immediately. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    out_cap = ds->result_size_guess;
    out     = MVM_malloc(out_cap * sizeof(MVMGrapheme32));

    for (buf = ds->bytes_head; buf; buf = buf->next) {
        /* Only the head buffer may already be partially consumed. */
        unsigned char *raw = (unsigned char *)buf->bytes;
        MVMint32 pos = buf == ds->bytes_head ? ds->bytes_head_pos : 0;

        while (pos < buf->length) {
            MVMCodepoint cp = raw[pos++];
            MVMGrapheme32 g;

            if (!pending_cr) {
                if (cp == '\r') {
                    /* Decide what to emit once the next byte is known. */
                    pending_cr = 1;
                    continue;
                }
                g = cp;
            }
            else {
                pending_cr = 0;
                if (cp == '\n') {
                    g = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
                }
                else {
                    /* Lone '\r': emit it and revisit the current byte. */
                    g = '\r';
                    pos--;
                }
            }

            if (out_used == out_cap) {
                /* Output buffer is full; pass it to the stream and start
                 * a fresh one of the same size. */
                MVM_string_decodestream_add_chars(tc, ds, out, out_cap);
                out      = MVM_malloc(out_cap * sizeof(MVMGrapheme32));
                out_used = 0;
            }
            out[out_used++] = g;
            accepted_buf = buf;
            accepted_pos = pos;
            emitted++;

            /* The separator probe is given the byte just read, not the
             * grapheme that was stored. */
            if (MVM_string_decode_stream_maybe_sep(tc, seps, cp)
                    || (stopper_chars && *stopper_chars == emitted)) {
                hit_stopper = 1;
                goto finish;
            }
        }
    }
  finish:

    /* Hand over whatever was decoded (or drop the unused buffer), then
     * discard the bytes that were consumed. */
    if (!out_used) {
        MVM_free(out);
    }
    else {
        MVM_string_decodestream_add_chars(tc, ds, out, out_used);
    }
    MVM_string_decodestream_discard_to(tc, ds, accepted_buf, accepted_pos);

    return hit_stopper;
}

/* Encodes the specified substring to latin-1.
 *
 *   str                - string to encode
 *   output_size        - when non-NULL, receives the number of bytes written,
 *                        not counting the terminating NUL
 *   start              - index of the first grapheme; must be in 0..graphs
 *   length             - number of graphemes, or -1 for "up to the end"
 *   replacement        - when non-NULL, its latin-1 encoding is written in
 *                        place of every codepoint outside of 0..255
 *   translate_newlines - passed through to the codepoint iterator
 *
 * A codepoint outside of the latin-1 range without a replacement results in
 * an exception; so do an out-of-range start or length. The result is NUL
 * terminated and allocated with MVM_malloc/MVM_realloc, so it is to be
 * released with MVM_free.
 *
 * NOTE(review): the codepoint-iterator branch is initialized from `str` alone
 * and so appears to encode the whole string; `start` and `length` only size
 * its initial allocation. Confirm against MVM_string_ci_init and callers. */
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we
     * may have to resize as we go. */
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length);
    MVMuint8 *result;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    /* Encode the replacement once up front. It gets no replacement of its
     * own, so an unencodable replacement throws from the nested call. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* One extra byte is always reserved for the terminating NUL. */
    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy the requested window. The range
         * checks above guarantee start + lengthu stays within the string. */
        memcpy(result, str->body.storage.blob_ascii + start, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        MVMuint32 i = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
            /* Make room for at least one more output byte. */
            if (i == result_alloc) {
                result_alloc += 8;
                result = MVM_realloc(result, result_alloc + 1);
            }
            if (ord >= 0 && ord <= 255) {
                result[i] = (MVMuint8)ord;
                i++;
            }
            else if (replacement) {
                /* Grow when the replacement bytes might not fit. The first
                 * test keeps the unsigned subtraction in the second from
                 * wrapping around. */
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
                    result_alloc += repl_length;
                    result = MVM_realloc(result, result_alloc + 1);
                }
                memcpy(result + i, repl_bytes, repl_length);
                i += repl_length;
            }
            else {
                /* Release our buffers before throwing. */
                MVM_free(result);
                MVM_free(repl_bytes);
                MVM_exception_throw_adhoc(tc,
                    "Error encoding Latin-1 string: could not encode codepoint %d",
                    ord);
            }
        }
        result[i] = 0;
        if (output_size)
            *output_size = i;
    }
    MVM_free(repl_bytes);
    return (char *)result;
}

/* Encodes an entire string to latin-1. This is a convenience wrapper that
 * forwards to MVM_string_latin1_encode_substr with start 0, length -1 (up to
 * the end) and no replacement string, so a codepoint that does not fit in
 * latin-1 makes the call throw. The result is NUL terminated; the size
 * reported through output_size does not include the terminator. */
char * MVM_string_latin1_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
        MVMint32 translate_newlines) {
    const MVMint64 from_start      = 0;
    const MVMint64 to_end          = -1;
    MVMString     *no_replacement  = NULL;

    return MVM_string_latin1_encode_substr(tc, str, output_size,
        from_start, to_end, no_replacement, translate_newlines);
}

Coverage Report

Created: 2018-07-03 15:31

Line	Count	Source (jump to first uncovered line)
1		#include "moar.h"
2
3		/* Decodes the specified number of bytes of latin1 into an NFG string,
4		* creating a result of the specified type. The type must have the MVMString
5		* REPR. */
6		MVMString * MVM_string_latin1_decode(MVMThreadContext tc, const MVMObject result_type,
7	1.17M	char *latin1_c, size_t bytes) {
8	1.17M	MVMuint8 latin1 = (MVMuint8 )latin1_c;
9	1.17M	MVMString result = (MVMString )REPR(result_type)->allocate(tc, STABLE(result_type));
10	1.17M	size_t i, k, result_graphs;
11	1.17M
12	1.17M	MVMuint8 writing_32bit = 0;
13	1.17M
14	1.17M	result->body.storage_type = MVM_STRING_GRAPHEME_8;
15	1.17M	result->body.storage.blob_8 = MVM_malloc(sizeof(MVMint8) * bytes);
16	1.17M
17	1.17M	result_graphs = 0;
18	11.1M	for (i = 0; i < bytes; i++) {
19	9.95M	if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') {
20	0	if (writing_32bit)
21	0	result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
22	0	else
23	0	result->body.storage.blob_8[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
24	0	i++;
25	0	}
26	9.95M	else {
27	9.95M	if (latin1[i] > 127 && !writing_32bit) {
28	2.56k	MVMGrapheme8 *old_storage = result->body.storage.blob_8;
29	2.56k
30	2.56k	result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
31	2.56k	result->body.storage_type = MVM_STRING_GRAPHEME_32;
32	2.56k	writing_32bit = 1;
33	2.56k
34	11.5k	for (k = 0; k < i; k++)
35	9.01k	result->body.storage.blob_32[k] = old_storage[k];
36	2.56k	MVM_free(old_storage);
37	2.56k	}
38	9.95M	if (writing_32bit)
39	4.51k	result->body.storage.blob_32[result_graphs++] = latin1[i];
40	9.95M	else
41	9.95M	result->body.storage.blob_8[result_graphs++] = latin1[i];
42	9.95M	}
43	9.95M	}
44	1.17M	result->body.num_graphs = result_graphs;
45	1.17M
46	1.17M	return result;
47	1.17M	}
48
49		/* Decodes using a decodestream. Decodes as far as it can with the input
50		* buffers, or until a stopper is reached. */
51		MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext tc, MVMDecodeStream ds,
52		const MVMint32 *stopper_chars,
53	1	MVMDecodeStreamSeparators *seps) {
54	1	MVMint32 count = 0, total = 0;
55	1	MVMint32 bufsize;
56	1	MVMGrapheme32 *buffer;
57	1	MVMDecodeStreamBytes *cur_bytes;
58	1	MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
59	1	MVMint32 last_accept_pos, last_was_cr;
60	1	MVMuint32 reached_stopper;
61	1
62	1	/* If there's no buffers, we're done. */
63	1	if (!ds->bytes_head)
64	0	return 0;
65	1	last_accept_pos = ds->bytes_head_pos;
66	1
67	1	/* If we're asked for zero chars, also done. */
68	1	if (stopper_chars && *stopper_chars == 0)
69	0	return 1;
70	1
71	1	bufsize = ds->result_size_guess;
72	1	buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
73	1
74	1	/* Decode each of the buffers. */
75	1	cur_bytes = ds->bytes_head;
76	1	last_was_cr = 0;
77	1	reached_stopper = 0;
78	2	while (cur_bytes) {
79	1	/* Process this buffer. */
80	1	MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
81	1	unsigned char bytes = (unsigned char )cur_bytes->bytes;
82	3	while (pos < cur_bytes->length) {
83	2	MVMCodepoint codepoint = bytes[pos++];
84	2	MVMGrapheme32 graph;
85	2	if (last_was_cr) {
86	0	if (codepoint == '\n') {
87	0	graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
88	0	}
89	0	else {
90	0	graph = '\r';
91	0	pos--;
92	0	}
93	0	last_was_cr = 0;
94	0	}
95	2	else if (codepoint == '\r') {
96	0	last_was_cr = 1;
97	0	continue;
98	0	}
99	2	else {
100	2	graph = codepoint;
101	2	}
102	2	if (count == bufsize) {
103	0	/* We filled the buffer. Attach this one to the buffers
104	0	* linked list, and continue with a new one. */
105	0	MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
106	0	buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
107	0	count = 0;
108	0	}
109	2	buffer[count++] = graph;
110	2	last_accept_bytes = cur_bytes;
111	2	last_accept_pos = pos;
112	2	total++;
113	2	if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
114	0	reached_stopper = 1;
115	0	goto done;
116	0	}
117	2	else if (stopper_chars && *stopper_chars == total) {
118	0	reached_stopper = 1;
119	0	goto done;
120	0	}
121	2	}
122	1	cur_bytes = cur_bytes->next;
123	1	}
124	1	done:
125	1
126	1	/* Attach what we successfully parsed as a result buffer, and trim away
127	1	* what we chewed through. */
128	1	if (count) {
129	1	MVM_string_decodestream_add_chars(tc, ds, buffer, count);
130	1	}
131	0	else {
132	0	MVM_free(buffer);
133	0	}
134	1	MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
135	1
136	1	return reached_stopper;
137	1	}
138
139		/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
140		* will become a ?. The result string is NULL terminated, but the specified
141		* size is the non-null part. */
142		char * MVM_string_latin1_encode_substr(MVMThreadContext tc, MVMString str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
143	33.9k	MVMString *replacement, MVMint32 translate_newlines) {
144	33.9k	/* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we
145	33.9k	* may have to resize as we go. */
146	33.9k	MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
147	33.9k	MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length);
148	33.9k	MVMuint8 *result;
149	33.9k	size_t result_alloc;
150	33.9k	MVMuint8 *repl_bytes = NULL;
151	33.9k	MVMuint64 repl_length;
152	33.9k
153	33.9k	/* must check start first since it's used in the length check */
154	33.9k	if (start < 0 \|\| start > strgraphs)
155	0	MVM_exception_throw_adhoc(tc, "start out of range");
156	33.9k	if (length < -1 \|\| start + lengthu > strgraphs)
157	0	MVM_exception_throw_adhoc(tc, "length out of range");
158	33.9k
159	33.9k	if (replacement)
160	2	repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc,
161	2	replacement, &repl_length, 0, -1, NULL, translate_newlines);
162	33.9k
163	33.9k	result_alloc = lengthu;
164	33.9k	result = MVM_malloc(result_alloc + 1);
165	33.9k	if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
166	0	/* No encoding needed; directly copy. */
167	0	memcpy(result, str->body.storage.blob_ascii, lengthu);
168	0	result[lengthu] = 0;
169	0	if (output_size)
170	0	*output_size = lengthu;
171	0	}
172	33.9k	else {
173	33.9k	MVMuint32 i = 0;
174	33.9k	MVMCodepointIter ci;
175	33.9k	MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
176	412k	while (MVM_string_ci_has_more(tc, &ci)) {
177	378k	MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
178	378k	if (i == result_alloc) {
179	2	result_alloc += 8;
180	2	result = MVM_realloc(result, result_alloc + 1);
181	2	}
182	378k	if (ord >= 0 && ord <= 255) {
183	378k	result[i] = (MVMuint8)ord;
184	378k	i++;
185	378k	}
186	4	else if (replacement) {
187	4	if (repl_length >= result_alloc \|\| i >= result_alloc - repl_length) {
188	2	result_alloc += repl_length;
189	2	result = MVM_realloc(result, result_alloc + 1);
190	2	}
191	4	memcpy(result + i, repl_bytes, repl_length);
192	4	i += repl_length;
193	4	}
194	0	else {
195	0	MVM_free(result);
196	0	MVM_free(repl_bytes);
197	0	MVM_exception_throw_adhoc(tc,
198	0	"Error encoding Latin-1 string: could not encode codepoint %d",
199	0	ord);
200	0	}
201	378k	}
202	33.9k	result[i] = 0;
203	33.9k	if (output_size)
204	33.9k	*output_size = i;
205	33.9k	}
206	33.9k	MVM_free(repl_bytes);
207	33.9k	return (char *)result;
208	33.9k	}
209
210		/* Encodes the specified string to latin-1. Anything outside of latin-1 range
211		* will become a ?. The result string is NULL terminated, but the specified
212		* size is the non-null part. */
213		char * MVM_string_latin1_encode(MVMThreadContext tc, MVMString str, MVMuint64 *output_size,
214	33.9k	MVMint32 translate_newlines) {
215	33.9k	return MVM_string_latin1_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines);
216	33.9k	}