Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/latin1.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Decodes the specified number of bytes of latin1 into an NFG string,
4
 * creating a result of the specified type. The type must have the MVMString
5
 * REPR. */
6
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type,
7
1.17M
                                     char *latin1_c, size_t bytes) {
8
1.17M
    MVMuint8  *latin1 = (MVMuint8 *)latin1_c;
9
1.17M
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
10
1.17M
    size_t i, k, result_graphs;
11
1.17M
12
1.17M
    MVMuint8 writing_32bit = 0;
13
1.17M
14
1.17M
    result->body.storage_type   = MVM_STRING_GRAPHEME_8;
15
1.17M
    result->body.storage.blob_8 = MVM_malloc(sizeof(MVMint8) * bytes);
16
1.17M
17
1.17M
    result_graphs = 0;
18
11.1M
    for (i = 0; i < bytes; i++) {
19
9.95M
        if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') {
20
0
            if (writing_32bit)
21
0
                result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
22
0
            else
23
0
                result->body.storage.blob_8[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
24
0
            i++;
25
0
        }
26
9.95M
        else {
27
9.95M
            if (latin1[i] > 127 && !writing_32bit) {
28
2.56k
                MVMGrapheme8 *old_storage = result->body.storage.blob_8;
29
2.56k
30
2.56k
                result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
31
2.56k
                result->body.storage_type = MVM_STRING_GRAPHEME_32;
32
2.56k
                writing_32bit = 1;
33
2.56k
34
11.5k
                for (k = 0; k < i; k++)
35
9.01k
                    result->body.storage.blob_32[k] = old_storage[k];
36
2.56k
                MVM_free(old_storage);
37
2.56k
            }
38
9.95M
            if (writing_32bit)
39
4.51k
                result->body.storage.blob_32[result_graphs++] = latin1[i];
40
9.95M
            else
41
9.95M
                result->body.storage.blob_8[result_graphs++] = latin1[i];
42
9.95M
        }
43
9.95M
    }
44
1.17M
    result->body.num_graphs = result_graphs;
45
1.17M
46
1.17M
    return result;
47
1.17M
}
48
49
/* Decodes using a decodestream. Decodes as far as it can with the input
50
 * buffers, or until a stopper is reached. */
51
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
52
                                    const MVMint32 *stopper_chars,
53
1
                                    MVMDecodeStreamSeparators *seps) {
54
1
    MVMint32 count = 0, total = 0;
55
1
    MVMint32 bufsize;
56
1
    MVMGrapheme32 *buffer;
57
1
    MVMDecodeStreamBytes *cur_bytes;
58
1
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
59
1
    MVMint32 last_accept_pos, last_was_cr;
60
1
    MVMuint32 reached_stopper;
61
1
62
1
    /* If there's no buffers, we're done. */
63
1
    if (!ds->bytes_head)
64
0
        return 0;
65
1
    last_accept_pos = ds->bytes_head_pos;
66
1
67
1
    /* If we're asked for zero chars, also done. */
68
1
    if (stopper_chars && *stopper_chars == 0)
69
0
        return 1;
70
1
71
1
    bufsize = ds->result_size_guess;
72
1
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
73
1
74
1
    /* Decode each of the buffers. */
75
1
    cur_bytes = ds->bytes_head;
76
1
    last_was_cr = 0;
77
1
    reached_stopper = 0;
78
2
    while (cur_bytes) {
79
1
        /* Process this buffer. */
80
1
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
81
1
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
82
3
        while (pos < cur_bytes->length) {
83
2
            MVMCodepoint codepoint = bytes[pos++];
84
2
            MVMGrapheme32 graph;
85
2
            if (last_was_cr) {
86
0
                if (codepoint == '\n') {
87
0
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
88
0
                }
89
0
                else {
90
0
                    graph = '\r';
91
0
                    pos--;
92
0
                }
93
0
                last_was_cr = 0;
94
0
            }
95
2
            else if (codepoint == '\r') {
96
0
                last_was_cr = 1;
97
0
                continue;
98
0
            }
99
2
            else {
100
2
                graph = codepoint;
101
2
            }
102
2
            if (count == bufsize) {
103
0
                /* We filled the buffer. Attach this one to the buffers
104
0
                 * linked list, and continue with a new one. */
105
0
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
106
0
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
107
0
                count = 0;
108
0
            }
109
2
            buffer[count++] = graph;
110
2
            last_accept_bytes = cur_bytes;
111
2
            last_accept_pos = pos;
112
2
            total++;
113
2
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
114
0
                reached_stopper = 1;
115
0
                goto done;
116
0
            }
117
2
            else if (stopper_chars && *stopper_chars == total) {
118
0
                reached_stopper = 1;
119
0
                goto done;
120
0
            }
121
2
        }
122
1
        cur_bytes = cur_bytes->next;
123
1
    }
124
1
  done:
125
1
126
1
    /* Attach what we successfully parsed as a result buffer, and trim away
127
1
     * what we chewed through. */
128
1
    if (count) {
129
1
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
130
1
    }
131
0
    else {
132
0
        MVM_free(buffer);
133
0
    }
134
1
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
135
1
136
1
    return reached_stopper;
137
1
}
138
139
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
140
 * will become a ?. The result string is NULL terminated, but the specified
141
 * size is the non-null part. */
142
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
143
33.9k
        MVMString *replacement, MVMint32 translate_newlines) {
144
33.9k
    /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we
145
33.9k
     * may have to resize as we go. */
146
33.9k
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
147
33.9k
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length);
148
33.9k
    MVMuint8 *result;
149
33.9k
    size_t result_alloc;
150
33.9k
    MVMuint8 *repl_bytes = NULL;
151
33.9k
    MVMuint64 repl_length;
152
33.9k
153
33.9k
    /* must check start first since it's used in the length check */
154
33.9k
    if (start < 0 || start > strgraphs)
155
0
        MVM_exception_throw_adhoc(tc, "start out of range");
156
33.9k
    if (length < -1 || start + lengthu > strgraphs)
157
0
        MVM_exception_throw_adhoc(tc, "length out of range");
158
33.9k
159
33.9k
    if (replacement)
160
2
        repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc,
161
2
            replacement, &repl_length, 0, -1, NULL, translate_newlines);
162
33.9k
163
33.9k
    result_alloc = lengthu;
164
33.9k
    result = MVM_malloc(result_alloc + 1);
165
33.9k
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
166
0
        /* No encoding needed; directly copy. */
167
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
168
0
        result[lengthu] = 0;
169
0
        if (output_size)
170
0
            *output_size = lengthu;
171
0
    }
172
33.9k
    else {
173
33.9k
        MVMuint32 i = 0;
174
33.9k
        MVMCodepointIter ci;
175
33.9k
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
176
412k
        while (MVM_string_ci_has_more(tc, &ci)) {
177
378k
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
178
378k
            if (i == result_alloc) {
179
2
                result_alloc += 8;
180
2
                result = MVM_realloc(result, result_alloc + 1);
181
2
            }
182
378k
            if (ord >= 0 && ord <= 255) {
183
378k
                result[i] = (MVMuint8)ord;
184
378k
                i++;
185
378k
            }
186
4
            else if (replacement) {
187
4
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
188
2
                    result_alloc += repl_length;
189
2
                    result = MVM_realloc(result, result_alloc + 1);
190
2
                }
191
4
                memcpy(result + i, repl_bytes, repl_length);
192
4
                i += repl_length;
193
4
            }
194
0
            else {
195
0
                MVM_free(result);
196
0
                MVM_free(repl_bytes);
197
0
                MVM_exception_throw_adhoc(tc,
198
0
                    "Error encoding Latin-1 string: could not encode codepoint %d",
199
0
                    ord);
200
0
            }
201
378k
        }
202
33.9k
        result[i] = 0;
203
33.9k
        if (output_size)
204
33.9k
            *output_size = i;
205
33.9k
    }
206
33.9k
    MVM_free(repl_bytes);
207
33.9k
    return (char *)result;
208
33.9k
}
209
210
/* Encodes the specified string to latin-1. Anything outside of latin-1 range
211
 * will become a ?. The result string is NULL terminated, but the specified
212
 * size is the non-null part. */
213
char * MVM_string_latin1_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
214
33.9k
        MVMint32 translate_newlines) {
215
33.9k
    return MVM_string_latin1_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines);
216
33.9k
}