Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/latin1.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Decodes the specified number of bytes of latin1 into an NFG string,
4
 * creating a result of the specified type. The type must have the MVMString
5
 * REPR. */
6
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type,
7
1.02M
                                     char *latin1_c, size_t bytes) {
8
1.02M
    MVMuint8  *latin1 = (MVMuint8 *)latin1_c;
9
1.02M
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
10
1.02M
    size_t i, k, result_graphs;
11
1.02M
12
1.02M
    MVMuint8 writing_32bit = 0;
13
1.02M
14
1.02M
    result->body.storage_type   = MVM_STRING_GRAPHEME_8;
15
1.02M
    result->body.storage.blob_8 = MVM_malloc(sizeof(MVMint8) * bytes);
16
1.02M
17
1.02M
    result_graphs = 0;
18
9.55M
    for (i = 0; i < bytes; i++) {
19
8.53M
        if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') {
20
0
            if (writing_32bit)
21
0
                result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
22
0
            else
23
0
                result->body.storage.blob_8[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
24
0
            i++;
25
0
        }
26
8.53M
        else {
27
8.53M
            if (latin1[i] > 127 && !writing_32bit) {
28
2.36k
                MVMGrapheme8 *old_storage = result->body.storage.blob_8;
29
2.36k
30
2.36k
                result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
31
2.36k
                result->body.storage_type = MVM_STRING_GRAPHEME_32;
32
2.36k
                writing_32bit = 1;
33
2.36k
34
10.5k
                for (k = 0; k < i; k++)
35
8.15k
                    result->body.storage.blob_32[k] = old_storage[k];
36
2.36k
                MVM_free(old_storage);
37
2.36k
            }
38
8.53M
            if (writing_32bit)
39
4.07k
                result->body.storage.blob_32[result_graphs++] = latin1[i];
40
8.53M
            else
41
8.52M
                result->body.storage.blob_8[result_graphs++] = latin1[i];
42
8.53M
        }
43
8.53M
    }
44
1.02M
    result->body.num_graphs = result_graphs;
45
1.02M
46
1.02M
    return result;
47
1.02M
}
48
49
/* Decodes using a decodestream. Decodes as far as it can with the input
50
 * buffers, or until a stopper is reached. */
51
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
52
                                    const MVMint32 *stopper_chars,
53
1
                                    MVMDecodeStreamSeparators *seps) {
54
1
    MVMint32 count = 0, total = 0;
55
1
    MVMint32 bufsize;
56
1
    MVMGrapheme32 *buffer;
57
1
    MVMDecodeStreamBytes *cur_bytes;
58
1
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
59
1
    MVMint32 last_accept_pos, last_was_cr;
60
1
    MVMuint32 reached_stopper;
61
1
62
1
    /* If there's no buffers, we're done. */
63
1
    if (!ds->bytes_head)
64
0
        return 0;
65
1
    last_accept_pos = ds->bytes_head_pos;
66
1
67
1
    /* If we're asked for zero chars, also done. */
68
1
    if (stopper_chars && *stopper_chars == 0)
69
0
        return 1;
70
1
71
1
    /* Take length of head buffer as initial guess. */
72
1
    bufsize = ds->bytes_head->length;
73
1
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
74
1
75
1
    /* Decode each of the buffers. */
76
1
    cur_bytes = ds->bytes_head;
77
1
    last_was_cr = 0;
78
1
    reached_stopper = 0;
79
2
    while (cur_bytes) {
80
1
        /* Process this buffer. */
81
1
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
82
1
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
83
3
        while (pos < cur_bytes->length) {
84
2
            MVMCodepoint codepoint = bytes[pos++];
85
2
            MVMGrapheme32 graph;
86
2
            if (last_was_cr) {
87
0
                if (codepoint == '\n') {
88
0
                    graph = MVM_nfg_crlf_grapheme(tc);
89
0
                }
90
0
                else {
91
0
                    graph = '\r';
92
0
                    pos--;
93
0
                }
94
0
                last_was_cr = 0;
95
0
            }
96
2
            else if (codepoint == '\r') {
97
0
                last_was_cr = 1;
98
0
                continue;
99
0
            }
100
2
            else {
101
2
                graph = codepoint;
102
2
            }
103
2
            if (count == bufsize) {
104
0
                /* We filled the buffer. Attach this one to the buffers
105
0
                 * linked list, and continue with a new one. */
106
0
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
107
0
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
108
0
                count = 0;
109
0
            }
110
2
            buffer[count++] = graph;
111
2
            last_accept_bytes = cur_bytes;
112
2
            last_accept_pos = pos;
113
2
            total++;
114
2
            if (stopper_chars && *stopper_chars == total) {
115
0
                reached_stopper = 1;
116
0
                goto done;
117
0
            }
118
2
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
119
0
                reached_stopper = 1;
120
0
                goto done;
121
0
            }
122
2
        }
123
1
        cur_bytes = cur_bytes->next;
124
1
    }
125
1
  done:
126
1
127
1
    /* Attach what we successfully parsed as a result buffer, and trim away
128
1
     * what we chewed through. */
129
1
    if (count) {
130
1
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
131
1
    }
132
0
    else {
133
0
        MVM_free(buffer);
134
0
    }
135
1
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
136
1
137
1
    return reached_stopper;
138
1
}
139
140
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
141
 * will become a ?. The result string is NULL terminated, but the specified
142
 * size is the non-null part. */
143
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
144
32.4k
        MVMString *replacement, MVMint32 translate_newlines) {
145
32.4k
    /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we
146
32.4k
     * may have to resize as we go. */
147
32.4k
    MVMuint32 startu = (MVMuint32)start;
148
32.4k
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
149
32.4k
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
150
32.4k
    MVMuint8 *result;
151
32.4k
    size_t result_alloc;
152
32.4k
    MVMuint8 *repl_bytes = NULL;
153
32.4k
    MVMuint64 repl_length;
154
32.4k
155
32.4k
    /* must check start first since it's used in the length check */
156
32.4k
    if (start < 0 || start > strgraphs)
157
0
        MVM_exception_throw_adhoc(tc, "start out of range");
158
32.4k
    if (length < -1 || start + lengthu > strgraphs)
159
0
        MVM_exception_throw_adhoc(tc, "length out of range");
160
32.4k
161
32.4k
    if (replacement)
162
0
        repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc,
163
0
            replacement, &repl_length, 0, -1, NULL, translate_newlines);
164
32.4k
165
32.4k
    result_alloc = lengthu;
166
32.4k
    result = MVM_malloc(result_alloc + 1);
167
32.4k
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
168
0
        /* No encoding needed; directly copy. */
169
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
170
0
        result[lengthu] = 0;
171
0
        if (output_size)
172
0
            *output_size = lengthu;
173
0
    }
174
32.4k
    else {
175
32.4k
        MVMuint32 i = 0;
176
32.4k
        MVMCodepointIter ci;
177
32.4k
        MVM_string_ci_init(tc, &ci, str, translate_newlines);
178
400k
        while (MVM_string_ci_has_more(tc, &ci)) {
179
368k
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
180
368k
            if (i == result_alloc) {
181
0
                result_alloc += 8;
182
0
                result = MVM_realloc(result, result_alloc + 1);
183
0
            }
184
368k
            if (ord >= 0 && ord <= 255) {
185
368k
                result[i] = (MVMuint8)ord;
186
368k
                i++;
187
368k
            }
188
0
            else if (replacement) {
189
0
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
190
0
                    result_alloc += repl_length;
191
0
                    result = MVM_realloc(result, result_alloc + 1);
192
0
                }
193
0
                memcpy(result + i, repl_bytes, repl_length);
194
0
                i += repl_length;
195
0
            }
196
0
            else {
197
0
                MVM_free(result);
198
0
                MVM_free(repl_bytes);
199
0
                MVM_exception_throw_adhoc(tc,
200
0
                    "Error encoding Latin-1 string: could not encode codepoint %d",
201
0
                    ord);
202
0
            }
203
368k
        }
204
32.4k
        result[i] = 0;
205
32.4k
        if (output_size)
206
32.4k
            *output_size = i;
207
32.4k
    }
208
32.4k
    MVM_free(repl_bytes);
209
32.4k
    return (char *)result;
210
32.4k
}
211
212
/* Encodes the specified string to latin-1. Anything outside of latin-1 range
213
 * will become a ?. The result string is NULL terminated, but the specified
214
 * size is the non-null part. */
215
char * MVM_string_latin1_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
216
32.4k
        MVMint32 translate_newlines) {
217
32.4k
    return MVM_string_latin1_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines);
218
32.4k
}