Coverage Report

Created: 2017-04-15 07:07

/home/travis/build/MoarVM/MoarVM/src/strings/ascii.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Decodes the specified number of bytes of ASCII into an NFG string, creating
4
 * a result of the specified type. The type must have the MVMString REPR. */
5
154k
MVMString * MVM_string_ascii_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii, size_t bytes) {
6
154k
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
7
154k
    size_t i, result_graphs;
8
154k
9
154k
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
10
154k
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
11
154k
12
154k
    result_graphs = 0;
13
2.13M
    for (i = 0; i < bytes; i++) {
14
1.98M
        if (ascii[i] == '\r' && i + 1 < bytes && ascii[i + 1] == '\n') {
15
0
            result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
16
0
            i++;
17
0
        }
18
1.98M
        else if (ascii[i] >= 0) {
19
1.98M
            result->body.storage.blob_32[result_graphs++] = ascii[i];
20
1.98M
        }
21
0
        else {
22
0
            MVM_exception_throw_adhoc(tc,
23
0
                "Will not decode invalid ASCII (code point > 127 found)");
24
0
        }
25
1.98M
    }
26
154k
    result->body.num_graphs = result_graphs;
27
154k
28
154k
    return result;
29
154k
}
30
31
/* Decodes a NULL-terminated ASCII string into an NFG string, creating
32
 * a result of the specified type. The type must have the MVMString REPR. */
33
55.0k
MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii) {
34
55.0k
    return MVM_string_ascii_decode(tc, result_type, ascii, strlen(ascii));
35
55.0k
}
36
37
/* Decodes using a decodestream. Decodes as far as it can with the input
38
 * buffers, or until a stopper is reached. */
39
MVMuint32 MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
40
                                   const MVMint32 *stopper_chars,
41
1
                                   MVMDecodeStreamSeparators *seps) {
42
1
    MVMint32              count = 0, total = 0;
43
1
    MVMint32              bufsize;
44
1
    MVMGrapheme32        *buffer;
45
1
    MVMDecodeStreamBytes *cur_bytes;
46
1
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
47
1
    MVMint32 last_accept_pos, last_was_cr;
48
1
    MVMuint32 reached_stopper;
49
1
50
1
    /* If there's no buffers, we're done. */
51
1
    if (!ds->bytes_head)
52
0
        return 0;
53
1
    last_accept_pos = ds->bytes_head_pos;
54
1
55
1
    /* If we're asked for zero chars, also done. */
56
1
    if (stopper_chars && *stopper_chars == 0)
57
0
        return 1;
58
1
59
1
    /* Take length of head buffer as initial guess. */
60
1
    bufsize = ds->bytes_head->length;
61
1
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
62
1
63
1
    /* Decode each of the buffers. */
64
1
    cur_bytes = ds->bytes_head;
65
1
    last_was_cr = 0;
66
1
    reached_stopper = 0;
67
3
    while (cur_bytes) {
68
2
        /* Process this buffer. */
69
1
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
70
2
        char     *bytes = cur_bytes->bytes;
71
11
        while (pos < cur_bytes->length) {
72
9
            MVMCodepoint codepoint = bytes[pos++];
73
9
            MVMGrapheme32 graph;
74
9
            if (codepoint > 127)
75
0
                MVM_exception_throw_adhoc(tc,
76
0
                    "Will not decode invalid ASCII (code point > 127 found)");
77
9
            if (last_was_cr) {
78
0
                if (codepoint == '\n') {
79
0
                    graph = MVM_nfg_crlf_grapheme(tc);
80
0
                }
81
0
                else {
82
0
                    graph = '\r';
83
0
                    pos--;
84
0
                }
85
0
                last_was_cr = 0;
86
0
            }
87
9
            else if (codepoint == '\r') {
88
0
                last_was_cr = 1;
89
0
                continue;
90
0
            }
91
9
            else {
92
9
                graph = codepoint;
93
9
            }
94
9
            if (count == bufsize) {
95
2
                /* We filled the buffer. Attach this one to the buffers
96
2
                 * linked list, and continue with a new one. */
97
2
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
98
2
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
99
2
                count = 0;
100
2
            }
101
9
            buffer[count++] = graph;
102
9
            last_accept_bytes = cur_bytes;
103
9
            last_accept_pos = pos;
104
9
            total++;
105
9
            if (stopper_chars && *stopper_chars == total) {
106
0
                reached_stopper = 1;
107
0
                goto done;
108
0
            }
109
9
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
110
0
                reached_stopper = 1;
111
0
                goto done;
112
0
            }
113
9
        }
114
2
        cur_bytes = cur_bytes->next;
115
2
    }
116
1
  done:
117
1
118
1
    /* Attach what we successfully parsed as a result buffer, and trim away
119
1
     * what we chewed through. */
120
1
    if (count) {
121
1
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
122
1
    }
123
0
    else {
124
0
        MVM_free(buffer);
125
0
    }
126
1
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
127
1
128
1
    return reached_stopper;
129
1
}
130
131
/* Encodes the specified substring to ASCII. Anything outside of ASCII range
132
 * will become replaced with the supplied replacement, or an exception will be
133
 * thrown if there isn't one. The result string is NULL terminated, but the
134
 * specified size is the non-null part. */
135
4.37k
char * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) {
136
4.37k
    /* ASCII is a single byte encoding, but \r\n is a 2-byte grapheme, so we
137
4.37k
     * may have to resize as we go. */
138
4.37k
    MVMuint32      startu    = (MVMuint32)start;
139
4.37k
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
140
4.37k
    MVMuint32      lengthu   = (MVMuint32)(length == -1 ? strgraphs - startu : length);
141
4.37k
    MVMuint8      *result;
142
4.37k
    size_t         result_alloc;
143
4.37k
    MVMuint8      *repl_bytes = NULL;
144
4.37k
    MVMuint64      repl_length;
145
4.37k
146
4.37k
    /* must check start first since it's used in the length check */
147
4.37k
    if (start < 0 || start > strgraphs)
148
0
        MVM_exception_throw_adhoc(tc, "start out of range");
149
4.37k
    if (length < -1 || start + lengthu > strgraphs)
150
0
        MVM_exception_throw_adhoc(tc, "length out of range");
151
4.37k
152
4.37k
    if (replacement)
153
0
        repl_bytes = (MVMuint8 *) MVM_string_ascii_encode_substr(tc, replacement,
154
0
            &repl_length, 0, -1, NULL, translate_newlines);
155
4.37k
156
4.37k
    result_alloc = lengthu;
157
4.37k
    result = MVM_malloc(result_alloc + 1);
158
4.37k
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
159
0
        /* No encoding needed; directly copy. */
160
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
161
0
        result[lengthu] = 0;
162
0
        if (output_size)
163
0
            *output_size = lengthu;
164
0
    }
165
4.37k
    else {
166
4.37k
        MVMuint32 i = 0;
167
4.37k
        MVMCodepointIter ci;
168
4.37k
        MVM_string_ci_init(tc, &ci, str, translate_newlines);
169
43.6k
        while (MVM_string_ci_has_more(tc, &ci)) {
170
39.2k
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
171
39.2k
            if (i == result_alloc) {
172
0
                result_alloc += 8;
173
0
                result = MVM_realloc(result, result_alloc + 1);
174
0
            }
175
39.2k
            if (ord >= 0 && ord <= 127) {
176
39.2k
                result[i] = (MVMuint8)ord;
177
39.2k
                i++;
178
39.2k
            }
179
0
            else if (replacement) {
180
0
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
181
0
                    result_alloc += repl_length;
182
0
                    result = MVM_realloc(result, result_alloc + 1);
183
0
                }
184
0
                memcpy(result + i, repl_bytes, repl_length);
185
0
                i += repl_length;
186
0
            }
187
0
            else {
188
0
                MVM_free(result);
189
0
                MVM_free(repl_bytes);
190
0
                MVM_exception_throw_adhoc(tc,
191
0
                    "Error encoding ASCII string: could not encode codepoint %d",
192
0
                    ord);
193
0
            }
194
39.2k
        }
195
4.37k
        result[i] = 0;
196
4.37k
        if (output_size)
197
2
            *output_size = i;
198
4.37k
    }
199
4.37k
200
4.37k
    MVM_free(repl_bytes);
201
4.37k
    return (char *)result;
202
4.37k
}
203
204
/* Encodes the specified string to ASCII.  */
205
4.37k
char * MVM_string_ascii_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint32 translate_newlines) {
206
4.37k
    return MVM_string_ascii_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines);
207
4.37k
}
208
209
/* Encodes the specified string to ASCII not returning length.  */
210
0
char * MVM_string_ascii_encode_any(MVMThreadContext *tc, MVMString *str) {
211
0
    return MVM_string_ascii_encode(tc, str, NULL, 0);
212
0
}