Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/ascii.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
3
/* Decodes the specified number of bytes of ASCII into an NFG string, creating
4
 * a result of the specified type. The type must have the MVMString REPR. */
5
183k
MVMString * MVM_string_ascii_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii, size_t bytes) {
6
183k
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
7
183k
    size_t i, result_graphs;
8
183k
9
183k
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
10
183k
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
11
183k
12
183k
    result_graphs = 0;
13
2.48M
    for (i = 0; i < bytes; i++) {
14
2.30M
        if (ascii[i] == '\r' && i + 1 < bytes && ascii[i + 1] == '\n') {
15
0
            result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
16
0
            i++;
17
0
        }
18
2.30M
        else if (ascii[i] >= 0) {
19
2.30M
            result->body.storage.blob_32[result_graphs++] = ascii[i];
20
2.30M
        }
21
0
        else {
22
0
            MVM_exception_throw_adhoc(tc,
23
0
                "Will not decode invalid ASCII (code point > 127 found)");
24
0
        }
25
2.30M
    }
26
183k
    result->body.num_graphs = result_graphs;
27
183k
28
183k
    return result;
29
183k
}
30
31
/* Decodes a NULL-terminated ASCII string into an NFG string, creating
32
 * a result of the specified type. The type must have the MVMString REPR. */
33
66.2k
MVMString * MVM_string_ascii_decode_nt(MVMThreadContext *tc, const MVMObject *result_type, const char *ascii) {
34
66.2k
    return MVM_string_ascii_decode(tc, result_type, ascii, strlen(ascii));
35
66.2k
}
36
37
/* Decodes using a decodestream. Decodes as far as it can with the input
38
 * buffers, or until a stopper is reached. */
39
MVMuint32 MVM_string_ascii_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
40
                                   const MVMint32 *stopper_chars,
41
1
                                   MVMDecodeStreamSeparators *seps) {
42
1
    MVMint32              count = 0, total = 0;
43
1
    MVMint32              bufsize;
44
1
    MVMGrapheme32        *buffer;
45
1
    MVMDecodeStreamBytes *cur_bytes;
46
1
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
47
1
    MVMint32 last_accept_pos, last_was_cr;
48
1
    MVMuint32 reached_stopper;
49
1
50
1
    /* If there's no buffers, we're done. */
51
1
    if (!ds->bytes_head)
52
0
        return 0;
53
1
    last_accept_pos = ds->bytes_head_pos;
54
1
55
1
    /* If we're asked for zero chars, also done. */
56
1
    if (stopper_chars && *stopper_chars == 0)
57
0
        return 1;
58
1
59
1
    bufsize = ds->result_size_guess;
60
1
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
61
1
62
1
    /* Decode each of the buffers. */
63
1
    cur_bytes = ds->bytes_head;
64
1
    last_was_cr = 0;
65
1
    reached_stopper = 0;
66
3
    while (cur_bytes) {
67
2
        /* Process this buffer. */
68
1
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
69
2
        MVMuint8 *bytes = (MVMuint8*)cur_bytes->bytes;
70
11
        while (pos < cur_bytes->length) {
71
9
            MVMCodepoint codepoint = bytes[pos++];
72
9
            MVMGrapheme32 graph;
73
9
            if (codepoint > 127)
74
0
                MVM_exception_throw_adhoc(tc,
75
0
                    "Will not decode invalid ASCII (code point > 127 found)");
76
9
            if (last_was_cr) {
77
0
                if (codepoint == '\n') {
78
0
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
79
0
                }
80
0
                else {
81
0
                    graph = '\r';
82
0
                    pos--;
83
0
                }
84
0
                last_was_cr = 0;
85
0
            }
86
9
            else if (codepoint == '\r') {
87
0
                last_was_cr = 1;
88
0
                continue;
89
0
            }
90
9
            else {
91
9
                graph = codepoint;
92
9
            }
93
9
            if (count == bufsize) {
94
0
                /* We filled the buffer. Attach this one to the buffers
95
0
                 * linked list, and continue with a new one. */
96
0
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
97
0
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
98
0
                count = 0;
99
0
            }
100
9
            buffer[count++] = graph;
101
9
            last_accept_bytes = cur_bytes;
102
9
            last_accept_pos = pos;
103
9
            total++;
104
9
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
105
0
                reached_stopper = 1;
106
0
                goto done;
107
0
            }
108
9
            else if (stopper_chars && *stopper_chars == total) {
109
0
                reached_stopper = 1;
110
0
                goto done;
111
0
            }
112
9
        }
113
2
        cur_bytes = cur_bytes->next;
114
2
    }
115
1
  done:
116
1
117
1
    /* Attach what we successfully parsed as a result buffer, and trim away
118
1
     * what we chewed through. */
119
1
    if (count) {
120
1
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
121
1
    }
122
0
    else {
123
0
        MVM_free(buffer);
124
0
    }
125
1
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
126
1
127
1
    return reached_stopper;
128
1
}
129
130
/* Encodes the specified substring to ASCII. Anything outside of ASCII range
131
 * will become replaced with the supplied replacement, or an exception will be
132
 * thrown if there isn't one. The result string is NULL terminated, but the
133
 * specified size is the non-null part. */
134
1.96k
char * MVM_string_ascii_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) {
135
1.96k
    /* ASCII is a single byte encoding, but \r\n is a 2-byte grapheme, so we
136
1.96k
     * may have to resize as we go. */
137
1.96k
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
138
1.95k
    MVMuint32      lengthu   = (MVMuint32)(length == -1 ? strgraphs - (MVMuint32)start : length);
139
1.96k
    MVMuint8      *result;
140
1.96k
    size_t         result_alloc;
141
1.96k
    MVMuint8      *repl_bytes = NULL;
142
1.96k
    MVMuint64      repl_length;
143
1.96k
144
1.96k
    /* must check start first since it's used in the length check */
145
1.96k
    if (start < 0 || start > strgraphs)
146
0
        MVM_exception_throw_adhoc(tc, "start out of range");
147
1.96k
    if (length < -1 || start + lengthu > strgraphs)
148
0
        MVM_exception_throw_adhoc(tc, "length out of range");
149
1.96k
150
1.96k
    if (replacement)
151
2
        repl_bytes = (MVMuint8 *) MVM_string_ascii_encode_substr(tc, replacement,
152
2
            &repl_length, 0, -1, NULL, translate_newlines);
153
1.96k
154
1.96k
    result_alloc = lengthu;
155
1.96k
    result = MVM_malloc(result_alloc + 1);
156
1.96k
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
157
0
        /* No encoding needed; directly copy. */
158
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
159
0
        result[lengthu] = 0;
160
0
        if (output_size)
161
0
            *output_size = lengthu;
162
0
    }
163
1.96k
    else {
164
1.96k
        MVMuint32 i = 0;
165
1.96k
        MVMCodepointIter ci;
166
1.96k
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
167
33.6k
        while (MVM_string_ci_has_more(tc, &ci)) {
168
31.6k
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
169
31.6k
            if (i == result_alloc) {
170
2
                result_alloc += 8;
171
2
                result = MVM_realloc(result, result_alloc + 1);
172
2
            }
173
31.6k
            if (0 <= ord && ord <= 127) {
174
31.6k
                result[i++] = (MVMuint8)ord;
175
31.6k
            }
176
5
            else if (replacement) {
177
4
                if (repl_length >= result_alloc || i >= result_alloc - repl_length) {
178
2
                    result_alloc += repl_length;
179
2
                    result = MVM_realloc(result, result_alloc + 1);
180
2
                }
181
4
                memcpy(result + i, repl_bytes, repl_length);
182
4
                i += repl_length;
183
4
            }
184
1
            else {
185
1
                MVM_free(result);
186
1
                MVM_free(repl_bytes);
187
1
                MVM_exception_throw_adhoc(tc,
188
1
                    "Error encoding ASCII string: could not encode codepoint %d",
189
1
                    ord);
190
1
            }
191
31.6k
        }
192
1.96k
        result[i] = 0;
193
1.96k
        if (output_size)
194
23
            *output_size = i;
195
1.96k
    }
196
1.96k
197
1.96k
    if (repl_bytes) MVM_free(repl_bytes);
198
1.96k
    return (char *)result;
199
1.96k
}
200
201
/* Encodes the specified string to ASCII.  */
202
1.95k
char * MVM_string_ascii_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint32 translate_newlines) {
203
1.95k
    return MVM_string_ascii_encode_substr(tc, str, output_size, 0, -1, NULL, translate_newlines);
204
1.95k
}
205
206
/* Encodes the specified string to ASCII not returning length.  */
207
0
char * MVM_string_ascii_encode_any(MVMThreadContext *tc, MVMString *str) {
208
0
    return MVM_string_ascii_encode(tc, str, NULL, 0);
209
0
}