Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/shiftjis.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
#include "shiftjis_codeindex.h"
3
/* Encodes the specified substring to ShiftJIS as specified here:
4
 * https://encoding.spec.whatwg.org/#shift_jis-decoder
5
 * The result string is NULL terminated, but the specified size is the non-null part. */
6
char * MVM_string_shiftjis_encode_substr(MVMThreadContext *tc, MVMString *str,
7
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
8
0
        MVMint32 translate_newlines, MVMint64 config) {
9
0
    MVMuint32 startu = (MVMuint32)start;
10
0
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
11
0
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
12
0
    MVMuint8 *result = NULL;
13
0
    size_t result_alloc;
14
0
    MVMuint8 *repl_bytes = NULL;
15
0
    MVMuint64 repl_length;
16
0
17
0
    /* must check start first since it's used in the length check */
18
0
    if (start < 0 || start > strgraphs)
19
0
        MVM_exception_throw_adhoc(tc, "start out of range");
20
0
    if (length < -1 || start + lengthu > strgraphs)
21
0
        MVM_exception_throw_adhoc(tc, "length out of range");
22
0
23
0
    if (replacement)
24
0
        repl_bytes = (MVMuint8 *) MVM_string_shiftjis_encode_substr(tc,
25
0
            replacement, &repl_length, 0, -1, NULL, translate_newlines, config);
26
0
27
0
    result_alloc = lengthu;
28
0
    result = MVM_malloc(result_alloc + 1);
29
0
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
30
0
        /* No encoding needed; directly copy. */
31
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
32
0
        result[lengthu] = 0;
33
0
        if (output_size)
34
0
            *output_size = lengthu;
35
0
    }
36
0
    else {
37
0
        MVMuint32 out_pos = 0;
38
0
        MVMCodepointIter ci;
39
0
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
40
0
        while (MVM_string_ci_has_more(tc, &ci)) {
41
0
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
42
0
            if (result_alloc <= out_pos + 1) {
43
0
                result_alloc += 8;
44
0
                result = MVM_realloc(result, result_alloc + 2);
45
0
            }
46
0
            /* If code point is an ASCII code point or U+0080, return a byte
47
0
             * whose value is code point. */
48
0
            if (codepoint <= 0x7F || codepoint == 0x80) {
49
0
                result[out_pos++] = codepoint;
50
0
            }
51
0
            /* If code point is U+00A5, return byte 0x5C. */
52
0
            else if (codepoint == 0xA5) {
53
0
                result[out_pos++] = 0x5C;
54
0
            }
55
0
            /* If code point is U+203E, return byte 0x7E. */
56
0
            else if (codepoint == 0x203E) {
57
0
                result[out_pos++] = 0x7E;
58
0
            }
59
0
            /* If code point is in the range U+FF61 to U+FF9F, inclusive, return
60
0
             * a byte whose value is code point − 0xFF61 + 0xA1. */
61
0
            else if (0xFF61 <= codepoint && codepoint <= 0xFF9F) {
62
0
                result[out_pos++] = codepoint - 0xFF61 + 0xA1;
63
0
            }
64
0
            else {
65
0
                MVMint16 pointer;
66
0
                unsigned int lead, lead_offset, trail, offset;
67
0
                /* If code point is U+2212, set it to U+FF0D. */
68
0
                if (codepoint == 0x2212) {
69
0
                    codepoint = 0xFF0D;
70
0
                }
71
0
                /* Let pointer be the index Shift_JIS pointer for code point. */
72
0
                pointer = shift_jis_cp_to_index(tc, codepoint);
73
0
                /* If pointer is null, return error with code point. */
74
0
                if (pointer == SHIFTJIS_NULL) {
75
0
                    if (replacement) {
76
0
                        size_t i;
77
0
                        if (result_alloc <= out_pos + repl_length) {
78
0
                            result_alloc += repl_length;
79
0
                            result = MVM_realloc(result, result_alloc + 1);
80
0
                        }
81
0
                        for (i = 0; i < repl_length; i++) {
82
0
                            result[out_pos++] = repl_bytes[i];
83
0
                        }
84
0
                        continue;
85
0
                    }
86
0
                    else {
87
0
                        MVM_free(result);
88
0
                        MVM_exception_throw_adhoc(tc,
89
0
                            "Error encoding shiftjis string: could not encode codepoint %d",
90
0
                             codepoint);
91
0
                    }
92
0
                }
93
0
                /* Let lead be pointer / 188. */
94
0
                lead = pointer/188;
95
0
                /* Let lead offset be 0x81, if lead is less than 0x1F, and 0xC1
96
0
                 * otherwise. */
97
0
                lead_offset = lead < 0x1F ? 0x81 : 0xC1;
98
0
                /* Let trail be pointer % 188 */
99
0
                trail = pointer % 188;
100
0
                /* Let offset be 0x40, if trail is less than 0x3F, and 0x41 otherwise. */
101
0
                offset = trail < 0x3F ? 0x40 : 0x41;
102
0
                /* Return two bytes whose values are lead + lead offset and
103
0
                 * trail + offset. */
104
0
                result[out_pos++] = lead + lead_offset;
105
0
                result[out_pos++] = trail + offset;
106
0
            }
107
0
108
0
        }
109
0
        result[out_pos] = 0;
110
0
        if (output_size)
111
0
            *output_size = out_pos;
112
0
    }
113
0
114
0
    MVM_free(repl_bytes);
115
0
    return (char *)result;
116
0
}
117
0
/* Status codes returned by decoder_handler(). The values are parenthesized so
 * the leading minus sign cannot bind to neighbouring tokens when a macro is
 * expanded inside a larger expression. */
#define DECODE_ERROR             (-1) /* input could not be decoded */
#define DECODE_CONTINUE          (-2) /* lead byte stored; another byte is needed */
#define DECODE_CODEPOINT         (-4) /* a code point was written to *out */
#define DECODE_PREPEND_TO_STREAM (-5) /* *out holds a byte to run through the decoder again */
121
0
/* Runs one input byte through the Shift_JIS decoder state machine
 * (https://encoding.spec.whatwg.org/#shift_jis-decoder).
 *
 * Shift_JIS_lead - in/out decoder state: 0x00 when no lead byte is pending,
 *                  otherwise the lead byte of a two-byte sequence. It is
 *                  cleared whenever a pending lead is consumed.
 * byte           - the byte to process.
 * out            - receives the decoded code point (DECODE_CODEPOINT) or the
 *                  byte to re-process (DECODE_PREPEND_TO_STREAM); untouched
 *                  for the other return values.
 *
 * Returns one of the DECODE_* status codes. */
static int decoder_handler (MVMThreadContext *tc, MVMuint8 *Shift_JIS_lead, MVMuint8 byte, MVMCodepoint *out) {
    const MVMuint8 lead = *Shift_JIS_lead;

    /* No lead pending: byte is either a complete single-byte character, the
     * first half of a two-byte character, or invalid. */
    if (lead == 0x00) {
        /* ASCII bytes and 0x80 map to themselves. */
        if (byte <= 0x80) {
            *out = byte;
            return DECODE_CODEPOINT;
        }
        /* 0xA1..0xDF map linearly onto U+FF61..U+FF9F. */
        if (0xA1 <= byte && byte <= 0xDF) {
            *out = 0xFF61 - 0xA1 + byte;
            return DECODE_CODEPOINT;
        }
        /* 0x81..0x9F and 0xE0..0xFC open a two-byte sequence. */
        if ((0x81 <= byte && byte <= 0x9F) || (0xE0 <= byte && byte <= 0xFC)) {
            *Shift_JIS_lead = byte;
            return DECODE_CONTINUE;
        }
        return DECODE_ERROR;
    }

    /* A lead is pending: byte is the trail byte. The lead is consumed whatever
     * the outcome. */
    {
        MVMint16 pointer = SHIFTJIS_NULL;
        *Shift_JIS_lead = 0x00;

        /* Only trail bytes in 0x40..0x7E or 0x80..0xFC yield an index pointer:
         * (lead - lead offset) * 188 + byte - offset. */
        if ((0x40 <= byte && byte <= 0x7E) || (0x80 <= byte && byte <= 0xFC)) {
            const MVMuint8 trail_offset = byte < 0x7F ? 0x40 : 0x41;
            const MVMuint8 lead_offset  = lead < 0xA0 ? 0x81 : 0xC1;
            pointer = (lead - lead_offset) * 188 + byte - trail_offset;
        }

        /* Pointers 8836..10715 are mapped arithmetically, starting at U+E000. */
        if (8836 <= pointer && pointer <= 10715) {
            *out = 0xE000 - 8836 + pointer;
            return DECODE_CODEPOINT;
        }

        /* Any other non-null pointer is looked up in the index table. */
        if (pointer != SHIFTJIS_NULL) {
            MVMGrapheme32 mapped = shift_jis_index_to_cp(tc, pointer);
            if (mapped != SHIFTJIS_NULL) {
                *out = mapped;
                return DECODE_CODEPOINT;
            }
        }

        /* No mapping. An ASCII trail byte is handed back so the caller can
         * decode it again on its own.
         * NOTE(review): the WHATWG algorithm also reports an error for the
         * consumed lead byte at this point; here the lead is dropped silently.
         * Changing that needs matching changes in the callers. */
        if (byte <= 0x7F) {
            *out = byte;
            return DECODE_PREPEND_TO_STREAM;
        }
        return DECODE_ERROR;
    }
}
187
MVMString * MVM_string_shiftjis_decode(MVMThreadContext *tc,
188
        const MVMObject *result_type, char *windows125X_c, size_t num_bytes,
189
0
        MVMString *replacement, MVMint64 config) {
190
0
    MVMuint8 *bytes = (MVMuint8 *)windows125X_c;
191
0
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
192
0
    size_t pos = 0, result_graphs, additional_bytes = 0;
193
0
    MVMStringIndex repl_length = replacement ? MVM_string_graphs(tc, replacement) : 0;
194
0
    MVMuint8 Shift_JIS_lead = 0x00;
195
0
    /* Stores a byte that we must run through the decoder a second time. Instead
196
0
     * of prepending to the last position of the buffer we just store it so as not
197
0
     * to modify the buffer. */
198
0
    MVMuint8 prepended = 0;
199
0
    int is_prepended   = 0;
200
0
    MVMStringIndex repl_pos = 0;
201
0
    int last_was_cr         = 0;
202
0
    MVMStringIndex result_size = num_bytes;
203
0
204
0
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
205
0
    /* TODO allocate less? */
206
0
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * result_size);
207
0
208
0
    result_graphs = 0;
209
0
    while (pos < num_bytes || repl_pos) {
210
0
        MVMGrapheme32 graph;
211
0
        MVMGrapheme32 codepoint;
212
0
        MVMuint8 byte;
213
0
        int graph_is_set = 0;
214
0
        if (repl_pos) {
215
0
            graph = MVM_string_get_grapheme_at_nocheck(tc, replacement, repl_pos++);
216
0
            graph_is_set = 1;
217
0
            if (repl_length <= repl_pos) repl_pos = 0;
218
0
        }
219
0
        else if (is_prepended) {
220
0
            byte = prepended;
221
0
            is_prepended = 0;
222
0
        }
223
0
        else {
224
0
            byte = bytes[pos++];
225
0
        }
226
0
        /* graph_is_set will be 0 unless we just grabbed a replacement grapheme */
227
0
        if (!graph_is_set) {
228
0
            int handler_rtrn = decoder_handler(tc, &Shift_JIS_lead, byte, &codepoint);
229
0
            if (handler_rtrn == DECODE_CODEPOINT) {
230
0
                graph = codepoint;
231
0
            }
232
0
            else if (handler_rtrn == DECODE_CONTINUE) {
233
0
                continue;
234
0
            }
235
0
            else if (handler_rtrn == DECODE_ERROR) {
236
0
                /* Clearing this seems like the right thing to do, in case
237
0
                 * a replacement is used. */
238
0
                Shift_JIS_lead = 0x00;
239
0
                if (replacement) {
240
0
                    graph = MVM_string_get_grapheme_at_nocheck(tc, replacement, repl_pos);
241
0
                    /* If the replacement is more than one grapheme we need
242
0
                     * to set repl_pos++ so we will grab the next grapheme on
243
0
                     * the next loop */
244
0
                    if (1 < repl_length) repl_pos++;
245
0
                }
246
0
                else {
247
0
                    /* Throw if it's unmapped */
248
0
                    MVM_exception_throw_adhoc(tc,
249
0
                        "Error decoding shiftjis string: could not decode byte 0x%hhX",
250
0
                         byte);
251
0
                }
252
0
            }
253
0
            else if (handler_rtrn == DECODE_PREPEND_TO_STREAM) {
254
0
                is_prepended = 1;
255
0
                prepended    = codepoint;
256
0
                continue;
257
0
            }
258
0
            else {
259
0
                MVM_exception_throw_adhoc(tc, "shiftjis decoder encountered an internal error.\n");
260
0
            }
261
0
        }
262
0
        if (last_was_cr) {
263
0
            if (graph == '\n') {
264
0
                graph = MVM_nfg_crlf_grapheme(tc);
265
0
            }
266
0
            else {
267
0
                graph = '\r';
268
0
                pos--;
269
0
            }
270
0
            last_was_cr = 0;
271
0
        }
272
0
        else if (graph == '\r') {
273
0
            last_was_cr = 1;
274
0
            continue;
275
0
        }
276
0
        if (result_graphs == result_size) {
277
0
            result_size += repl_length;
278
0
            result->body.storage.blob_32 = MVM_realloc(result->body.storage.blob_32,
279
0
                result_size * sizeof(MVMGrapheme32));
280
0
        }
281
0
        result->body.storage.blob_32[result_graphs++] = graph;
282
0
    }
283
0
    /* If we end up with Shift_JIS_lead still set, that means we're missing a byte
284
0
     * that should have followed it. */
285
0
    if (Shift_JIS_lead != 0x00) {
286
0
        MVM_exception_throw_adhoc(tc, "Error, ended decode of shiftjis expecting another byte. "
287
0
            "Last byte seen was 0x%hhX\n", Shift_JIS_lead);
288
0
    }
289
0
    result->body.storage.blob_32 = MVM_realloc(result->body.storage.blob_32,
290
0
        result_graphs * sizeof(MVMGrapheme32));
291
0
    result->body.num_graphs = result_graphs;
292
0
293
0
    return result;
294
0
}
295
/* Decodes using a decodestream. Decodes as far as it can with the input
296
 * buffers, or until a stopper is reached. */
297
MVMuint32 MVM_string_shiftjis_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
298
                                         const MVMint32 *stopper_chars,
299
0
                                         MVMDecodeStreamSeparators *seps) {
300
0
    MVMint32 count = 0, total = 0;
301
0
    MVMint32 bufsize;
302
0
    MVMGrapheme32 *buffer = NULL;
303
0
    MVMDecodeStreamBytes *cur_bytes = NULL;
304
0
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
305
0
    MVMint32 last_accept_pos, last_was_cr;
306
0
    MVMuint32 reached_stopper;
307
0
    MVMStringIndex repl_length = ds->replacement ? MVM_string_graphs(tc, ds->replacement) : 0;
308
0
    MVMStringIndex repl_pos = 0;
309
0
    MVMuint8 Shift_JIS_lead = 0x00;
310
0
    MVMuint8 prepended = 0;
311
0
    int is_prepended = 0;
312
0
    /* If there's no buffers, we're done. */
313
0
    if (!ds->bytes_head)
314
0
        return 0;
315
0
    last_accept_pos = ds->bytes_head_pos;
316
0
317
0
    /* If we're asked for zero chars, also done. */
318
0
    if (stopper_chars && *stopper_chars == 0)
319
0
        return 1;
320
0
321
0
    bufsize = ds->result_size_guess;
322
0
    buffer  = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
323
0
324
0
    /* Decode each of the buffers. */
325
0
    cur_bytes = ds->bytes_head;
326
0
    last_was_cr = 0;
327
0
    reached_stopper = 0;
328
0
    while (cur_bytes) {
329
0
        /* Process this buffer. */
330
0
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
331
0
        MVMuint8 *bytes = (MVMuint8 *)cur_bytes->bytes;
332
0
        while (pos < cur_bytes->length || repl_pos) {
333
0
            MVMGrapheme32 graph;
334
0
            MVMCodepoint codepoint;
335
0
            MVMuint8 byte;
336
0
            int graph_is_set = 0;
337
0
            int handler_rtrn = 0;
338
0
            if (repl_pos) {
339
0
                graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos++);
340
0
                graph_is_set = 1;
341
0
                if (repl_length <= repl_pos) repl_pos = 0;
342
0
            }
343
0
            else if (is_prepended) {
344
0
                byte = prepended;
345
0
                is_prepended = 0;
346
0
            }
347
0
            else {
348
0
                byte = bytes[pos++];
349
0
            }
350
0
            /* graph_is_set will be 0 unless we just grabbed a replacement grapheme */
351
0
            if (!graph_is_set) {
352
0
                handler_rtrn = decoder_handler(tc, &Shift_JIS_lead, byte, &codepoint);
353
0
                if (handler_rtrn == DECODE_CODEPOINT) {
354
0
                    graph = codepoint;
355
0
                }
356
0
                else if (handler_rtrn == DECODE_CONTINUE) {
357
0
                    continue;
358
0
                }
359
0
                else if (handler_rtrn == DECODE_ERROR) {
360
0
                    /* Clearing this seems like the right thing to do, in case
361
0
                     * a replacement is used. */
362
0
                    Shift_JIS_lead = 0x00;
363
0
                    if (ds->replacement) {
364
0
                        graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos);
365
0
                        /* If the replacement is more than one grapheme we need
366
0
                         * to set repl_pos++ so we will grab the next grapheme on
367
0
                         * the next loop */
368
0
                        if (1 < repl_length) repl_pos++;
369
0
                    }
370
0
                    else {
371
0
                        /* Throw if it's unmapped */
372
0
                        MVM_free(buffer);
373
0
                        MVM_exception_throw_adhoc(tc,
374
0
                            "Error decoding shiftjis string: could not byte 0x%hhx",
375
0
                             byte);
376
0
                    }
377
0
                }
378
0
                else if (handler_rtrn == DECODE_PREPEND_TO_STREAM) {
379
0
                    is_prepended = 1;
380
0
                    prepended    = codepoint;
381
0
                    continue;
382
0
                }
383
0
                else {
384
0
                    MVM_exception_throw_adhoc(tc, "shiftjis decoder encountered an internal error. This bug should be reported.\n");
385
0
                }
386
0
            }
387
0
            if (last_was_cr) {
388
0
                if (graph == '\n') {
389
0
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
390
0
                }
391
0
                else {
392
0
                    graph = '\r';
393
0
                    pos--;
394
0
                }
395
0
                last_was_cr = 0;
396
0
            }
397
0
            else if (graph == '\r') {
398
0
                last_was_cr = 1;
399
0
                continue;
400
0
            }
401
0
            if (count == bufsize) {
402
0
                /* We filled the buffer. Attach this one to the buffers
403
0
                 * linked list, and continue with a new one. */
404
0
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
405
0
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
406
0
                count = 0;
407
0
            }
408
0
            buffer[count++]   = graph;
409
0
            last_accept_bytes = cur_bytes;
410
0
            last_accept_pos   = pos;
411
0
            total++;
412
0
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
413
0
                reached_stopper = 1;
414
0
                goto done;
415
0
            }
416
0
            else if (stopper_chars && *stopper_chars == total) {
417
0
                reached_stopper = 1;
418
0
                goto done;
419
0
            }
420
0
        }
421
0
        cur_bytes = cur_bytes->next;
422
0
    }
423
0
  done:
424
0
    /* Attach what we successfully parsed as a result buffer, and trim away
425
0
     * what we chewed through. */
426
0
    if (count) {
427
0
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
428
0
    }
429
0
    else {
430
0
        MVM_free(buffer);
431
0
    }
432
0
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
433
0
    return reached_stopper;
434
0
}