Coverage Report

Created: 2018-07-03 15:31

/home/travis/build/MoarVM/MoarVM/src/strings/windows1252.c
Line
Count
Source (jump to first uncovered line)
1
#include "moar.h"
2
60
#define UNMAPPED 0xFFFF
3
4
/* Windows-1252 Latin */
5
static const MVMuint16 windows1252_codepoints[] = {
6
    0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
7
    0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
8
    0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
9
    0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
10
    0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
11
    0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
12
    0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
13
    0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
14
    0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
15
    0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
16
    0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
17
    0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
18
    0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
19
    0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
20
    0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
21
    0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
22
    0x20AC,0xFFFF,0x201A,0x0192,0x201E,0x2026,0x2020,0x2021,
23
    0x02C6,0x2030,0x0160,0x2039,0x0152,0xFFFF,0x017D,0xFFFF,
24
    0xFFFF,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014,
25
    0x02DC,0x2122,0x0161,0x203A,0x0153,0xFFFF,0x017E,0x0178,
26
    0x00A0,0x00A1,0x00A2,0x00A3,0x00A4,0x00A5,0x00A6,0x00A7,
27
    0x00A8,0x00A9,0x00AA,0x00AB,0x00AC,0x00AD,0x00AE,0x00AF,
28
    0x00B0,0x00B1,0x00B2,0x00B3,0x00B4,0x00B5,0x00B6,0x00B7,
29
    0x00B8,0x00B9,0x00BA,0x00BB,0x00BC,0x00BD,0x00BE,0x00BF,
30
    0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7,
31
    0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF,
32
    0x00D0,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x00D7,
33
    0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x00DD,0x00DE,0x00DF,
34
    0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7,
35
    0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF,
36
    0x00F0,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x00F7,
37
    0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x00FD,0x00FE,0x00FF
38
};
39
/* Windows-1251 Cyrillic */
40
static const MVMuint16 windows1251_codepoints[] = {
41
    0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007,
42
    0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F,
43
    0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017,
44
    0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F,
45
    0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027,
46
    0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F,
47
    0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,
48
    0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F,
49
    0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047,
50
    0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F,
51
    0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057,
52
    0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F,
53
    0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067,
54
    0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F,
55
    0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077,
56
    0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F,
57
    0x0402,0x0403,0x201A,0x0453,0x201E,0x2026,0x2020,0x2021,
58
    0x20AC,0x2030,0x0409,0x2039,0x040A,0x040C,0x040B,0x040F,
59
    0x0452,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014,
60
    0xFFFF,0x2122,0x0459,0x203A,0x045A,0x045C,0x045B,0x045F,
61
    0x00A0,0x040E,0x045E,0x0408,0x00A4,0x0490,0x00A6,0x00A7,
62
    0x0401,0x00A9,0x0404,0x00AB,0x00AC,0x00AD,0x00AE,0x0407,
63
    0x00B0,0x00B1,0x0406,0x0456,0x0491,0x00B5,0x00B6,0x00B7,
64
    0x0451,0x2116,0x0454,0x00BB,0x0458,0x0405,0x0455,0x0457,
65
    0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417,
66
    0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,0x041F,
67
    0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427,
68
    0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,0x042F,
69
    0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437,
70
    0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E,0x043F,
71
    0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447,
72
    0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,0x044E,0x044F
73
};
74
32
/* Maps a Unicode codepoint to the Windows-1252 byte that represents it.
 * Returns '\0' for every codepoint this function has no entry for; that
 * includes everything below 160, so the caller is expected to deal with the
 * ASCII range itself. */
static MVMuint8 windows1252_cp_to_char(MVMint32 codepoint) {
    /* Nothing negative and nothing above U+2122 (the largest entry) maps. */
    if (codepoint < 0 || 0x2122 < codepoint)
        return '\0';

    /* U+00A0..U+00FF are encoded as the byte with the same value. */
    if (0x00A0 <= codepoint && codepoint <= 0x00FF)
        return (MVMuint8)codepoint;

    /* Remaining entries, listed in order of the byte they produce. */
    switch (codepoint) {
        case 0x20AC: return 0x80;
        case 0x201A: return 0x82;
        case 0x0192: return 0x83;
        case 0x201E: return 0x84;
        case 0x2026: return 0x85;
        case 0x2020: return 0x86;
        case 0x2021: return 0x87;
        case 0x02C6: return 0x88;
        case 0x2030: return 0x89;
        case 0x0160: return 0x8A;
        case 0x2039: return 0x8B;
        case 0x0152: return 0x8C;
        case 0x017D: return 0x8E;
        case 0x2018: return 0x91;
        case 0x2019: return 0x92;
        case 0x201C: return 0x93;
        case 0x201D: return 0x94;
        case 0x2022: return 0x95;
        case 0x2013: return 0x96;
        case 0x2014: return 0x97;
        case 0x02DC: return 0x98;
        case 0x2122: return 0x99;
        case 0x0161: return 0x9A;
        case 0x203A: return 0x9B;
        case 0x0153: return 0x9C;
        case 0x017E: return 0x9E;
        case 0x0178: return 0x9F;
        default:     return '\0';
    }
}
204
0
/* Maps a Unicode codepoint to the Windows-1251 byte that represents it.
 * Returns '\0' for every codepoint this function has no entry for; that
 * includes everything below 160, so the caller is expected to deal with the
 * ASCII range itself. */
static MVMuint8 windows1251_cp_to_char(MVMint32 codepoint) {
    /* Nothing negative and nothing above U+2122 (the largest entry) maps. */
    if (codepoint < 0 || 0x2122 < codepoint)
        return '\0';

    /* U+0410..U+044F form one contiguous run that lands on bytes
     * 0xC0..0xFF in the same order. */
    if (0x0410 <= codepoint && codepoint <= 0x044F)
        return (MVMuint8)(codepoint - 0x0410 + 0xC0);

    switch (codepoint) {
        /* Codepoints that are encoded as the byte with the same value. */
        case 0x00A0: case 0x00A4: case 0x00A6: case 0x00A7:
        case 0x00A9: case 0x00AB: case 0x00AC: case 0x00AD:
        case 0x00AE: case 0x00B0: case 0x00B1: case 0x00B5:
        case 0x00B6: case 0x00B7: case 0x00BB:
            return (MVMuint8)codepoint;

        /* Remaining entries, listed in order of the byte they produce. */
        case 0x0402: return 0x80;
        case 0x0403: return 0x81;
        case 0x201A: return 0x82;
        case 0x0453: return 0x83;
        case 0x201E: return 0x84;
        case 0x2026: return 0x85;
        case 0x2020: return 0x86;
        case 0x2021: return 0x87;
        case 0x20AC: return 0x88;
        case 0x2030: return 0x89;
        case 0x0409: return 0x8A;
        case 0x2039: return 0x8B;
        case 0x040A: return 0x8C;
        case 0x040C: return 0x8D;
        case 0x040B: return 0x8E;
        case 0x040F: return 0x8F;
        case 0x0452: return 0x90;
        case 0x2018: return 0x91;
        case 0x2019: return 0x92;
        case 0x201C: return 0x93;
        case 0x201D: return 0x94;
        case 0x2022: return 0x95;
        case 0x2013: return 0x96;
        case 0x2014: return 0x97;
        case 0x2122: return 0x99;
        case 0x0459: return 0x9A;
        case 0x203A: return 0x9B;
        case 0x045A: return 0x9C;
        case 0x045C: return 0x9D;
        case 0x045B: return 0x9E;
        case 0x045F: return 0x9F;
        case 0x040E: return 0xA1;
        case 0x045E: return 0xA2;
        case 0x0408: return 0xA3;
        case 0x0490: return 0xA5;
        case 0x0401: return 0xA8;
        case 0x0404: return 0xAA;
        case 0x0407: return 0xAF;
        case 0x0406: return 0xB2;
        case 0x0456: return 0xB3;
        case 0x0491: return 0xB4;
        case 0x0451: return 0xB8;
        case 0x2116: return 0xB9;
        case 0x0454: return 0xBA;
        case 0x0458: return 0xBC;
        case 0x0405: return 0xBD;
        case 0x0455: return 0xBE;
        case 0x0457: return 0xBF;
        default:     return '\0';
    }
}
338
339
/* Decodes using a decodestream. Decodes as far as it can with the input
340
 * buffers, or until a stopper is reached. */
341
MVMuint32 MVM_string_windows125X_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
342
                                         const MVMint32 *stopper_chars,
343
                                         MVMDecodeStreamSeparators *seps,
344
0
                                         const MVMuint16 *codetable) {
345
0
    MVMint32 count = 0, total = 0;
346
0
    MVMint32 bufsize;
347
0
    MVMGrapheme32 *buffer = NULL;
348
0
    MVMDecodeStreamBytes *cur_bytes = NULL;
349
0
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
350
0
    MVMint32 last_accept_pos, last_was_cr;
351
0
    MVMuint32 reached_stopper;
352
0
    MVMStringIndex repl_length = ds->replacement ? MVM_string_graphs(tc, ds->replacement) : 0;
353
0
    MVMStringIndex repl_pos = 0;
354
0
355
0
    /* If there's no buffers, we're done. */
356
0
    if (!ds->bytes_head)
357
0
        return 0;
358
0
    last_accept_pos = ds->bytes_head_pos;
359
0
360
0
    /* If we're asked for zero chars, also done. */
361
0
    if (stopper_chars && *stopper_chars == 0)
362
0
        return 1;
363
0
364
0
    bufsize = ds->result_size_guess;
365
0
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
366
0
367
0
    /* Decode each of the buffers. */
368
0
    cur_bytes = ds->bytes_head;
369
0
    last_was_cr = 0;
370
0
    reached_stopper = 0;
371
0
    while (cur_bytes) {
372
0
        /* Process this buffer. */
373
0
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
374
0
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
375
0
        while (pos < cur_bytes->length || repl_pos) {
376
0
            MVMGrapheme32 graph;
377
0
            MVMCodepoint codepoint = codetable[bytes[pos++]];
378
0
            if (repl_pos) {
379
0
                graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos++);
380
0
                if (repl_length <= repl_pos) repl_pos = 0;
381
0
                pos--;
382
0
            }
383
0
            else if (codepoint == UNMAPPED) {
384
0
                if (MVM_ENCODING_CONFIG_STRICT(ds->config)) {
385
0
                    if (ds->replacement) {
386
0
                        graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos);
387
0
                        /* If the replacement is more than one grapheme we need
388
0
                         * to set repl_pos++ so we will grab the next grapheme on
389
0
                         * the next loop */
390
0
                        if (1 < repl_length) repl_pos++;
391
0
                    }
392
0
                    else {
393
0
                        /* Throw if it's unmapped */
394
0
                        char *enc_name = codetable == windows1252_codepoints
395
0
                            ? "Windows-1252" : "Windows-1251";
396
0
                        MVM_free(buffer);
397
0
                        MVM_exception_throw_adhoc(tc,
398
0
                            "Error decoding %s string: could not decode codepoint %d",
399
0
                             enc_name,
400
0
                             bytes[pos - 1]);
401
0
                    }
402
0
                }
403
0
                else {
404
0
                    /* Set it without translating, even though it creates
405
0
                     * standards uncompliant results */
406
0
                    graph = bytes[pos-1];
407
0
                }
408
0
            }
409
0
            else if (last_was_cr) {
410
0
                if (codepoint == '\n') {
411
0
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
412
0
                }
413
0
                else {
414
0
                    graph = '\r';
415
0
                    pos--;
416
0
                }
417
0
                last_was_cr = 0;
418
0
            }
419
0
            else if (codepoint == '\r') {
420
0
                last_was_cr = 1;
421
0
                continue;
422
0
            }
423
0
            else {
424
0
                graph = codepoint;
425
0
            }
426
0
            if (count == bufsize) {
427
0
                /* We filled the buffer. Attach this one to the buffers
428
0
                 * linked list, and continue with a new one. */
429
0
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
430
0
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
431
0
                count = 0;
432
0
            }
433
0
            buffer[count++] = graph;
434
0
            last_accept_bytes = cur_bytes;
435
0
            last_accept_pos = pos;
436
0
            total++;
437
0
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
438
0
                reached_stopper = 1;
439
0
                goto done;
440
0
            }
441
0
            else if (stopper_chars && *stopper_chars == total) {
442
0
                reached_stopper = 1;
443
0
                goto done;
444
0
            }
445
0
        }
446
0
        cur_bytes = cur_bytes->next;
447
0
    }
448
0
  done:
449
0
450
0
    /* Attach what we successfully parsed as a result buffer, and trim away
451
0
     * what we chewed through. */
452
0
    if (count) {
453
0
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
454
0
    }
455
0
    else {
456
0
        MVM_free(buffer);
457
0
    }
458
0
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
459
0
460
0
    return reached_stopper;
461
0
}
462
/* Decodes using a decodestream. Decodes as far as it can with the input
463
 * buffers, or until a stopper is reached. */
464
MVMuint32 MVM_string_windows1252_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
465
                                         const MVMint32 *stopper_chars,
466
0
                                         MVMDecodeStreamSeparators *seps) {
467
0
    return MVM_string_windows125X_decodestream(tc, ds, stopper_chars, seps, windows1252_codepoints);
468
0
}
469
/* Decodes using a decodestream. Decodes as far as it can with the input
470
 * buffers, or until a stopper is reached. */
471
MVMuint32 MVM_string_windows1251_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
472
                                         const MVMint32 *stopper_chars,
473
0
                                         MVMDecodeStreamSeparators *seps) {
474
0
    return MVM_string_windows125X_decodestream(tc, ds, stopper_chars, seps, windows1251_codepoints);
475
0
}
476
477
/* Decodes the specified number of bytes of windows1252 into an NFG string,
478
 * creating a result of the specified type. The type must have the MVMString
479
 * REPR. */
480
MVMString * MVM_string_windows125X_decode(MVMThreadContext *tc,
481
        const MVMObject *result_type, char *windows125X_c, size_t bytes,
482
7
        MVMString *replacement, const MVMuint16 *codetable, MVMint64 config) {
483
7
    MVMuint8 *windows125X = (MVMuint8 *)windows125X_c;
484
7
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
485
7
    size_t pos, result_graphs, additional_bytes = 0;
486
5
    MVMStringIndex repl_length = replacement ? MVM_string_graphs(tc, replacement) : 0;
487
7
488
7
    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
489
7
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
490
7
491
7
    result_graphs = 0;
492
67
    for (pos = 0; pos < bytes; pos++) {
493
60
        MVMGrapheme32 codepoint;
494
60
        if (windows125X[pos] == '\r' && pos + 1 < bytes && windows125X[pos + 1] == '\n') {
495
0
            codepoint = MVM_nfg_crlf_grapheme(tc);
496
0
            pos++;
497
0
        }
498
60
        else {
499
60
            codepoint = codetable[windows125X[pos]];
500
60
            if (codepoint == UNMAPPED) {
501
6
                /* Since things we are decoding always fit into Unicode, if we are
502
6
                 * using a replacement, it won't get used unless we use strict */
503
6
                if (replacement && MVM_ENCODING_CONFIG_STRICT(config)) {
504
3
                    int i = 0;
505
3
                    /* Only triggered if repl_length > 1. Copies all but the last
506
3
                     * grapheme in the replacement string */
507
3
                    if (1 < repl_length) {
508
3
                        additional_bytes += repl_length - 1;
509
3
                        result->body.storage.blob_32 = realloc(result->body.storage.blob_32,
510
3
                            sizeof(MVMGrapheme32) * (additional_bytes + bytes));
511
15
                        for (; i < repl_length - 1; i++) {
512
12
                            MVMGrapheme32 graph = MVM_string_get_grapheme_at(tc, replacement, i);
513
12
                            result->body.storage.blob_32[result_graphs++] = graph;
514
12
                        }
515
3
                    }
516
3
                    /* Now we set `codepoint` to the last grapheme in the replacement
517
3
                     * and proceed normally from here. */
518
3
                    codepoint = MVM_string_get_grapheme_at(tc, replacement, i);
519
3
                }
520
3
                else if (MVM_ENCODING_CONFIG_STRICT(config)) {
521
0
                    /* Throw an exception if that codepoint has no mapping */
522
0
                    char *enc_name = codetable == windows1252_codepoints
523
0
                        ? "Windows-1252" : "Windows-1251";
524
0
                    MVM_exception_throw_adhoc(tc,
525
0
                        "Error decoding %s string: could not decode codepoint %d",
526
0
                         enc_name,
527
0
                         windows125X[pos]);
528
0
                }
529
3
                else {
530
3
                    /* Don't convert and just map to identical. This creates
531
3
                     * standards uncompliant results, but will decode buggy
532
3
                     * input */
533
3
                    codepoint = windows125X[pos];
534
3
                }
535
6
            }
536
60
        }
537
60
        result->body.storage.blob_32[result_graphs++] = codepoint;
538
60
    }
539
7
    result->body.num_graphs = result_graphs;
540
7
541
7
    return result;
542
7
}
543
MVMString * MVM_string_windows1252_decode(MVMThreadContext *tc,
544
0
        const MVMObject *result_type, char *windows125X_c, size_t bytes) {
545
0
    return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, NULL, windows1252_codepoints, MVM_ENCODING_PERMISSIVE);
546
0
}
547
MVMString * MVM_string_windows1251_decode(MVMThreadContext *tc,
548
0
        const MVMObject *result_type, char *windows125X_c, size_t bytes) {
549
0
    return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, NULL, windows1251_codepoints, MVM_ENCODING_PERMISSIVE);
550
0
}
551
MVMString * MVM_string_windows1252_decode_config(MVMThreadContext *tc,
552
7
        const MVMObject *result_type, char *windows125X_c, size_t bytes, MVMString *replacement, MVMint64 config) {
553
7
    return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, replacement, windows1252_codepoints, config);
554
7
}
555
MVMString * MVM_string_windows1251_decode_config(MVMThreadContext *tc,
556
0
        const MVMObject *result_type, char *windows125X_c, size_t bytes, MVMString *replacement, MVMint64 config) {
557
0
    return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, replacement, windows1251_codepoints, config);
558
0
}
559
/* Encodes the specified substring to Windows-1252 or Windows-1251. It is passed
560
 * in the encoding, as well as the function that resolves Unicode to the result
561
 * encoding. Anything not in range will cause an exception unless a replacement
562
 * string is supplied. The result string is NULL terminated, but the specified
563
 * size is the non-null part. */
564
char * MVM_string_windows125X_encode_substr(MVMThreadContext *tc, MVMString *str,
565
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
566
13
        MVMint32 translate_newlines, MVMuint8(*cp_to_char)(MVMint32), MVMint64 config) {
567
13
    /* Windows-1252 and Windows-1251 are single byte encodings, so each grapheme
568
13
     * will just become a single byte. */
569
13
    MVMuint32 startu = (MVMuint32)start;
570
13
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
571
9
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
572
13
    MVMuint8 *result  = NULL;
573
13
    size_t result_alloc;
574
13
    MVMuint8 *repl_bytes = NULL;
575
13
    MVMuint64 repl_length;
576
13
577
13
    /* must check start first since it's used in the length check */
578
13
    if (start < 0 || strgraphs < start)
579
0
        MVM_exception_throw_adhoc(tc, "start out of range");
580
13
    if (length < -1 || strgraphs < start + lengthu)
581
0
        MVM_exception_throw_adhoc(tc, "length out of range");
582
13
583
13
    if (replacement)
584
4
        repl_bytes = (MVMuint8 *) MVM_string_windows125X_encode_substr(tc,
585
4
            replacement, &repl_length, 0, -1, NULL, translate_newlines, cp_to_char, config);
586
13
587
13
    result_alloc = lengthu;
588
13
    result = MVM_malloc(result_alloc + 1);
589
13
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
590
0
        /* No encoding needed; directly copy. */
591
0
        memcpy(result, str->body.storage.blob_ascii, lengthu);
592
0
        result[lengthu] = 0;
593
0
        if (output_size)
594
0
            *output_size = lengthu;
595
0
    }
596
13
    else {
597
13
        MVMuint32 pos = 0;
598
13
        MVMCodepointIter ci;
599
13
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
600
87
        while (MVM_string_ci_has_more(tc, &ci)) {
601
74
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
602
74
            if (result_alloc <= pos) {
603
3
                result_alloc += 8;
604
3
                result = MVM_realloc(result, result_alloc + 1);
605
3
            }
606
74
            /* If it's within ASCII just pass it through */
607
74
            if (0 <= codepoint && codepoint <= 127) {
608
42
                result[pos] = (MVMuint8)codepoint;
609
42
                pos++;
610
42
            }
611
32
            else if ((result[pos] = cp_to_char(codepoint)) != '\0') {
612
19
                pos++;
613
19
            }
614
32
            /* If we have a replacement and are we either have it set to strict,
615
32
             * or the codepoint can't fit within one byte, insert a replacement */
616
13
            else if (replacement && (MVM_ENCODING_CONFIG_STRICT(config) || codepoint < 0 || 255 < codepoint)) {
617
6
                if (result_alloc <= pos + repl_length) {
618
3
                    result_alloc += repl_length;
619
3
                    result = MVM_realloc(result, result_alloc + 1);
620
3
                }
621
6
                memcpy(result + pos, repl_bytes, repl_length);
622
6
                pos += repl_length;
623
6
            }
624
7
            else {
625
7
                /* If we're decoding strictly or the codepoint cannot fit in
626
7
                 * one byte, throw an exception */
627
7
                if (MVM_ENCODING_CONFIG_STRICT(config) || codepoint < 0 || 255 < codepoint) {
628
2
                    char *enc_name = cp_to_char == windows1252_cp_to_char
629
2
                        ? "Windows-1252" : "Windows-1251";
630
2
                    MVM_free(result);
631
2
                    MVM_free(repl_bytes);
632
2
                    MVM_exception_throw_adhoc(tc,
633
2
                        "Error encoding %s string: could not encode codepoint %d",
634
2
                         enc_name,
635
2
                         codepoint);
636
2
                }
637
7
                /* It fits in one byte and we're not decoding strictly, so pass
638
7
                 * it through unchanged */
639
5
                else {
640
5
                    result[pos++] = codepoint;
641
5
                }
642
7
            }
643
74
        }
644
13
        result[pos] = 0;
645
13
        if (output_size)
646
11
            *output_size = pos;
647
13
    }
648
13
649
13
    MVM_free(repl_bytes);
650
13
    return (char *)result;
651
13
}
652
char * MVM_string_windows1252_encode_substr(MVMThreadContext *tc, MVMString *str,
653
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
654
0
        MVMint32 translate_newlines) {
655
0
    return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1252_cp_to_char, MVM_ENCODING_PERMISSIVE);
656
0
}
657
char * MVM_string_windows1251_encode_substr(MVMThreadContext *tc, MVMString *str,
658
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
659
0
        MVMint32 translate_newlines) {
660
0
    return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1251_cp_to_char, MVM_ENCODING_PERMISSIVE);
661
0
}
662
char * MVM_string_windows1252_encode_substr_config(MVMThreadContext *tc, MVMString *str,
663
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
664
9
        MVMint32 translate_newlines, MVMint64 config) {
665
9
    return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1252_cp_to_char, config);
666
9
}
667
char * MVM_string_windows1251_encode_substr_config(MVMThreadContext *tc, MVMString *str,
668
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
669
0
        MVMint32 translate_newlines, MVMint64 config) {
670
0
    return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1251_cp_to_char, config);
671
0
}