/home/travis/build/MoarVM/MoarVM/src/strings/windows1252.c
Line | Count | Source (jump to first uncovered line) |
1 | | #include "moar.h" |
2 | 60 | #define UNMAPPED 0xFFFF |
3 | | |
4 | | /* Windows-1252 Latin */ |
5 | | static const MVMuint16 windows1252_codepoints[] = { |
6 | | 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007, |
7 | | 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F, |
8 | | 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017, |
9 | | 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F, |
10 | | 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027, |
11 | | 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F, |
12 | | 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037, |
13 | | 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F, |
14 | | 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047, |
15 | | 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F, |
16 | | 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057, |
17 | | 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F, |
18 | | 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067, |
19 | | 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F, |
20 | | 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077, |
21 | | 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F, |
22 | | 0x20AC,0xFFFF,0x201A,0x0192,0x201E,0x2026,0x2020,0x2021, |
23 | | 0x02C6,0x2030,0x0160,0x2039,0x0152,0xFFFF,0x017D,0xFFFF, |
24 | | 0xFFFF,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014, |
25 | | 0x02DC,0x2122,0x0161,0x203A,0x0153,0xFFFF,0x017E,0x0178, |
26 | | 0x00A0,0x00A1,0x00A2,0x00A3,0x00A4,0x00A5,0x00A6,0x00A7, |
27 | | 0x00A8,0x00A9,0x00AA,0x00AB,0x00AC,0x00AD,0x00AE,0x00AF, |
28 | | 0x00B0,0x00B1,0x00B2,0x00B3,0x00B4,0x00B5,0x00B6,0x00B7, |
29 | | 0x00B8,0x00B9,0x00BA,0x00BB,0x00BC,0x00BD,0x00BE,0x00BF, |
30 | | 0x00C0,0x00C1,0x00C2,0x00C3,0x00C4,0x00C5,0x00C6,0x00C7, |
31 | | 0x00C8,0x00C9,0x00CA,0x00CB,0x00CC,0x00CD,0x00CE,0x00CF, |
32 | | 0x00D0,0x00D1,0x00D2,0x00D3,0x00D4,0x00D5,0x00D6,0x00D7, |
33 | | 0x00D8,0x00D9,0x00DA,0x00DB,0x00DC,0x00DD,0x00DE,0x00DF, |
34 | | 0x00E0,0x00E1,0x00E2,0x00E3,0x00E4,0x00E5,0x00E6,0x00E7, |
35 | | 0x00E8,0x00E9,0x00EA,0x00EB,0x00EC,0x00ED,0x00EE,0x00EF, |
36 | | 0x00F0,0x00F1,0x00F2,0x00F3,0x00F4,0x00F5,0x00F6,0x00F7, |
37 | | 0x00F8,0x00F9,0x00FA,0x00FB,0x00FC,0x00FD,0x00FE,0x00FF |
38 | | }; |
39 | | /* Windows-1251 Cyrillic */ |
40 | | static const MVMuint16 windows1251_codepoints[] = { |
41 | | 0x0000,0x0001,0x0002,0x0003,0x0004,0x0005,0x0006,0x0007, |
42 | | 0x0008,0x0009,0x000A,0x000B,0x000C,0x000D,0x000E,0x000F, |
43 | | 0x0010,0x0011,0x0012,0x0013,0x0014,0x0015,0x0016,0x0017, |
44 | | 0x0018,0x0019,0x001A,0x001B,0x001C,0x001D,0x001E,0x001F, |
45 | | 0x0020,0x0021,0x0022,0x0023,0x0024,0x0025,0x0026,0x0027, |
46 | | 0x0028,0x0029,0x002A,0x002B,0x002C,0x002D,0x002E,0x002F, |
47 | | 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037, |
48 | | 0x0038,0x0039,0x003A,0x003B,0x003C,0x003D,0x003E,0x003F, |
49 | | 0x0040,0x0041,0x0042,0x0043,0x0044,0x0045,0x0046,0x0047, |
50 | | 0x0048,0x0049,0x004A,0x004B,0x004C,0x004D,0x004E,0x004F, |
51 | | 0x0050,0x0051,0x0052,0x0053,0x0054,0x0055,0x0056,0x0057, |
52 | | 0x0058,0x0059,0x005A,0x005B,0x005C,0x005D,0x005E,0x005F, |
53 | | 0x0060,0x0061,0x0062,0x0063,0x0064,0x0065,0x0066,0x0067, |
54 | | 0x0068,0x0069,0x006A,0x006B,0x006C,0x006D,0x006E,0x006F, |
55 | | 0x0070,0x0071,0x0072,0x0073,0x0074,0x0075,0x0076,0x0077, |
56 | | 0x0078,0x0079,0x007A,0x007B,0x007C,0x007D,0x007E,0x007F, |
57 | | 0x0402,0x0403,0x201A,0x0453,0x201E,0x2026,0x2020,0x2021, |
58 | | 0x20AC,0x2030,0x0409,0x2039,0x040A,0x040C,0x040B,0x040F, |
59 | | 0x0452,0x2018,0x2019,0x201C,0x201D,0x2022,0x2013,0x2014, |
60 | | 0xFFFF,0x2122,0x0459,0x203A,0x045A,0x045C,0x045B,0x045F, |
61 | | 0x00A0,0x040E,0x045E,0x0408,0x00A4,0x0490,0x00A6,0x00A7, |
62 | | 0x0401,0x00A9,0x0404,0x00AB,0x00AC,0x00AD,0x00AE,0x0407, |
63 | | 0x00B0,0x00B1,0x0406,0x0456,0x0491,0x00B5,0x00B6,0x00B7, |
64 | | 0x0451,0x2116,0x0454,0x00BB,0x0458,0x0405,0x0455,0x0457, |
65 | | 0x0410,0x0411,0x0412,0x0413,0x0414,0x0415,0x0416,0x0417, |
66 | | 0x0418,0x0419,0x041A,0x041B,0x041C,0x041D,0x041E,0x041F, |
67 | | 0x0420,0x0421,0x0422,0x0423,0x0424,0x0425,0x0426,0x0427, |
68 | | 0x0428,0x0429,0x042A,0x042B,0x042C,0x042D,0x042E,0x042F, |
69 | | 0x0430,0x0431,0x0432,0x0433,0x0434,0x0435,0x0436,0x0437, |
70 | | 0x0438,0x0439,0x043A,0x043B,0x043C,0x043D,0x043E,0x043F, |
71 | | 0x0440,0x0441,0x0442,0x0443,0x0444,0x0445,0x0446,0x0447, |
72 | | 0x0448,0x0449,0x044A,0x044B,0x044C,0x044D,0x044E,0x044F |
73 | | }; |
74 | 32 | static MVMuint8 windows1252_cp_to_char(MVMint32 codepoint) { |
75 | 32 | if (8482 < codepoint || codepoint < 0) |
76 | 6 | return '\0'; |
77 | 26 | switch (codepoint) { |
78 | 0 | case 160: return 160; |
79 | 0 | case 161: return 161; |
80 | 0 | case 162: return 162; |
81 | 0 | case 163: return 163; |
82 | 0 | case 164: return 164; |
83 | 0 | case 165: return 165; |
84 | 0 | case 166: return 166; |
85 | 0 | case 167: return 167; |
86 | 0 | case 168: return 168; |
87 | 0 | case 169: return 169; |
88 | 0 | case 170: return 170; |
89 | 0 | case 171: return 171; |
90 | 0 | case 172: return 172; |
91 | 0 | case 173: return 173; |
92 | 0 | case 174: return 174; |
93 | 0 | case 175: return 175; |
94 | 0 | case 176: return 176; |
95 | 0 | case 177: return 177; |
96 | 0 | case 178: return 178; |
97 | 0 | case 179: return 179; |
98 | 0 | case 180: return 180; |
99 | 0 | case 181: return 181; |
100 | 0 | case 182: return 182; |
101 | 0 | case 183: return 183; |
102 | 0 | case 184: return 184; |
103 | 0 | case 185: return 185; |
104 | 0 | case 186: return 186; |
105 | 0 | case 187: return 187; |
106 | 0 | case 188: return 188; |
107 | 0 | case 189: return 189; |
108 | 0 | case 190: return 190; |
109 | 0 | case 191: return 191; |
110 | 0 | case 192: return 192; |
111 | 0 | case 193: return 193; |
112 | 0 | case 194: return 194; |
113 | 0 | case 195: return 195; |
114 | 0 | case 196: return 196; |
115 | 0 | case 197: return 197; |
116 | 0 | case 198: return 198; |
117 | 0 | case 199: return 199; |
118 | 0 | case 200: return 200; |
119 | 0 | case 201: return 201; |
120 | 0 | case 202: return 202; |
121 | 0 | case 203: return 203; |
122 | 0 | case 204: return 204; |
123 | 0 | case 205: return 205; |
124 | 0 | case 206: return 206; |
125 | 0 | case 207: return 207; |
126 | 0 | case 208: return 208; |
127 | 0 | case 209: return 209; |
128 | 0 | case 210: return 210; |
129 | 0 | case 211: return 211; |
130 | 0 | case 212: return 212; |
131 | 0 | case 213: return 213; |
132 | 0 | case 214: return 214; |
133 | 0 | case 215: return 215; |
134 | 0 | case 216: return 216; |
135 | 0 | case 217: return 217; |
136 | 0 | case 218: return 218; |
137 | 0 | case 219: return 219; |
138 | 0 | case 220: return 220; |
139 | 0 | case 221: return 221; |
140 | 0 | case 222: return 222; |
141 | 0 | case 223: return 223; |
142 | 0 | case 224: return 224; |
143 | 0 | case 225: return 225; |
144 | 0 | case 226: return 226; |
145 | 0 | case 227: return 227; |
146 | 0 | case 228: return 228; |
147 | 6 | case 229: return 229; |
148 | 0 | case 230: return 230; |
149 | 0 | case 231: return 231; |
150 | 0 | case 232: return 232; |
151 | 0 | case 233: return 233; |
152 | 0 | case 234: return 234; |
153 | 0 | case 235: return 235; |
154 | 0 | case 236: return 236; |
155 | 0 | case 237: return 237; |
156 | 0 | case 238: return 238; |
157 | 0 | case 239: return 239; |
158 | 0 | case 240: return 240; |
159 | 0 | case 241: return 241; |
160 | 0 | case 242: return 242; |
161 | 0 | case 243: return 243; |
162 | 0 | case 244: return 244; |
163 | 0 | case 245: return 245; |
164 | 0 | case 246: return 246; |
165 | 0 | case 247: return 247; |
166 | 0 | case 248: return 248; |
167 | 0 | case 249: return 249; |
168 | 0 | case 250: return 250; |
169 | 0 | case 251: return 251; |
170 | 0 | case 252: return 252; |
171 | 0 | case 253: return 253; |
172 | 0 | case 254: return 254; |
173 | 0 | case 255: return 255; |
174 | 1 | case 338: return 140; |
175 | 0 | case 339: return 156; |
176 | 1 | case 352: return 138; |
177 | 0 | case 353: return 154; |
178 | 0 | case 376: return 159; |
179 | 1 | case 381: return 142; |
180 | 0 | case 382: return 158; |
181 | 1 | case 402: return 131; |
182 | 1 | case 710: return 136; |
183 | 0 | case 732: return 152; |
184 | 0 | case 8211: return 150; |
185 | 0 | case 8212: return 151; |
186 | 0 | case 8216: return 145; |
187 | 0 | case 8217: return 146; |
188 | 1 | case 8218: return 130; |
189 | 0 | case 8220: return 147; |
190 | 0 | case 8221: return 148; |
191 | 1 | case 8222: return 132; |
192 | 1 | case 8224: return 134; |
193 | 1 | case 8225: return 135; |
194 | 0 | case 8226: return 149; |
195 | 1 | case 8230: return 133; |
196 | 1 | case 8240: return 137; |
197 | 1 | case 8249: return 139; |
198 | 0 | case 8250: return 155; |
199 | 1 | case 8364: return 128; |
200 | 0 | case 8482: return 153; |
201 | 7 | default: return '\0'; |
202 | 0 | }; |
203 | 0 | } |
204 | 0 | static MVMuint8 windows1251_cp_to_char(MVMint32 codepoint) { |
205 | 0 | if (8482 < codepoint || codepoint < 0) |
206 | 0 | return '\0'; |
207 | 0 | switch (codepoint) { |
208 | 0 | case 160: return 160; |
209 | 0 | case 164: return 164; |
210 | 0 | case 166: return 166; |
211 | 0 | case 167: return 167; |
212 | 0 | case 169: return 169; |
213 | 0 | case 171: return 171; |
214 | 0 | case 172: return 172; |
215 | 0 | case 173: return 173; |
216 | 0 | case 174: return 174; |
217 | 0 | case 176: return 176; |
218 | 0 | case 177: return 177; |
219 | 0 | case 181: return 181; |
220 | 0 | case 182: return 182; |
221 | 0 | case 183: return 183; |
222 | 0 | case 187: return 187; |
223 | 0 | case 1025: return 168; |
224 | 0 | case 1026: return 128; |
225 | 0 | case 1027: return 129; |
226 | 0 | case 1028: return 170; |
227 | 0 | case 1029: return 189; |
228 | 0 | case 1030: return 178; |
229 | 0 | case 1031: return 175; |
230 | 0 | case 1032: return 163; |
231 | 0 | case 1033: return 138; |
232 | 0 | case 1034: return 140; |
233 | 0 | case 1035: return 142; |
234 | 0 | case 1036: return 141; |
235 | 0 | case 1038: return 161; |
236 | 0 | case 1039: return 143; |
237 | 0 | case 1040: return 192; |
238 | 0 | case 1041: return 193; |
239 | 0 | case 1042: return 194; |
240 | 0 | case 1043: return 195; |
241 | 0 | case 1044: return 196; |
242 | 0 | case 1045: return 197; |
243 | 0 | case 1046: return 198; |
244 | 0 | case 1047: return 199; |
245 | 0 | case 1048: return 200; |
246 | 0 | case 1049: return 201; |
247 | 0 | case 1050: return 202; |
248 | 0 | case 1051: return 203; |
249 | 0 | case 1052: return 204; |
250 | 0 | case 1053: return 205; |
251 | 0 | case 1054: return 206; |
252 | 0 | case 1055: return 207; |
253 | 0 | case 1056: return 208; |
254 | 0 | case 1057: return 209; |
255 | 0 | case 1058: return 210; |
256 | 0 | case 1059: return 211; |
257 | 0 | case 1060: return 212; |
258 | 0 | case 1061: return 213; |
259 | 0 | case 1062: return 214; |
260 | 0 | case 1063: return 215; |
261 | 0 | case 1064: return 216; |
262 | 0 | case 1065: return 217; |
263 | 0 | case 1066: return 218; |
264 | 0 | case 1067: return 219; |
265 | 0 | case 1068: return 220; |
266 | 0 | case 1069: return 221; |
267 | 0 | case 1070: return 222; |
268 | 0 | case 1071: return 223; |
269 | 0 | case 1072: return 224; |
270 | 0 | case 1073: return 225; |
271 | 0 | case 1074: return 226; |
272 | 0 | case 1075: return 227; |
273 | 0 | case 1076: return 228; |
274 | 0 | case 1077: return 229; |
275 | 0 | case 1078: return 230; |
276 | 0 | case 1079: return 231; |
277 | 0 | case 1080: return 232; |
278 | 0 | case 1081: return 233; |
279 | 0 | case 1082: return 234; |
280 | 0 | case 1083: return 235; |
281 | 0 | case 1084: return 236; |
282 | 0 | case 1085: return 237; |
283 | 0 | case 1086: return 238; |
284 | 0 | case 1087: return 239; |
285 | 0 | case 1088: return 240; |
286 | 0 | case 1089: return 241; |
287 | 0 | case 1090: return 242; |
288 | 0 | case 1091: return 243; |
289 | 0 | case 1092: return 244; |
290 | 0 | case 1093: return 245; |
291 | 0 | case 1094: return 246; |
292 | 0 | case 1095: return 247; |
293 | 0 | case 1096: return 248; |
294 | 0 | case 1097: return 249; |
295 | 0 | case 1098: return 250; |
296 | 0 | case 1099: return 251; |
297 | 0 | case 1100: return 252; |
298 | 0 | case 1101: return 253; |
299 | 0 | case 1102: return 254; |
300 | 0 | case 1103: return 255; |
301 | 0 | case 1105: return 184; |
302 | 0 | case 1106: return 144; |
303 | 0 | case 1107: return 131; |
304 | 0 | case 1108: return 186; |
305 | 0 | case 1109: return 190; |
306 | 0 | case 1110: return 179; |
307 | 0 | case 1111: return 191; |
308 | 0 | case 1112: return 188; |
309 | 0 | case 1113: return 154; |
310 | 0 | case 1114: return 156; |
311 | 0 | case 1115: return 158; |
312 | 0 | case 1116: return 157; |
313 | 0 | case 1118: return 162; |
314 | 0 | case 1119: return 159; |
315 | 0 | case 1168: return 165; |
316 | 0 | case 1169: return 180; |
317 | 0 | case 8211: return 150; |
318 | 0 | case 8212: return 151; |
319 | 0 | case 8216: return 145; |
320 | 0 | case 8217: return 146; |
321 | 0 | case 8218: return 130; |
322 | 0 | case 8220: return 147; |
323 | 0 | case 8221: return 148; |
324 | 0 | case 8222: return 132; |
325 | 0 | case 8224: return 134; |
326 | 0 | case 8225: return 135; |
327 | 0 | case 8226: return 149; |
328 | 0 | case 8230: return 133; |
329 | 0 | case 8240: return 137; |
330 | 0 | case 8249: return 139; |
331 | 0 | case 8250: return 155; |
332 | 0 | case 8364: return 136; |
333 | 0 | case 8470: return 185; |
334 | 0 | case 8482: return 153; |
335 | 0 | default: return '\0'; |
336 | 0 | }; |
337 | 0 | } |
338 | | |
339 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
340 | | * buffers, or until a stopper is reached. */ |
341 | | MVMuint32 MVM_string_windows125X_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
342 | | const MVMint32 *stopper_chars, |
343 | | MVMDecodeStreamSeparators *seps, |
344 | 0 | const MVMuint16 *codetable) { |
345 | 0 | MVMint32 count = 0, total = 0; |
346 | 0 | MVMint32 bufsize; |
347 | 0 | MVMGrapheme32 *buffer = NULL; |
348 | 0 | MVMDecodeStreamBytes *cur_bytes = NULL; |
349 | 0 | MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head; |
350 | 0 | MVMint32 last_accept_pos, last_was_cr; |
351 | 0 | MVMuint32 reached_stopper; |
352 | 0 | MVMStringIndex repl_length = ds->replacement ? MVM_string_graphs(tc, ds->replacement) : 0; |
353 | 0 | MVMStringIndex repl_pos = 0; |
354 | 0 |
|
355 | 0 | /* If there's no buffers, we're done. */ |
356 | 0 | if (!ds->bytes_head) |
357 | 0 | return 0; |
358 | 0 | last_accept_pos = ds->bytes_head_pos; |
359 | 0 |
|
360 | 0 | /* If we're asked for zero chars, also done. */ |
361 | 0 | if (stopper_chars && *stopper_chars == 0) |
362 | 0 | return 1; |
363 | 0 |
|
364 | 0 | bufsize = ds->result_size_guess; |
365 | 0 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
366 | 0 |
|
367 | 0 | /* Decode each of the buffers. */ |
368 | 0 | cur_bytes = ds->bytes_head; |
369 | 0 | last_was_cr = 0; |
370 | 0 | reached_stopper = 0; |
371 | 0 | while (cur_bytes) { |
372 | 0 | /* Process this buffer. */ |
373 | 0 | MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0; |
374 | 0 | unsigned char *bytes = (unsigned char *)cur_bytes->bytes; |
375 | 0 | while (pos < cur_bytes->length || repl_pos) { |
376 | 0 | MVMGrapheme32 graph; |
377 | 0 | MVMCodepoint codepoint = codetable[bytes[pos++]]; |
378 | 0 | if (repl_pos) { |
379 | 0 | graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos++); |
380 | 0 | if (repl_length <= repl_pos) repl_pos = 0; |
381 | 0 | pos--; |
382 | 0 | } |
383 | 0 | else if (codepoint == UNMAPPED) { |
384 | 0 | if (MVM_ENCODING_CONFIG_STRICT(ds->config)) { |
385 | 0 | if (ds->replacement) { |
386 | 0 | graph = MVM_string_get_grapheme_at_nocheck(tc, ds->replacement, repl_pos); |
387 | 0 | /* If the replacement is more than one grapheme we need |
388 | 0 | * to set repl_pos++ so we will grab the next grapheme on |
389 | 0 | * the next loop */ |
390 | 0 | if (1 < repl_length) repl_pos++; |
391 | 0 | } |
392 | 0 | else { |
393 | 0 | /* Throw if it's unmapped */ |
394 | 0 | char *enc_name = codetable == windows1252_codepoints |
395 | 0 | ? "Windows-1252" : "Windows-1251"; |
396 | 0 | MVM_free(buffer); |
397 | 0 | MVM_exception_throw_adhoc(tc, |
398 | 0 | "Error decoding %s string: could not decode codepoint %d", |
399 | 0 | enc_name, |
400 | 0 | bytes[pos - 1]); |
401 | 0 | } |
402 | 0 | } |
403 | 0 | else { |
404 | 0 | /* Set it without translating, even though it creates |
405 | 0 | * standards uncompliant results */ |
406 | 0 | graph = bytes[pos-1]; |
407 | 0 | } |
408 | 0 | } |
409 | 0 | else if (last_was_cr) { |
410 | 0 | if (codepoint == '\n') { |
411 | 0 | graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm)); |
412 | 0 | } |
413 | 0 | else { |
414 | 0 | graph = '\r'; |
415 | 0 | pos--; |
416 | 0 | } |
417 | 0 | last_was_cr = 0; |
418 | 0 | } |
419 | 0 | else if (codepoint == '\r') { |
420 | 0 | last_was_cr = 1; |
421 | 0 | continue; |
422 | 0 | } |
423 | 0 | else { |
424 | 0 | graph = codepoint; |
425 | 0 | } |
426 | 0 | if (count == bufsize) { |
427 | 0 | /* We filled the buffer. Attach this one to the buffers |
428 | 0 | * linked list, and continue with a new one. */ |
429 | 0 | MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize); |
430 | 0 | buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32)); |
431 | 0 | count = 0; |
432 | 0 | } |
433 | 0 | buffer[count++] = graph; |
434 | 0 | last_accept_bytes = cur_bytes; |
435 | 0 | last_accept_pos = pos; |
436 | 0 | total++; |
437 | 0 | if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) { |
438 | 0 | reached_stopper = 1; |
439 | 0 | goto done; |
440 | 0 | } |
441 | 0 | else if (stopper_chars && *stopper_chars == total) { |
442 | 0 | reached_stopper = 1; |
443 | 0 | goto done; |
444 | 0 | } |
445 | 0 | } |
446 | 0 | cur_bytes = cur_bytes->next; |
447 | 0 | } |
448 | 0 | done: |
449 | 0 |
|
450 | 0 | /* Attach what we successfully parsed as a result buffer, and trim away |
451 | 0 | * what we chewed through. */ |
452 | 0 | if (count) { |
453 | 0 | MVM_string_decodestream_add_chars(tc, ds, buffer, count); |
454 | 0 | } |
455 | 0 | else { |
456 | 0 | MVM_free(buffer); |
457 | 0 | } |
458 | 0 | MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos); |
459 | 0 |
|
460 | 0 | return reached_stopper; |
461 | 0 | } |
462 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
463 | | * buffers, or until a stopper is reached. */ |
464 | | MVMuint32 MVM_string_windows1252_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
465 | | const MVMint32 *stopper_chars, |
466 | 0 | MVMDecodeStreamSeparators *seps) { |
467 | 0 | return MVM_string_windows125X_decodestream(tc, ds, stopper_chars, seps, windows1252_codepoints); |
468 | 0 | } |
469 | | /* Decodes using a decodestream. Decodes as far as it can with the input |
470 | | * buffers, or until a stopper is reached. */ |
471 | | MVMuint32 MVM_string_windows1251_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, |
472 | | const MVMint32 *stopper_chars, |
473 | 0 | MVMDecodeStreamSeparators *seps) { |
474 | 0 | return MVM_string_windows125X_decodestream(tc, ds, stopper_chars, seps, windows1251_codepoints); |
475 | 0 | } |
476 | | |
477 | | /* Decodes the specified number of bytes of windows1252 into an NFG string, |
478 | | * creating a result of the specified type. The type must have the MVMString |
479 | | * REPR. */ |
480 | | MVMString * MVM_string_windows125X_decode(MVMThreadContext *tc, |
481 | | const MVMObject *result_type, char *windows125X_c, size_t bytes, |
482 | 7 | MVMString *replacement, const MVMuint16 *codetable, MVMint64 config) { |
483 | 7 | MVMuint8 *windows125X = (MVMuint8 *)windows125X_c; |
484 | 7 | MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type)); |
485 | 7 | size_t pos, result_graphs, additional_bytes = 0; |
486 | 5 | MVMStringIndex repl_length = replacement ? MVM_string_graphs(tc, replacement) : 0; |
487 | 7 | |
488 | 7 | result->body.storage_type = MVM_STRING_GRAPHEME_32; |
489 | 7 | result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes); |
490 | 7 | |
491 | 7 | result_graphs = 0; |
492 | 67 | for (pos = 0; pos < bytes; pos++) { |
493 | 60 | MVMGrapheme32 codepoint; |
494 | 60 | if (windows125X[pos] == '\r' && pos + 1 < bytes && windows125X[pos + 1] == '\n') { |
495 | 0 | codepoint = MVM_nfg_crlf_grapheme(tc); |
496 | 0 | pos++; |
497 | 0 | } |
498 | 60 | else { |
499 | 60 | codepoint = codetable[windows125X[pos]]; |
500 | 60 | if (codepoint == UNMAPPED) { |
501 | 6 | /* Since things we are decoding always fit into Unicode, if we are |
502 | 6 | * using a replacement, it won't get used unless we use strict */ |
503 | 6 | if (replacement && MVM_ENCODING_CONFIG_STRICT(config)) { |
504 | 3 | int i = 0; |
505 | 3 | /* Only triggered if repl_length > 1. Copies all but the last |
506 | 3 | * grapheme in the replacement string */ |
507 | 3 | if (1 < repl_length) { |
508 | 3 | additional_bytes += repl_length - 1; |
509 | 3 | result->body.storage.blob_32 = realloc(result->body.storage.blob_32, |
510 | 3 | sizeof(MVMGrapheme32) * (additional_bytes + bytes)); |
511 | 15 | for (; i < repl_length - 1; i++) { |
512 | 12 | MVMGrapheme32 graph = MVM_string_get_grapheme_at(tc, replacement, i); |
513 | 12 | result->body.storage.blob_32[result_graphs++] = graph; |
514 | 12 | } |
515 | 3 | } |
516 | 3 | /* Now we set `codepoint` to the last grapheme in the replacement |
517 | 3 | * and proceed normally from here. */ |
518 | 3 | codepoint = MVM_string_get_grapheme_at(tc, replacement, i); |
519 | 3 | } |
520 | 3 | else if (MVM_ENCODING_CONFIG_STRICT(config)) { |
521 | 0 | /* Throw an exception if that codepoint has no mapping */ |
522 | 0 | char *enc_name = codetable == windows1252_codepoints |
523 | 0 | ? "Windows-1252" : "Windows-1251"; |
524 | 0 | MVM_exception_throw_adhoc(tc, |
525 | 0 | "Error decoding %s string: could not decode codepoint %d", |
526 | 0 | enc_name, |
527 | 0 | windows125X[pos]); |
528 | 0 | } |
529 | 3 | else { |
530 | 3 | /* Don't convert and just map to identical. This creates |
531 | 3 | * standards uncompliant results, but will decode buggy |
532 | 3 | * input */ |
533 | 3 | codepoint = windows125X[pos]; |
534 | 3 | } |
535 | 6 | } |
536 | 60 | } |
537 | 60 | result->body.storage.blob_32[result_graphs++] = codepoint; |
538 | 60 | } |
539 | 7 | result->body.num_graphs = result_graphs; |
540 | 7 | |
541 | 7 | return result; |
542 | 7 | } |
543 | | MVMString * MVM_string_windows1252_decode(MVMThreadContext *tc, |
544 | 0 | const MVMObject *result_type, char *windows125X_c, size_t bytes) { |
545 | 0 | return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, NULL, windows1252_codepoints, MVM_ENCODING_PERMISSIVE); |
546 | 0 | } |
547 | | MVMString * MVM_string_windows1251_decode(MVMThreadContext *tc, |
548 | 0 | const MVMObject *result_type, char *windows125X_c, size_t bytes) { |
549 | 0 | return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, NULL, windows1251_codepoints, MVM_ENCODING_PERMISSIVE); |
550 | 0 | } |
551 | | MVMString * MVM_string_windows1252_decode_config(MVMThreadContext *tc, |
552 | 7 | const MVMObject *result_type, char *windows125X_c, size_t bytes, MVMString *replacement, MVMint64 config) { |
553 | 7 | return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, replacement, windows1252_codepoints, config); |
554 | 7 | } |
555 | | MVMString * MVM_string_windows1251_decode_config(MVMThreadContext *tc, |
556 | 0 | const MVMObject *result_type, char *windows125X_c, size_t bytes, MVMString *replacement, MVMint64 config) { |
557 | 0 | return MVM_string_windows125X_decode(tc, result_type, windows125X_c, bytes, replacement, windows1251_codepoints, config); |
558 | 0 | } |
559 | | /* Encodes the specified substring to Windows-1252 or Windows-1251. It is passed |
560 | | * in the encoding, as well as the function that resolves Unicode to the result |
561 | | * encoding. Anything not in range will cause an exception unless a replacement |
562 | | * string is supplied. The result string is NULL terminated, but the specified |
563 | | * size is the non-null part. */ |
564 | | char * MVM_string_windows125X_encode_substr(MVMThreadContext *tc, MVMString *str, |
565 | | MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, |
566 | 13 | MVMint32 translate_newlines, MVMuint8(*cp_to_char)(MVMint32), MVMint64 config) { |
567 | 13 | /* Windows-1252 and Windows-1251 are single byte encodings, so each grapheme |
568 | 13 | * will just become a single byte. */ |
569 | 13 | MVMuint32 startu = (MVMuint32)start; |
570 | 13 | MVMStringIndex strgraphs = MVM_string_graphs(tc, str); |
571 | 9 | MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length); |
572 | 13 | MVMuint8 *result = NULL; |
573 | 13 | size_t result_alloc; |
574 | 13 | MVMuint8 *repl_bytes = NULL; |
575 | 13 | MVMuint64 repl_length; |
576 | 13 | |
577 | 13 | /* must check start first since it's used in the length check */ |
578 | 13 | if (start < 0 || strgraphs < start) |
579 | 0 | MVM_exception_throw_adhoc(tc, "start out of range"); |
580 | 13 | if (length < -1 || strgraphs < start + lengthu) |
581 | 0 | MVM_exception_throw_adhoc(tc, "length out of range"); |
582 | 13 | |
583 | 13 | if (replacement) |
584 | 4 | repl_bytes = (MVMuint8 *) MVM_string_windows125X_encode_substr(tc, |
585 | 4 | replacement, &repl_length, 0, -1, NULL, translate_newlines, cp_to_char, config); |
586 | 13 | |
587 | 13 | result_alloc = lengthu; |
588 | 13 | result = MVM_malloc(result_alloc + 1); |
589 | 13 | if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) { |
590 | 0 | /* No encoding needed; directly copy. */ |
591 | 0 | memcpy(result, str->body.storage.blob_ascii, lengthu); |
592 | 0 | result[lengthu] = 0; |
593 | 0 | if (output_size) |
594 | 0 | *output_size = lengthu; |
595 | 0 | } |
596 | 13 | else { |
597 | 13 | MVMuint32 pos = 0; |
598 | 13 | MVMCodepointIter ci; |
599 | 13 | MVM_string_ci_init(tc, &ci, str, translate_newlines, 0); |
600 | 87 | while (MVM_string_ci_has_more(tc, &ci)) { |
601 | 74 | MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci); |
602 | 74 | if (result_alloc <= pos) { |
603 | 3 | result_alloc += 8; |
604 | 3 | result = MVM_realloc(result, result_alloc + 1); |
605 | 3 | } |
606 | 74 | /* If it's within ASCII just pass it through */ |
607 | 74 | if (0 <= codepoint && codepoint <= 127) { |
608 | 42 | result[pos] = (MVMuint8)codepoint; |
609 | 42 | pos++; |
610 | 42 | } |
611 | 32 | else if ((result[pos] = cp_to_char(codepoint)) != '\0') { |
612 | 19 | pos++; |
613 | 19 | } |
614 | 32 | /* If we have a replacement and are we either have it set to strict, |
615 | 32 | * or the codepoint can't fit within one byte, insert a replacement */ |
616 | 13 | else if (replacement && (MVM_ENCODING_CONFIG_STRICT(config) || codepoint < 0 || 255 < codepoint)) { |
617 | 6 | if (result_alloc <= pos + repl_length) { |
618 | 3 | result_alloc += repl_length; |
619 | 3 | result = MVM_realloc(result, result_alloc + 1); |
620 | 3 | } |
621 | 6 | memcpy(result + pos, repl_bytes, repl_length); |
622 | 6 | pos += repl_length; |
623 | 6 | } |
624 | 7 | else { |
625 | 7 | /* If we're decoding strictly or the codepoint cannot fit in |
626 | 7 | * one byte, throw an exception */ |
627 | 7 | if (MVM_ENCODING_CONFIG_STRICT(config) || codepoint < 0 || 255 < codepoint) { |
628 | 2 | char *enc_name = cp_to_char == windows1252_cp_to_char |
629 | 2 | ? "Windows-1252" : "Windows-1251"; |
630 | 2 | MVM_free(result); |
631 | 2 | MVM_free(repl_bytes); |
632 | 2 | MVM_exception_throw_adhoc(tc, |
633 | 2 | "Error encoding %s string: could not encode codepoint %d", |
634 | 2 | enc_name, |
635 | 2 | codepoint); |
636 | 2 | } |
637 | 7 | /* It fits in one byte and we're not decoding strictly, so pass |
638 | 7 | * it through unchanged */ |
639 | 5 | else { |
640 | 5 | result[pos++] = codepoint; |
641 | 5 | } |
642 | 7 | } |
643 | 74 | } |
644 | 13 | result[pos] = 0; |
645 | 13 | if (output_size) |
646 | 11 | *output_size = pos; |
647 | 13 | } |
648 | 13 | |
649 | 13 | MVM_free(repl_bytes); |
650 | 13 | return (char *)result; |
651 | 13 | } |
652 | | char * MVM_string_windows1252_encode_substr(MVMThreadContext *tc, MVMString *str, |
653 | | MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, |
654 | 0 | MVMint32 translate_newlines) { |
655 | 0 | return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1252_cp_to_char, MVM_ENCODING_PERMISSIVE); |
656 | 0 | } |
657 | | char * MVM_string_windows1251_encode_substr(MVMThreadContext *tc, MVMString *str, |
658 | | MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, |
659 | 0 | MVMint32 translate_newlines) { |
660 | 0 | return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1251_cp_to_char, MVM_ENCODING_PERMISSIVE); |
661 | 0 | } |
662 | | char * MVM_string_windows1252_encode_substr_config(MVMThreadContext *tc, MVMString *str, |
663 | | MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, |
664 | 9 | MVMint32 translate_newlines, MVMint64 config) { |
665 | 9 | return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1252_cp_to_char, config); |
666 | 9 | } |
667 | | char * MVM_string_windows1251_encode_substr_config(MVMThreadContext *tc, MVMString *str, |
668 | | MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, |
669 | 0 | MVMint32 translate_newlines, MVMint64 config) { |
670 | 0 | return MVM_string_windows125X_encode_substr(tc, str, output_size, start, length, replacement, translate_newlines, windows1251_cp_to_char, config); |
671 | 0 | } |