/home/travis/build/MoarVM/MoarVM/src/strings/utf8.c

Source (jump to first uncovered line)
#include "moar.h"

/* The below section has an MIT-style license, included here.

// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject
 * to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

/* Combined lookup table for the UTF-8 decoding automaton driven by
 * decode_utf8_byte. Entries 0..255 give the character class of each possible
 * byte value; the entries from index 256 onwards are the state transition
 * table, looked up as utf8d[256 + state + class]. There are 12 character
 * classes (0..11), and every state stored in the table is a multiple of 12,
 * i.e. the offset of that state's row. */
static const MVMuint8 utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and creates bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // State 0 is UTF8_ACCEPT and state 12 is UTF8_REJECT.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
  12,36,12,12,12,12,12,12,12,12,12,12,
};

/* Feeds a single byte into the UTF-8 decoding automaton.
 *
 * state - in/out automaton state; UTF8_ACCEPT (0) before the first byte.
 * codep - in/out accumulator for the codepoint being assembled.
 * byte  - the next input byte.
 *
 * Returns the new state, which is also stored in *state: UTF8_ACCEPT when
 * *codep holds a complete codepoint, UTF8_REJECT on malformed input, and any
 * other value while a multi-byte sequence is still in progress. */
static MVMint32
decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) {
  const MVMint32 byte_class = utf8d[byte];

  if (*state == UTF8_ACCEPT) {
    /* First byte of a sequence: mask off the length-marker bits. */
    *codep = (0xff >> byte_class) & (byte);
  }
  else {
    /* Subsequent byte: shift in its six payload bits. */
    *codep = (byte & 0x3fu) | (*codep << 6);
  }

  *state = utf8d[256 + *state + byte_class];
  return *state;
}
/* end Bjoern Hoehrmann section (some things were changed from the original) */

/* begin not_gerd section (modified from original)
// Copyright 2012 not_gerd
// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122

Permission is granted to use, modify, and / or redistribute at will.

This includes removing authorship notices, re-use of code parts in
other software (with or without giving credit), and / or creating a
commercial product based on it.

This permission is not revocable by the author.

This software is provided as-is. Use it at your own risk. There is
no warranty whatsoever, neither expressed nor implied, and by using
this software you accept that the author(s) shall not be held liable
for any loss of data, loss of service, or other damages, be they
incidental or consequential. Your only option other than accepting
this is not to use the software at all.
*/

/* Bit flags describing a codepoint. A classification combines one CP_*
 * category flag with one U8_* flag giving the length of the UTF-8 encoding. */
enum {
    /* Codepoint category. */
    CP_CHAR            = 0x001,
    CP_LOW_SURROGATE   = 0x002,
    CP_HIGH_SURROGATE  = 0x004,
    CP_NONCHAR         = 0x008,
    CP_OVERFLOW        = 0x010,

    /* Number of bytes in the UTF-8 encoding. */
    U8_SINGLE          = 0x020,
    U8_DOUBLE          = 0x040,
    U8_TRIPLE          = 0x080,
    U8_QUAD            = 0x100
};

/* Classifies a codepoint, returning one CP_* category flag combined with one
 * U8_* flag for the number of bytes its UTF-8 encoding needs. Returns 0 for
 * values above 0x1FFFFF. */
static unsigned classify(MVMCodepoint cp) {
    /* One- and two-byte encodings. */
    if (cp < 0x80)
        return U8_SINGLE | CP_CHAR;
    if (cp < 0x0800)
        return U8_DOUBLE | CP_CHAR;

    /* Three-byte encodings: the remainder of the range up to 0xFFFF. */
    if (cp <= 0xFFFF) {
        if (cp >= 0xD800 && cp <= 0xDBFF)
            return U8_TRIPLE | CP_HIGH_SURROGATE;
        if (cp >= 0xDC00 && cp <= 0xDFFF)
            return U8_TRIPLE | CP_LOW_SURROGATE;
        if (cp >= 0xFDD0 && cp <= 0xFDEF)
            return U8_TRIPLE | CP_NONCHAR;
        if (cp >= 0xFFFE)
            return U8_TRIPLE | CP_NONCHAR;
        return U8_TRIPLE | CP_CHAR;
    }

    /* Four-byte encodings: values whose low 16 bits are 0xFFFE or 0xFFFF
     * are noncharacters. */
    if (cp <= 0x10FFFF) {
        const MVMCodepoint low16 = cp & 0xFFFF;
        if (low16 >= 0xFFFE)
            return U8_QUAD | CP_NONCHAR;
        return U8_QUAD | CP_CHAR;
    }

    /* Fits in four bytes, but beyond the last valid codepoint. */
    if (cp <= 0x1FFFFF)
        return U8_QUAD | CP_OVERFLOW;

    return 0;
}

/* Writes the UTF-8 encoding of cp to bp, which must have room for up to 4
 * bytes. Returns the number of bytes written, or 0 (writing nothing) when
 * classify() does not report the codepoint as CP_CHAR or CP_NONCHAR, which
 * covers surrogates and out-of-range values. */
static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
    const unsigned flags = classify(cp);

    if ((flags & (CP_CHAR | CP_NONCHAR)) == 0)
        return 0;

    if (flags & U8_SINGLE) {
        *bp = (MVMuint8)cp;
        return 1;
    }

    if (flags & U8_DOUBLE) {
        /* 110xxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xC0 | (cp >> 6));
        bp[1] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 2;
    }

    if (flags & U8_TRIPLE) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xE0 | (cp >> 12));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 3;
    }

    if (flags & U8_QUAD) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        bp[0] = (MVMuint8)(0xF0 | (cp >> 18));
        bp[1] = (MVMuint8)(0x80 | ((cp >> 12) & 0x3F));
        bp[2] = (MVMuint8)(0x80 | ((cp >> 6) & 0x3F));
        bp[3] = (MVMuint8)(0x80 | (cp & 0x3F));
        return 4;
    }

    return 0;
}

 /* end not_gerd section */

#define UTF8_MAXINC (32 * 1024 * 1024)

/* Decodes the specified number of bytes of utf8 into an NFG string, creating
 * a result of the specified type. The type must have the MVMString REPR. */
/* Decodes `bytes` bytes of UTF-8 starting at `utf8` into a new string of the
 * given result type, normalizing to NFG along the way. Throws an adhoc
 * exception if the input is malformed or ends in the middle of a sequence.
 * The graphemes are stored 8 bits wide when they all fit, 32 bits otherwise. */
MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
    MVMString     *result        = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    MVMint32       bufsize       = bytes;
    MVMGrapheme32 *buffer        = MVM_malloc(sizeof(MVMGrapheme32) * bufsize);
    MVMint32       count         = 0;
    MVMint32       state         = 0;
    MVMint32       line_ending   = 0;
    MVMGrapheme32  lowest_graph  =  0x7fffffff;
    MVMGrapheme32  highest_graph = -0x7fffffff;
    const size_t   orig_bytes    = bytes;
    const char    *orig_utf8     = utf8;
    MVMCodepoint   codepoint;
    MVMint32       ready;
    MVMNormalizer  norm;

    /* Graphemes are produced by an NFG normalizer fed with the codepoints. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);

    for (; bytes; ++utf8, --bytes) {
        const MVMint32 decoded = decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8);

        if (decoded == UTF8_ACCEPT) {
            /* A complete codepoint; hand it to the normalizer. */
            MVMGrapheme32 g;
            ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g);
            if (ready) {
                /* Make room for everything the normalizer has for us. */
                while (count + ready >= bufsize) {
                    if (bufsize >= UTF8_MAXINC)
                        bufsize += UTF8_MAXINC;
                    else
                        bufsize *= 2;
                    buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * bufsize);
                }
                /* The first grapheme came back in g; the rest are fetched
                 * from the normalizer one at a time. */
                for (;;) {
                    buffer[count++] = g;
                    if (g < lowest_graph)
                        lowest_graph = g;
                    if (g > highest_graph)
                        highest_graph = g;
                    if (--ready <= 0)
                        break;
                    g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
                }
            }
        }
        else if (decoded == UTF8_REJECT) {
            /* Malformed input. Decode again from the start, this time
             * counting lines and columns for the error message. */
            MVMint32 line = 1;
            MVMint32 col  = 1;

            MVM_unicode_normalizer_cleanup(tc, &norm); /* We are about to throw. */
            bytes = orig_bytes;
            utf8  = orig_utf8;
            state = 0;

            for (; bytes; ++utf8, --bytes) {
                const MVMint32 replayed = decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8);

                if (replayed == UTF8_ACCEPT) {
                    if (codepoint == 10 || codepoint == 13) {
                        if (!line_ending) {
                            /* The first line ending seen decides which of
                             * CR / LF starts a new line from here on. */
                            line_ending = codepoint;
                            col = 1;
                            line++;
                        }
                        else if (codepoint == line_ending) {
                            /* First or only part of a later line ending. */
                            col = 1;
                            line++;
                        }
                        /* Otherwise: second part of a line ending; ignore. */
                    }
                    else {
                        /* Any other codepoint advances the column. */
                        col++;
                    }
                }
                else if (replayed == UTF8_REJECT) {
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col);
                }
            }

            /* The second pass found no error, so the input changed under us. */
            MVM_free(buffer);
            MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!");
        }
    }

    /* Input that stops part-way through a multi-byte sequence is an error. */
    if (state != UTF8_ACCEPT) {
        MVM_unicode_normalizer_cleanup(tc, &norm);
        MVM_free(buffer);
        MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string");
    }

    /* Signal end of input and drain whatever the normalizer still holds. */
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    if (ready) {
        if (count + ready >= bufsize)
            buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready));
        for (; ready; ready--) {
            const MVMGrapheme32 g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
            if (g < lowest_graph)
                lowest_graph = g;
            if (g > highest_graph)
                highest_graph = g;
            buffer[count++] = g;
        }
    }
    MVM_unicode_normalizer_cleanup(tc, &norm);

    if (lowest_graph >= -128 && highest_graph < 128) {
        /* Every grapheme is in -128..127, so store them 8 bits wide. */
        MVMGrapheme8 *narrow = MVM_malloc(sizeof(MVMGrapheme8) * count);
        MVMint32 i;
        for (i = 0; i < count; i++)
            narrow[i] = buffer[i];
        MVM_free(buffer);
        result->body.storage.blob_8  = narrow;
        result->body.storage_type    = MVM_STRING_GRAPHEME_8;
    }
    else {
        /* Hand the 32-bit buffer to the string, trimming it first when
         * more than 4 slots would otherwise go unused. */
        if (bufsize - count > 4)
            buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32));
        result->body.storage.blob_32 = buffer;
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
    }
    result->body.num_graphs = count;

    return result;
}

/* Returns non-zero if the three bytes at utf8 are the UTF-8 byte order mark
 * (EF BB BF). The caller must make sure three bytes are readable. */
static MVMint32 its_the_bom(const char *utf8) {
    const MVMuint8 *b = (const MVMuint8 *)utf8;

    if (b[0] != 0xEF)
        return 0;
    if (b[1] != 0xBB)
        return 0;
    return b[2] == 0xBF;
}

/* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */
/* Like MVM_string_utf8_decode, except that a leading byte order mark, when
 * present, is skipped rather than decoded. */
MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
    const char *start     = utf8;
    size_t      remaining = bytes;

    /* Only look for a BOM when there are enough bytes to hold one. */
    if (remaining >= 3 && its_the_bom(start)) {
        start     += 3;
        remaining -= 3;
    }

    return MVM_string_utf8_decode(tc, result_type, start, remaining);
}

/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached. */
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached.
 *
 * stopper_chars - if non-NULL, stop once this many graphemes were produced.
 * seps          - separators checked after every grapheme via
 *                 MVM_string_decode_stream_maybe_sep; a match also stops.
 *
 * Decoded graphemes are attached to the decode stream, and the consumed
 * input up to the last fully decoded codepoint is discarded from it.
 *
 * Returns 0 when the stream has no byte buffers, 1 when zero chars were
 * requested, and otherwise whether a stopper (count or separator) was
 * reached. Throws an adhoc exception on malformed UTF-8. */
MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                  const MVMint32 *stopper_chars,
                                  MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 state = 0;
    MVMCodepoint codepoint = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, ready, at_start;
    MVMuint32 reached_stopper;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Rough starting-size estimate is number of bytes in the head buffer.
     * NOTE(review): a head buffer of length 0 would give a zero-sized output
     * buffer here - confirm that the decode stream never holds one. */
    bufsize = ds->bytes_head->length;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    at_start = ds->abs_byte_pos == 0;
    reached_stopper = 0;
    while (cur_bytes) {
        /* Process this buffer. Only the head buffer may be partly consumed. */
        MVMint32  pos   = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        char     *bytes = cur_bytes->bytes;
        if (at_start) {
            /* We're right at the start of the stream of things to decode. See
             * if we have a BOM, and skip over it if so. The check is only
             * made when all three bytes are in this first buffer. */
            if (pos + 3 <= cur_bytes->length) {
                if (its_the_bom(bytes + pos)) {
                    pos += 3;
                    last_accept_bytes = cur_bytes;
                    last_accept_pos = pos;
                }
            }
            at_start = 0;
        }
        while (pos < cur_bytes->length) {
            switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
            case UTF8_ACCEPT: {
                MVMint32 first = 1;
                MVMGrapheme32 g;
                /* Remember where the last complete codepoint ended, so that
                 * a trailing partial sequence stays in the stream. */
                last_accept_bytes = cur_bytes;
                last_accept_pos = pos;
                ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &(ds->norm), codepoint, &g);
                while (ready--) {
                    /* The first grapheme was returned in g; fetch the rest. */
                    if (first)
                        first = 0;
                    else
                        g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
                    if (count == bufsize) {
                        /* Valid character, but we filled the buffer. Attach this
                        * one to the buffers linked list, and continue with a new
                        * one. */
                        MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                        buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                        count = 0;
                    }
                    buffer[count++] = g;
                    total++;
                    if (stopper_chars && *stopper_chars == total) {
                        reached_stopper = 1;
                        goto done;
                    }
                    if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) {
                        reached_stopper = 1;
                        goto done;
                    }
                }
                break;
            }
            case UTF8_REJECT:
                /* The current output buffer has not been handed to the
                 * decode stream yet, so release it before throwing. */
                MVM_free(buffer);
                MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
                break;
            }
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}

/* Encodes the specified string to UTF-8. */
/* Encodes the specified string to UTF-8.
 *
 * output_size        - if non-NULL, receives the number of bytes produced.
 * start, length      - validated against the number of graphemes in str; a
 *                      length of -1 means "the whole string". length also
 *                      seeds the initial size of the output buffer.
 *                      NOTE(review): the codepoint iterator below is set up
 *                      on str as a whole and start is not used after the
 *                      range check - confirm whether substring encoding is
 *                      meant to be honoured here.
 * replacement        - if non-NULL, its UTF-8 encoding is emitted in place of
 *                      any codepoint that cannot be encoded; if NULL, such a
 *                      codepoint raises an encoding exception.
 * translate_newlines - passed on to the codepoint iterator.
 *
 * Returns a buffer obtained from MVM_malloc that the caller must free. No
 * NUL terminator is written. */
char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
        MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length,
        MVMString *replacement, MVMint32 translate_newlines) {
    MVMuint8        *result;
    size_t           result_pos, result_limit;
    MVMCodepointIter ci;
    MVMStringIndex   strgraphs = MVM_string_graphs(tc, str);
    MVMuint8        *repl_bytes = NULL;
    MVMuint64        repl_length = 0;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length == -1)
        length = strgraphs;
    if (length < 0 || start + length > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    /* Encode the replacement up front; it has no replacement of its own. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* Guesstimate that we'll be within 2 bytes for most chars most of the
     * time, and give ourselves 4 bytes breathing space. */
    result_limit = 2 * length;
    result       = MVM_malloc(result_limit + 4);
    result_pos   = 0;

    /* Iterate the codepoints and encode them. */
    MVM_string_ci_init(tc, &ci, str, translate_newlines);
    while (MVM_string_ci_has_more(tc, &ci)) {
        MVMint32 bytes;
        MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);
        /* Establish result_pos < result_limit before encoding, so that the
         * 4 spare bytes always cover the longest (4-byte) sequence. Doubling
         * once is not enough when the limit is 0 or very small, so keep
         * growing until the invariant holds. */
        if (result_pos >= result_limit) {
            do {
                result_limit = result_limit ? result_limit * 2 : 4;
            } while (result_pos >= result_limit);
            result = MVM_realloc(result, result_limit + 4);
        }
        bytes = utf8_encode(result + result_pos, cp);
        if (bytes)
            result_pos += bytes;
        else if (replacement) {
            /* Make sure the whole replacement fits. With result_pos below
             * result_limit, growing by repl_length is always sufficient. */
            if (repl_length >= result_limit || result_pos >= result_limit - repl_length) {
                result_limit += repl_length;
                result = MVM_realloc(result, result_limit + 4);
            }
            memcpy(result + result_pos, repl_bytes, repl_length);
            result_pos += repl_length;
        }
        else {
            /* Nothing to substitute: release our buffers and throw. */
            MVM_free(result);
            MVM_free(repl_bytes);
            MVM_string_utf8_throw_encoding_exception(tc, cp);
        }
    }

    if (output_size)
        *output_size = (MVMuint64)result_pos;
    MVM_free(repl_bytes);
    return (char *)result;
}

/* Encodes the specified string to UTF-8. */
/* Encodes the whole of str to UTF-8 with no replacement string; this is
 * MVM_string_utf8_encode_substr called with start 0 and length -1. */
char * MVM_string_utf8_encode(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size,
        MVMint32 translate_newlines) {
    char *encoded = MVM_string_utf8_encode_substr(
        tc, str, output_size, 0, -1, NULL, translate_newlines);
    return encoded;
}

/* Encodes the specified string to a UTF-8 C string. */
/* Encodes the specified string to UTF-8 and returns it as a NUL-terminated C
 * string allocated with MVM_malloc. Newlines are not translated. */
char * MVM_string_utf8_encode_C_string(MVMThreadContext *tc, MVMString *str) {
    MVMuint64  encoded_len;
    char      *encoded = MVM_string_utf8_encode(tc, str, &encoded_len, 0);
    /* Mostly called from error-handling code, so NULs embedded in the string
     * are not treated specially. XXX TODO: make sure every caller frees the
     * returned buffer. */
    char      *c_string = MVM_malloc(encoded_len + 1);

    /* Copy into a buffer one byte larger and terminate it. */
    memcpy(c_string, encoded, encoded_len);
    c_string[encoded_len] = (char)0;
    MVM_free(encoded);

    return c_string;
}

/* Throws an adhoc exception saying that cp cannot be encoded as UTF-8. The
 * message distinguishes codepoints above 0x10FFFF, surrogates (general
 * category "Cs") and everything else. */
void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) {
    const char *general_category = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);

    if (cp <= 0x10FFFF) {
        if (strcmp("Cs", general_category) != 0) {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode codepoint %d (0x%X)",
                cp, cp);
        }
        else {
            MVM_exception_throw_adhoc(tc,
                "Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)",
                cp, cp);
        }
    }
    else {
        MVM_exception_throw_adhoc(tc,
            "Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)",
            cp, cp, 0x10FFFF, 0x10FFFF);
    }
}

Coverage Report

Created: 2017-04-15 07:07

Line	Count	Source (jump to first uncovered line)
1		#include "moar.h"
2
3		/* The below section has an MIT-style license, included here.
4
5		// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6		// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
7		*
8		* Permission is hereby granted, free of charge, to any person obtaining
9		* a copy of this software and associated documentation files (the
10		* "Software"), to deal in the Software without restriction, including
11		* without limitation the rights to use, copy, modify, merge, publish,
12		* distribute, sublicense, and/or sell copies of the Software, and to
13		* permit persons to whom the Software is furnished to do so, subject
14		* to the following conditions:
15		*
16		* The above copyright notice and this permission notice shall be
17		* included in all copies or substantial portions of the Software.
18		*
19		* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20		* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21		* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22		* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23		* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24		* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25		* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26		* SOFTWARE.
27		*/
28	1.52M	#define UTF8_ACCEPT 0
29	0	#define UTF8_REJECT 12
30
31		static const MVMuint8 utf8d[] = {
32		// The first part of the table maps bytes to character classes that
33		// to reduce the size of the transition table and create bitmasks.
34		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
36		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
37		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
38		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
39		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
40		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
41		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
42
43		// The second part is a transition table that maps a combination
44		// of a state of the automaton and a character class to a state.
45		0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
46		12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
47		12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
48		12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
49		12,36,12,12,12,12,12,12,12,12,12,12,
50		};
51
52		static MVMint32
53	801k	decode_utf8_byte(MVMint32 *state, MVMGrapheme32 *codep, MVMuint8 byte) {
54	801k	MVMint32 type = utf8d[byte];
55	801k
56	801k	*codep = (*state != UTF8_ACCEPT) ?
57	107k	(byte & 0x3fu) \| (*codep << 6) :
58	694k	(0xff >> type) & (byte);
59	801k
60	801k	*state = utf8d[256 + *state + type];
61	801k	return *state;
62	801k	}
63		/* end Bjoern Hoehrmann section (some things were changed from the original) */
64
65		/* begin not_gerd section (modified from original)
66		// Copyright 2012 not_gerd
67		// see http://irclog.perlgeek.de/perl6/2012-06-04#i_5681122
68
69		Permission is granted to use, modify, and / or redistribute at will.
70
71		This includes removing authorship notices, re-use of code parts in
72		other software (with or without giving credit), and / or creating a
73		commercial product based on it.
74
75		This permission is not revocable by the author.
76
77		This software is provided as-is. Use it at your own risk. There is
78		no warranty whatsoever, neither expressed nor implied, and by using
79		this software you accept that the author(s) shall not be held liable
80		for any loss of data, loss of service, or other damages, be they
81		incidental or consequential. Your only option other than accepting
82		this is not to use the software at all.
83		*/
84
85		enum {
86		CP_CHAR = 1 << 0,
87		CP_LOW_SURROGATE = 1 << 1,
88		CP_HIGH_SURROGATE = 1 << 2,
89		CP_NONCHAR = 1 << 3,
90		CP_OVERFLOW = 1 << 4,
91
92		U8_SINGLE = 1 << 5,
93		U8_DOUBLE = 1 << 6,
94		U8_TRIPLE = 1 << 7,
95		U8_QUAD = 1 << 8
96		};
97
98	1.10M	static unsigned classify(MVMCodepoint cp) {
99	1.10M	if(cp <= 0x7F)
100	1.10M	return CP_CHAR \| U8_SINGLE;
101	1.10M
102	3.61k	if(cp <= 0x07FF)
103	2.98k	return CP_CHAR \| U8_DOUBLE;
104	3.61k
105	632	if(0xD800 <= cp && cp <= 0xDBFF)
106	0	return CP_HIGH_SURROGATE \| U8_TRIPLE;
107	632
108	632	if(0xDC00 <= cp && cp <= 0xDFFF)
109	0	return CP_LOW_SURROGATE \| U8_TRIPLE;
110	632
111	632	if(0xFDD0 <= cp && cp <= 0xFDEF)
112	0	return CP_NONCHAR \| U8_TRIPLE;
113	632
114	632	if(cp <= 0xFFFD)
115	621	return CP_CHAR \| U8_TRIPLE;
116	632
117	11	if(cp == 0xFFFE \|\| cp == 0xFFFF)
118	0	return CP_NONCHAR \| U8_TRIPLE;
119	11
120	11	if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE \|\| (cp & 0xFFFF) == 0xFFFF))
121	0	return CP_NONCHAR \| U8_QUAD;
122	11
123	11	if(cp <= 0x10FFFF)
124	11	return CP_CHAR \| U8_QUAD;
125	11
126	0	if(cp <= 0x1FFFFF)
127	0	return CP_OVERFLOW \| U8_QUAD;
128	0
129	0	return 0;
130	0	}
131
132	1.10M	static MVMint32 utf8_encode(MVMuint8 *bp, MVMCodepoint cp) {
133	1.10M	unsigned cc = classify(cp);
134	1.10M
135	1.10M	if (!(cc & (CP_CHAR \| CP_NONCHAR)))
136	0	return 0;
137	1.10M
138	1.10M	if (cc & U8_SINGLE) {
139	1.10M	bp[0] = (MVMuint8)cp;
140	1.10M	return 1;
141	1.10M	}
142	1.10M
143	3.61k	if (cc & U8_DOUBLE) {
144	2.98k	bp[0] = (MVMuint8)(( 6 << 5) \| (cp >> 6));
145	2.98k	bp[1] = (MVMuint8)(( 2 << 6) \| (cp & 0x3F));
146	2.98k	return 2;
147	2.98k	}
148	3.61k
149	632	if (cc & U8_TRIPLE) {
150	621	bp[0] = (MVMuint8)((14 << 4) \| (cp >> 12));
151	621	bp[1] = (MVMuint8)(( 2 << 6) \| ((cp >> 6) & 0x3F));
152	621	bp[2] = (MVMuint8)(( 2 << 6) \| ( cp & 0x3F));
153	621	return 3;
154	621	}
155	632
156	11	if (cc & U8_QUAD) {
157	11	bp[0] = (MVMuint8)((30 << 3) \| (cp >> 18));
158	11	bp[1] = (MVMuint8)(( 2 << 6) \| ((cp >> 12) & 0x3F));
159	11	bp[2] = (MVMuint8)(( 2 << 6) \| ((cp >> 6) & 0x3F));
160	11	bp[3] = (MVMuint8)(( 2 << 6) \| ( cp & 0x3F));
161	11	return 4;
162	11	}
163	11
164	0	return 0;
165	11	}
166
167		/* end not_gerd section */
168
169	0	#define UTF8_MAXINC (32 * 1024 * 1024)
170
171		/* Decodes the specified number of bytes of utf8 into an NFG string, creating
172		* a result of the specified type. The type must have the MVMString REPR. */
173	25.9k	MVMString * MVM_string_utf8_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *utf8, size_t bytes) {
174	25.9k	MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
175	25.9k	MVMint32 count = 0;
176	25.9k	MVMCodepoint codepoint;
177	25.9k	MVMint32 line_ending = 0;
178	25.9k	MVMint32 state = 0;
179	25.9k	MVMint32 bufsize = bytes;
180	25.9k	MVMGrapheme32 lowest_graph = 0x7fffffff;
181	25.9k	MVMGrapheme32 highest_graph = -0x7fffffff;
182	25.9k	MVMGrapheme32 *buffer = MVM_malloc(sizeof(MVMGrapheme32) * bufsize);
183	25.9k	size_t orig_bytes;
184	25.9k	const char *orig_utf8;
185	25.9k	MVMint32 line;
186	25.9k	MVMint32 col;
187	25.9k	MVMint32 ready;
188	25.9k
189	25.9k	/* Need to normalize to NFG as we decode. */
190	25.9k	MVMNormalizer norm;
191	25.9k	MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
192	25.9k
193	25.9k	orig_bytes = bytes;
194	25.9k	orig_utf8 = utf8;
195	25.9k
196	343k	for (; bytes; ++utf8, --bytes) {
197	317k	switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) {
198	210k	case UTF8_ACCEPT: { /* got a codepoint */
199	210k	MVMGrapheme32 g;
200	210k	ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, codepoint, &g);
201	210k	if (ready) {
202	184k	while (count + ready >= bufsize) { /* if the buffer's full make a bigger one */
203	0	buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (
204	0	bufsize >= UTF8_MAXINC ? (bufsize += UTF8_MAXINC) : (bufsize *= 2)
205	0	));
206	0	}
207	184k	buffer[count++] = g;
208	149k	lowest_graph = g < lowest_graph ? g : lowest_graph;
209	102k	highest_graph = g > highest_graph ? g : highest_graph;
210	184k	while (--ready > 0) {
211	299	g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
212	288	lowest_graph = g < lowest_graph ? g : lowest_graph;
213	283	highest_graph = g > highest_graph ? g : highest_graph;
214	299	buffer[count++] = g;
215	299	}
216	184k	}
217	210k	break;
218	210k	}
219	0	case UTF8_REJECT:
220	0	/* found a malformed sequence; parse it again this time tracking
221	0	* line and col numbers. */
222	0	MVM_unicode_normalizer_cleanup(tc, &norm); /* Since we'll throw. */
223	0	bytes = orig_bytes; utf8 = orig_utf8; state = 0; line = 1; col = 1;
224	0	for (; bytes; ++utf8, --bytes) {
225	0	switch(decode_utf8_byte(&state, &codepoint, (MVMuint8)*utf8)) {
226	0	case UTF8_ACCEPT:
227	0	/* this could be reorganized into several nested ugly if/else :/ */
228	0	if (!line_ending && (codepoint == 10 \|\| codepoint == 13)) {
229	0	/* Detect the style of line endings.
230	0	* Select whichever comes first.
231	0	* First or only part of first line ending. */
232	0	line_ending = codepoint;
233	0	col = 1; line++;
234	0	}
235	0	else if (line_ending && codepoint == line_ending) {
236	0	/* first or only part of next line ending */
237	0	col = 1; line++;
238	0	}
239	0	else if (codepoint == 10 \|\| codepoint == 13) {
240	0	/* second part of line ending; ignore */
241	0	}
242	0	else /* non-line ending codepoint */
243	0	col++;
244	0	break;
245	0	case UTF8_REJECT:
246	0	MVM_free(buffer);
247	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-8 at line %u col %u", line, col);
248	0	}
249	0	}
250	0	MVM_free(buffer);
251	0	MVM_exception_throw_adhoc(tc, "Concurrent modification of UTF-8 input buffer!");
252	0	break;
253	317k	}
254	317k	}
255	25.9k	if (state != UTF8_ACCEPT) {
256	0	MVM_unicode_normalizer_cleanup(tc, &norm);
257	0	MVM_free(buffer);
258	0	MVM_exception_throw_adhoc(tc, "Malformed termination of UTF-8 string");
259	0	}
260	25.9k
261	25.9k	/* Get any final graphemes from the normalizer, and clean it up. */
262	25.9k	MVM_unicode_normalizer_eof(tc, &norm);
263	25.9k	ready = MVM_unicode_normalizer_available(tc, &norm);
264	25.9k	if (ready) {
265	25.8k	if (count + ready >= bufsize) {
266	19.9k	buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * (count + ready));
267	19.9k	}
268	51.6k	while (ready--) {
269	25.8k	MVMGrapheme32 g;
270	25.8k	g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
271	25.4k	lowest_graph = g < lowest_graph ? g : lowest_graph;
272	12.9k	highest_graph = g > highest_graph ? g : highest_graph;
273	25.8k	buffer[count++] = g;
274	25.8k	}
275	25.8k	}
276	25.9k	MVM_unicode_normalizer_cleanup(tc, &norm);
277	25.9k
278	25.9k	/* If we're lucky, we can fit our string in 8 bits per grapheme.
279	25.9k	* That happens when our lowest value is bigger than -129 and our
280	25.9k	* highest value is lower than 128. */
281	25.9k	if (lowest_graph >= -128 && highest_graph < 128) {
282	20.0k	MVMGrapheme8 *new_buffer = MVM_malloc(sizeof(MVMGrapheme8) * count);
283	174k	for (ready = 0; ready < count; ready++) {
284	154k	new_buffer[ready] = buffer[ready];
285	154k	}
286	20.0k	MVM_free(buffer);
287	20.0k	result->body.storage.blob_8 = new_buffer;
288	20.0k	result->body.storage_type = MVM_STRING_GRAPHEME_8;
289	5.97k	} else {
290	5.97k	/* just keep the same buffer as the MVMString's buffer. Later
291	5.97k	* we can add heuristics to resize it if we have enough free
292	5.97k	* memory */
293	5.97k	if (bufsize - count > 4) {
294	5.35k	buffer = MVM_realloc(buffer, count * sizeof(MVMGrapheme32));
295	5.35k	}
296	5.97k	result->body.storage.blob_32 = buffer;
297	5.97k	result->body.storage_type = MVM_STRING_GRAPHEME_32;
298	5.97k	}
299	25.9k	result->body.num_graphs = count;
300	25.9k
301	25.9k	return result;
302	25.9k	}
303
304	177	static MVMint32 its_the_bom(const char *utf8) {
305	177	const MVMuint8 uns_utf8 = (const MVMuint8 )utf8;
306	0	return uns_utf8[0] == 0xEF && uns_utf8[1] == 0xBB && uns_utf8[2] == 0xBF;
307	177	}
308
309		/* Same as MVM_string_utf8_decode, but strips a BOM if it finds one. */
310	4	MVMString * MVM_string_utf8_decode_strip_bom(MVMThreadContext tc, const MVMObject result_type, const char *utf8, size_t bytes) {
311	4	if (bytes >= 3 && its_the_bom(utf8)) {
312	0	utf8 += 3;
313	0	bytes -= 3;
314	0	}
315	4	return MVM_string_utf8_decode(tc, result_type, utf8, bytes);
316	4	}
317
318		/* Decodes using a decodestream. Decodes as far as it can with the input
319		* buffers, or until a stopper is reached. */
320		MVMuint32 MVM_string_utf8_decodestream(MVMThreadContext tc, MVMDecodeStream ds,
321		const MVMint32 *stopper_chars,
322	242	MVMDecodeStreamSeparators *seps) {
323	242	MVMint32 count = 0, total = 0;
324	242	MVMint32 state = 0;
325	242	MVMCodepoint codepoint = 0;
326	242	MVMint32 bufsize;
327	242	MVMGrapheme32 *buffer;
328	242	MVMDecodeStreamBytes *cur_bytes;
329	242	MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
330	242	MVMint32 last_accept_pos, ready, at_start;
331	242	MVMuint32 reached_stopper;
332	242
333	242	/* If there's no buffers, we're done. */
334	242	if (!ds->bytes_head)
335	24	return 0;
336	218	last_accept_pos = ds->bytes_head_pos;
337	218
338	218	/* If we're asked for zero chars, also done. */
339	218	if (stopper_chars && *stopper_chars == 0)
340	0	return 1;
341	218
342	218	/* Rough starting-size estimate is number of bytes in the head buffer. */
343	218	bufsize = ds->bytes_head->length;
344	218	buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
345	218
346	218	/* Decode each of the buffers. */
347	218	cur_bytes = ds->bytes_head;
348	218	at_start = ds->abs_byte_pos == 0;
349	218	reached_stopper = 0;
350	402	while (cur_bytes) {
351	219	/* Process this buffer. */
352	218	MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
353	219	char *bytes = cur_bytes->bytes;
354	219	if (at_start) {
355	180	/* We're right at the start of the stream of things to decode. See
356	180	* if we have a BOM, and skip over it if so. */
357	180	if (pos + 3 <= cur_bytes->length) {
358	176	if (its_the_bom(bytes + pos)) {
359	0	pos += 3;
360	0	last_accept_bytes = cur_bytes;
361	0	last_accept_pos = pos;
362	0	}
363	176	}
364	180	at_start = 0;
365	180	}
366	484k	while (pos < cur_bytes->length) {
367	483k	switch(decode_utf8_byte(&state, &codepoint, bytes[pos++])) {
368	483k	case UTF8_ACCEPT: {
369	483k	MVMint32 first = 1;
370	483k	MVMGrapheme32 g;
371	483k	last_accept_bytes = cur_bytes;
372	483k	last_accept_pos = pos;
373	483k	ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &(ds->norm), codepoint, &g);
374	967k	while (ready--) {
375	483k	if (first)
376	468k	first = 0;
377	483k	else
378	14.9k	g = MVM_unicode_normalizer_get_grapheme(tc, &(ds->norm));
379	483k	if (count == bufsize) {
380	0	/* Valid character, but we filled the buffer. Attach this
381	0	* one to the buffers linked list, and continue with a new
382	0	* one. */
383	0	MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
384	0	buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
385	0	count = 0;
386	0	}
387	483k	buffer[count++] = g;
388	483k	total++;
389	483k	if (stopper_chars && *stopper_chars == total) {
390	5	reached_stopper = 1;
391	5	goto done;
392	5	}
393	483k	if (MVM_string_decode_stream_maybe_sep(tc, seps, g)) {
394	30	reached_stopper = 1;
395	30	goto done;
396	30	}
397	483k	}
398	483k	break;
399	483k	}
400	0	case UTF8_REJECT:
401	0	MVM_exception_throw_adhoc(tc, "Malformed UTF-8");
402	0	break;
403	483k	}
404	483k	}
405	184	cur_bytes = cur_bytes->next;
406	184	}
407	218	done:
408	218
409	218	/* Attach what we successfully parsed as a result buffer, and trim away
410	218	* what we chewed through. */
411	218	if (count) {
412	214	MVM_string_decodestream_add_chars(tc, ds, buffer, count);
413	214	}
414	4	else {
415	4	MVM_free(buffer);
416	4	}
417	218	MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
418	218
419	218	return reached_stopper;
420	218	}
421
422		/* Encodes the specified string to UTF-8. */
423		char * MVM_string_utf8_encode_substr(MVMThreadContext *tc,
424		MVMString str, MVMuint64 output_size, MVMint64 start, MVMint64 length,
425	13.6k	MVMString *replacement, MVMint32 translate_newlines) {
426	13.6k	MVMuint8 *result;
427	13.6k	size_t result_pos, result_limit;
428	13.6k	MVMCodepointIter ci;
429	13.6k	MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
430	13.6k	MVMuint8 *repl_bytes = NULL;
431	13.6k	MVMuint64 repl_length;
432	13.6k
433	13.6k	if (start < 0 \|\| start > strgraphs)
434	0	MVM_exception_throw_adhoc(tc, "start out of range");
435	13.6k	if (length == -1)
436	13.5k	length = strgraphs;
437	13.6k	if (length < 0 \|\| start + length > strgraphs)
438	0	MVM_exception_throw_adhoc(tc, "length out of range");
439	13.6k
440	13.6k	if (replacement)
441	0	repl_bytes = (MVMuint8 *) MVM_string_utf8_encode_substr(tc,
442	0	replacement, &repl_length, 0, -1, NULL, translate_newlines);
443	13.6k
444	13.6k	/* Guesstimate that we'll be within 2 bytes for most chars most of the
445	13.6k	* time, and give ourselves 4 bytes breathing space. */
446	13.6k	result_limit = 2 * length;
447	13.6k	result = MVM_malloc(result_limit + 4);
448	13.6k	result_pos = 0;
449	13.6k
450	13.6k	/* Iterate the codepoints and encode them. */
451	13.6k	MVM_string_ci_init(tc, &ci, str, translate_newlines);
452	1.12M	while (MVM_string_ci_has_more(tc, &ci)) {
453	1.10M	MVMint32 bytes;
454	1.10M	MVMCodepoint cp = MVM_string_ci_get_codepoint(tc, &ci);
455	1.10M	if (result_pos >= result_limit) {
456	4	result_limit *= 2;
457	4	result = MVM_realloc(result, result_limit + 4);
458	4	}
459	1.10M	bytes = utf8_encode(result + result_pos, cp);
460	1.10M	if (bytes)
461	1.10M	result_pos += bytes;
462	0	else if (replacement) {
463	0	if (repl_length >= result_limit \|\| result_pos >= result_limit - repl_length) {
464	0	result_limit += repl_length;
465	0	result = MVM_realloc(result, result_limit + 4);
466	0	}
467	0	memcpy(result + result_pos, repl_bytes, repl_length);
468	0	result_pos += repl_length;
469	0	}
470	0	else {
471	0	MVM_free(result);
472	0	MVM_free(repl_bytes);
473	0	MVM_string_utf8_throw_encoding_exception(tc, cp);
474	0	}
475	1.10M	}
476	13.6k
477	13.6k	if (output_size)
478	13.6k	*output_size = (MVMuint64)result_pos;
479	13.6k	MVM_free(repl_bytes);
480	13.6k	return (char *)result;
481	13.6k	}
482
483		/* Encodes the specified string to UTF-8. */
484		char * MVM_string_utf8_encode(MVMThreadContext tc, MVMString str, MVMuint64 *output_size,
485	2.17k	MVMint32 translate_newlines) {
486	2.17k	return MVM_string_utf8_encode_substr(tc, str, output_size, 0, -1, NULL,
487	2.17k	translate_newlines);
488	2.17k	}
489
490		/* Encodes the specified string to a UTF-8 C string. */
491	1.96k	char * MVM_string_utf8_encode_C_string(MVMThreadContext tc, MVMString str) {
492	1.96k	MVMuint64 output_size;
493	1.96k	char * result;
494	1.96k	char * utf8_string = MVM_string_utf8_encode(tc, str, &output_size, 0);
495	1.96k	/* this is almost always called from error-handling code. Don't care if it
496	1.96k	* contains embedded NULs. XXX TODO: Make sure all uses of this free what it returns */
497	1.96k	result = MVM_malloc(output_size + 1);
498	1.96k	memcpy(result, utf8_string, output_size);
499	1.96k	MVM_free(utf8_string);
500	1.96k	result[output_size] = (char)0;
501	1.96k	return result;
502	1.96k	}
503
504	0	void MVM_string_utf8_throw_encoding_exception (MVMThreadContext *tc, MVMCodepoint cp) {
505	0	const char *gencat = MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_GENERAL_CATEGORY);
506	0	if(cp > 0x10FFFF) {
507	0	MVM_exception_throw_adhoc(tc,
508	0	"Error encoding UTF-8 string: could not encode codepoint %d (0x%X), codepoint out of bounds. Cannot encode higher than %d (0x%X)",
509	0	cp, cp, 0x10FFFF, 0x10FFFF);
510	0	}
511	0	else if (strcmp("Cs", gencat) == 0) {
512	0	MVM_exception_throw_adhoc(tc,
513	0	"Error encoding UTF-8 string: could not encode Unicode Surrogate codepoint %d (0x%X)",
514	0	cp, cp);
515	0	}
516	0	else {
517	0	MVM_exception_throw_adhoc(tc,
518	0	"Error encoding UTF-8 string: could not encode codepoint %d (0x%X)",
519	0	cp, cp);
520	0	}
521	0	}