/home/travis/build/MoarVM/MoarVM/src/strings/ops.c

Source (jump to first uncovered line)
#include "platform/memmem.h"
#include "platform/memmem32.h"
#include "moar.h"
#define MVM_DEBUG_STRANDS 0
#define MVM_string_KMP_max_pattern_length 8192
/* Max value possible for MVMuint32 MVMStringBody.num_graphs */
#define MAX_GRAPHEMES     0xFFFFFFFFLL

#if MVM_DEBUG_STRANDS
static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) {
    MVMGraphemeIter gi;
    MVMuint32       len;
    MVM_string_gi_init(tc, &gi, s);
    len = 0;
    while (MVM_string_gi_has_more(tc, &gi)) {
        MVM_string_gi_get_grapheme(tc, &gi);
        len++;
    }
    if (len != MVM_string_graphs(tc, s))
        MVM_exception_throw_adhoc(tc,
            "Strand sanity check failed (strand length %d != num_graphs %d)",
            len, MVM_string_graphs(tc, s));
}
#define STRAND_CHECK(tc, s) check_strand_sanity(tc, s);
#else
#define STRAND_CHECK(tc, s)
#endif

static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in);
#if MVM_DEBUG_NFG
static char * NFG_check_make_debug_string (MVMThreadContext *tc, MVMGrapheme32 g) {
    char *result = NULL;
    char *picked = NULL;
    if (g == '\r')
        picked = "\\r";
    else if (g == '\n')
        picked = "\\n";
    else if (g == MVM_nfg_crlf_grapheme(tc))
        picked = "\\r\\n";
    else if (0 <= g && !MVM_string_is_control_full(tc, g))
        result = MVM_string_utf8_encode_C_string(tc, MVM_string_chr(tc, g));
    else if (g < 0) {
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
        char *format_str = " with num_codes = ";
        char *format_str2 = " first, second cp = ";
        char *synthtype_str = synth->is_utf8_c8 ?  "utf8-8 Synthetic" :  "Normal Synthetic";
        int this_len = strlen(format_str) + strlen(synthtype_str) + 6 + strlen(format_str2) + 11 + 1 + 11 + 1;
        result = MVM_malloc(this_len);
        if (2 <= synth->num_codes)
            sprintf(result, "%s%s%5i%s%.10"PRIi32",%.10"PRIi32"", synthtype_str, format_str, synth->num_codes, format_str2, synth->codes[0], synth->codes[1]);
        else
            sprintf(result, "WARNING synth has less than 2 codes");
        fprintf(stderr, "synth numcodes %i %"PRIi32"\n",
            MVM_nfg_get_synthetic_info(tc, synth->codes[1])->num_codes, MVM_nfg_get_synthetic_info(tc, synth->codes[1])->codes[0]
        );
    }
    else
        picked = "[Control]";
    if (picked) {
        result = MVM_malloc(sizeof(char) * (strlen(picked) + 1));
        strcpy(result, picked);
    }
    if (!result) {
        result = MVM_malloc(sizeof(char) * 1);
        result[0] = 0;
    }
    return result;
}
static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname);
void NFG_check (MVMThreadContext *tc, MVMString *orig, char *varname) {
    char *out = NFG_checker(tc, orig, varname);
    char *waste[2] = { out, NULL };
    if (!out)
        return;
    MVM_exception_throw_adhoc_free(tc, waste, "%s", out);
}
static char * NFG_checker (MVMThreadContext *tc, MVMString *orig, char *varname) {
    MVMString *renorm = NULL;
    MVMStringIndex orig_graphs = MVM_string_graphs(tc, orig),
                   renorm_graphs = -1;
    MVMROOT2(tc, orig, renorm, {
        renorm = re_nfg(tc, orig);
        renorm_graphs = MVM_string_graphs(tc, renorm);
    });
    if (MVM_DEBUG_NFG_STRICT || orig_graphs != renorm_graphs) {
        MVMGraphemeIter orig_gi, renorm_gi;
        MVMint64 index = 0;
        MVM_string_gi_init(tc, &orig_gi, orig);
        MVM_string_gi_init(tc, &renorm_gi, renorm);
        while (MVM_string_gi_has_more(tc,  &orig_gi) && MVM_string_gi_has_more(tc,  &renorm_gi)) {
            MVMGrapheme32 orig_g   = MVM_string_gi_get_grapheme(tc, &orig_gi),
                          renorm_g = MVM_string_gi_get_grapheme(tc, &renorm_gi);
            if (orig_g != renorm_g) {
                char *orig_render   = NFG_check_make_debug_string(tc, orig_g),
                     *renorm_render = NFG_check_make_debug_string(tc, renorm_g);
                char *format = "NFG failure. Got different grapheme count of %s. "
                    "Got %i but after re_nfg got %i\n"
                        "Differing grapheme at index %"PRIi64"\n"
                            "orig: %"PRIi32"  (%s)  after re_nfg: %"PRIi32"  (%s)\n";
                int out_size = strlen(orig_render) + strlen(renorm_render)
                    + strlen(varname) + strlen(format) + (5 * 7) + 1;
                char *out = MVM_malloc(sizeof(char) * out_size);
                char *waste[] = {orig_render, renorm_render, NULL};
                char **w = waste;
                snprintf(out, out_size,
                    format,
                    varname,
                        orig_graphs, renorm_graphs,
                            index,
                                orig_g, orig_render, renorm_g, renorm_render);
                MVM_free(orig_render);
                MVM_free(renorm_render);
                return out;
            }
            index++;
        }
    }
    return NULL;
}
void NFG_check_concat (MVMThreadContext *tc, MVMString *result, MVMString *a, MVMString *b, char *varname) {
    char *a_out = NFG_checker(tc, a, "string ‘a’");
    char *b_out = NFG_checker(tc, b, "string ‘b’");
    char *out = NFG_checker(tc, result, varname);
    char *strings[] = { a_out, b_out, out };
    char *names[]   = { "\nconcat string ‘a’: ", "\nconcat string ‘b’: ", "\nconcat result: " };
    int i = 0, elems = 4;
    int rtrn = 0;
    char * empty = "";
    if (!a_out && !b_out && !out)
        return;
    else {
        MVMGrapheme32 last_a  =  MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1),
                      first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);
        char   *debug_a = NFG_check_make_debug_string(tc, last_a),
               *debug_b = NFG_check_make_debug_string(tc, first_b),
             *escaped_a = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, a)),
             *escaped_b = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, b)),
        *escaped_result = MVM_string_utf8_encode_C_string(tc, MVM_string_escape(tc, result));
        char *waste[] = { out, debug_a, debug_b, escaped_a, escaped_b, escaped_result, NULL };
        MVM_exception_throw_adhoc_free(tc, waste,
            "In concat: a graphs: %"PRIi32" b graphs: %"PRIi32"\n"
            "last_a: %"PRIi32" (%s)  first_b %"PRIi32"  (%s)\n"
            "a: “%s”\n"
            "b: “%s”\n"
            "result: “%s”\n"
            "%s%s%s%s%s%s",
            MVM_string_graphs(tc, a), MVM_string_graphs(tc, b),
            last_a, debug_a, first_b, debug_b,
            escaped_a,
            escaped_b,
            escaped_result,
            (a_out?names[0]:""), (a_out?a_out:""),
            (b_out?names[1]:""), (b_out?b_out:""),
            (out?names[2]:""), (out?out:""));
        }


}
#endif

MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic);
static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset);

/* Allocates strand storage. */
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) {
    return MVM_malloc(num_strands * sizeof(MVMStringStrand));
}

/* Copies strands from one strand string to another. */
static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
        MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
    assert(from->body.storage_type == MVM_STRING_STRAND);
    assert(to->body.storage_type == MVM_STRING_STRAND);
    memcpy(
        to->body.storage.strands + to_offset,
        from->body.storage.strands + from_offset,
        num_strands * sizeof(MVMStringStrand));
}
/* Move strands inside the same strand string. */
static void move_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
        MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
    assert(from->body.storage_type == MVM_STRING_STRAND);
    assert(to->body.storage_type == MVM_STRING_STRAND);
    memmove(
        to->body.storage.strands + to_offset,
        from->body.storage.strands + from_offset,
        num_strands * sizeof(MVMStringStrand));
}

#define can_fit_into_8bit(g) ((-128 <= (g) && (g) <= 127))

MVM_STATIC_INLINE int can_fit_into_ascii (MVMGrapheme32 g) {
    return 0 <= g && g <= 127;
}
/* If a string is currently using 32bit storage, turn it into using
 * 8 bit storage. Doesn't do any checks at all. */
static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) {
    MVMGrapheme32 *old_buf = str->body.storage.blob_32;
    MVMStringIndex i;
    MVMGrapheme8 *dest_buf = NULL;
    MVMStringIndex num_graphs = MVM_string_graphs_nocheck(tc, str);
    str->body.storage_type   = MVM_STRING_GRAPHEME_8;
    dest_buf = str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8));
    MVM_VECTORIZE_LOOP
    for (i = 0; i < num_graphs; i++) {
        dest_buf[i] = old_buf[i];
    }

    MVM_free(old_buf);
}
/* Checks if the next num_graphs graphemes in the iterator can fit into 8 bits.
 * This was written to take advantage of SIMD vectorization, so we use a multiple
 * bitwise operations to check, and biwise OR it with val. Care must be taken
 * to not use any variables altered by the loop outside of the loop and to not
 * have any branching or funcion calls. `i` is not used outside the loop
 * `val` is allowed as biwise OR works with the vectorization well.
 * NOTE: GraphemeIter is not modified by this function. */
static int string_can_be_8bit(MVMThreadContext *tc, MVMGraphemeIter *gi_orig, MVMStringIndex num_graphs) {
    MVMStringIndex pos = 0;
    MVMGraphemeIter gi;
    memcpy(&gi, gi_orig, sizeof(MVMGraphemeIter));
    while (1) {
        MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, &gi);
        MVMStringIndex togo = num_graphs - pos < strand_len
            ? num_graphs - pos
            : strand_len;
        if (MVM_string_gi_blob_type(tc, &gi) == MVM_STRING_GRAPHEME_32) {
            if (!MVM_string_buf32_can_fit_into_8bit(MVM_string_gi_active_blob_32_pos(tc, &gi), togo))
                return 0;
        }
        pos += togo;
        if (num_graphs == pos || !MVM_string_gi_has_more_strands_rep(tc, &gi)) {
            break;
        }
        MVM_string_gi_next_strand_rep(tc, &gi);
    }
    return 1;

}
/* Accepts an allocated string that should have body.num_graphs set but the blob
 * unallocated. This function will allocate the space for the blob and iterate
 * the supplied grapheme iterator for the length of body.num_graphs. Very fast
 * since compilers will convert them to SIMD vector operations. */
static void iterate_gi_into_string(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *result, MVMString *orig, MVMStringIndex num) {
    int result_pos = 0;
    MVMGrapheme8   *result8   = NULL;
    MVMGrapheme32 *result32   = NULL;
    MVMStringIndex result_graphs = MVM_string_graphs_nocheck(tc, result);
    if (!result_graphs)
        return;

    if (string_can_be_8bit(tc, gi, result_graphs)) {
        MVMStringIndex result_pos = 0;
        result->body.storage_type = MVM_STRING_GRAPHEME_8;
        result8 = result->body.storage.blob_8 =
            MVM_malloc(result_graphs * sizeof(MVMGrapheme8));
        while (1) {
            MVMStringIndex strand_len =
                MVM_string_gi_graphs_left_in_strand(tc, gi);
            MVMStringIndex to_copy = result_graphs - result_pos < strand_len
                ? result_graphs - result_pos
                : strand_len;
            MVMGrapheme8  *result_blob8 = result8 + result_pos;
            switch (MVM_string_gi_blob_type(tc, gi)) {
            case MVM_STRING_GRAPHEME_32: {
                MVMStringIndex i;
                MVMGrapheme32 *active_blob =
                    MVM_string_gi_active_blob_32_pos(tc, gi);
                MVM_VECTORIZE_LOOP
                for (i = 0; i < to_copy; i++) {
                    result_blob8[i] = active_blob[i];
                }
                break;
            }
            case MVM_STRING_GRAPHEME_8:
            case MVM_STRING_GRAPHEME_ASCII: {
                memcpy(
                    result_blob8,
                    MVM_string_gi_active_blob_8_pos(tc, gi),
                    to_copy * sizeof(MVMGrapheme8)
                );
                break;
            }
            default:
                MVM_exception_throw_adhoc(tc,
                    "Internal error, string corruption in iterate_gi_into_string\n");
            }
            result_pos += to_copy;
            if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) {
                break;
            }
            MVM_string_gi_next_strand_rep(tc, gi);
        }
    }
    else {
        MVMStringIndex result_pos = 0;
        result->body.storage_type            = MVM_STRING_GRAPHEME_32;
        result32 = result->body.storage.blob_32 =
            result_graphs ? MVM_malloc(result_graphs * sizeof(MVMGrapheme32)) : NULL;
        while (1) {
            MVMStringIndex strand_len = MVM_string_gi_graphs_left_in_strand(tc, gi);
            MVMStringIndex to_copy = result_graphs - result_pos < strand_len
                ? result_graphs - result_pos
                : strand_len;
            switch (MVM_string_gi_blob_type(tc, gi)) {
                case MVM_STRING_GRAPHEME_8:
                case MVM_STRING_GRAPHEME_ASCII: {
                    MVMGrapheme8  *active_blob =
                        MVM_string_gi_active_blob_8_pos(tc, gi);
                    MVMGrapheme32 *result_blob32 = result32 + result_pos;
                    MVMStringIndex i;
                    MVM_VECTORIZE_LOOP
                    for (i = 0; i < to_copy; i++) {
                        result_blob32[i] = active_blob[i];
                    }
                    break;
                }
                case MVM_STRING_GRAPHEME_32: {
                    memcpy(
                        result32 + result_pos,
                        MVM_string_gi_active_blob_32_pos(tc, gi),
                        to_copy * sizeof(MVMGrapheme32)
                    );
                    break;
                }
                default:
                    MVM_exception_throw_adhoc(tc,
                        "Internal error, string corruption in iterate_gi_into_string\n");
            }
            result_pos += to_copy;
            if (result_graphs <= result_pos || !MVM_string_gi_has_more_strands_rep(tc, gi)) {
                break;
            }
            MVM_string_gi_next_strand_rep(tc, gi);
        }
    }
}
#define copy_strands_memcpy(BLOB_TYPE, SIZEOF_TYPE, STORAGE_TYPE) { \
    result->body.storage.BLOB_TYPE = MVM_malloc(sizeof(SIZEOF_TYPE) * MVM_string_graphs_nocheck(tc, orig)); \
    for (i = 0; i < orig->body.num_strands; i++) { \
        size_t graphs_this_strand =  orig->body.storage.strands[i].end - orig->body.storage.strands[i].start; \
        /* If it's 8bit format and there's only one grapheme */ \
        if ((STORAGE_TYPE == MVM_STRING_GRAPHEME_ASCII || STORAGE_TYPE == MVM_STRING_GRAPHEME_8) && graphs_this_strand == 1) { \
            /* If there are not repetitions we can directly set the grapheme */ \
            if (!orig->body.storage.strands[i].repetitions) \
                result->body.storage.BLOB_TYPE[graphs_so_far] = orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start]; \
            /* Otherwise, use memset for the correct number of repetitions */ \
            else { \
                graphs_this_strand += orig->body.storage.strands[i].repetitions; \
                memset(graphs_so_far + result->body.storage.BLOB_TYPE, \
                    orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE[orig->body.storage.strands[i].start], \
                    graphs_this_strand \
                ); \
            } \
            graphs_so_far += graphs_this_strand; \
        } \
        else { \
            int j = 0; \
            for (; j <= orig->body.storage.strands[i].repetitions; j++) { \
                memcpy(graphs_so_far + result->body.storage.BLOB_TYPE, \
                    orig->body.storage.strands[i].blob_string->body.storage.BLOB_TYPE + orig->body.storage.strands[i].start, \
                    sizeof(SIZEOF_TYPE) * graphs_this_strand \
                ); \
                graphs_so_far += graphs_this_strand; \
            } \
        } \
    } \
}
/* Collapses a bunch of strands into a single blob string. */
static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) {
    MVMString      *result = NULL;
    size_t graphs_so_far = 0;

    /* If it's not a strand, just return it */
    if (orig->body.storage_type != MVM_STRING_STRAND)
        return orig;
    /* If the original string is a STRAND and all the composite strands are
     * of the same type, then we will collapse it using memcpy instead of
     * using a grapheme iterator. */
    else {
        size_t i;
        MVMint32 common_storage_type = orig->body.storage.strands[0].blob_string->body.storage_type;
        MVMROOT(tc, orig, {
            result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
            result->body.num_graphs = MVM_string_graphs(tc, orig);
            for (i = 1; i < orig->body.num_strands; i++) {
                if (common_storage_type != orig->body.storage.strands[i].blob_string->body.storage_type) {
                    common_storage_type = -1;
                    break;
                }
            }
            result->body.storage_type = common_storage_type;
            switch (common_storage_type) {
                case MVM_STRING_GRAPHEME_32:
                    copy_strands_memcpy(blob_32, MVMGrapheme32, MVM_STRING_GRAPHEME_32);
                    break;
                case MVM_STRING_GRAPHEME_ASCII:
                case MVM_STRING_GRAPHEME_8:
                    copy_strands_memcpy(blob_8, MVMGrapheme8, MVM_STRING_GRAPHEME_8);
                    break;
                default: {
                    MVMGraphemeIter gi;
                    MVM_string_gi_init(tc, &gi, orig);
                    iterate_gi_into_string(tc, &gi, result, orig, 0);
                }
            }
        });
    }
#if (MVM_DEBUG_STRANDS || MVM_DEBUG_NFG)
    if (!MVM_string_equal(tc, result, orig))
        MVM_exception_throw_adhoc(tc, "result and original were not eq in collapse_strands");
#endif
    return result;
}

/* Takes a string that is no longer in NFG form after some concatenation-style
 * operation, and returns a new string that is in NFG. Note that we could do a
 * much, much, smarter thing in the future that doesn't involve all of this
 * copying and allocation and re-doing the whole string, but cases like this
 * should be fairly rare anyway, so go with simplicity for now. */
static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
    MVMNormalizer norm;
    MVMCodepointIter ci;
    MVMint32 ready;
    MVMString *out = NULL;
    MVMuint32 bufsize = in->body.num_graphs;

    /* Create the output buffer. We used to believe it can't ever be bigger
     * than the initial estimate, but utf8-c8 showed us otherwise. */
    MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
    MVMint64 out_pos = 0;
    /* Iterate codepoints and normalizer. */
    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
    /* Codepoint iterator that passes back utf8-c8 graphemes unchanged */
    MVM_string_ci_init(tc, &ci, in, 0, 1);
    while (MVM_string_ci_has_more(tc, &ci)) {
        MVMGrapheme32 g;
        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
        if (ready) {
            if (out_pos + ready > bufsize) {
                /* Doubling up the buffer size seems excessive, so just
                 * add a generous amount of storage */
                bufsize += ready + 32;
                out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
            }
            out_buffer[out_pos++] = g;
            while (--ready > 0) {
                out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
            }
        }
    }
    MVM_unicode_normalizer_eof(tc, &norm);
    ready = MVM_unicode_normalizer_available(tc, &norm);
    if (out_pos + ready > bufsize) {
        bufsize += ready + 1;
        out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
    }
    while (ready--) {
        out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
    }
    MVM_unicode_normalizer_cleanup(tc, &norm);

    /* Build result string. */
    out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    out->body.storage.blob_32 = out_buffer;
    out->body.storage_type    = MVM_STRING_GRAPHEME_32;
    out->body.num_graphs      = out_pos;
    return out;
}

/* Returns nonzero if two substrings are equal, doesn't check bounds */
MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a,
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
    MVMint64 i;

    /* Fast paths when storage types are identical. */
    switch (a->body.storage_type) {
        case MVM_STRING_GRAPHEME_32:
            if (b->body.storage_type == MVM_STRING_GRAPHEME_32)
                return 0 == memcmp(
                    a->body.storage.blob_32 + starta,
                    b->body.storage.blob_32 + startb,
                    length * sizeof(MVMGrapheme32));
            break;
        case MVM_STRING_GRAPHEME_ASCII:
        case MVM_STRING_GRAPHEME_8:
            if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII ||
                    b->body.storage_type == MVM_STRING_GRAPHEME_8)
                return 0 == memcmp(
                    a->body.storage.blob_8 + starta,
                    b->body.storage.blob_8 + startb,
                    length);
            break;
    }

    /* If both are flat, use MVM_string_get_grapheme_at_nocheck on both for speed */
    if (a->body.storage_type != MVM_STRING_STRAND && b->body.storage_type != MVM_STRING_STRAND) {
        for (i = 0; i < length; i++)
            if (MVM_string_get_grapheme_at_nocheck(tc, a, starta + i)
             != MVM_string_get_grapheme_at_nocheck(tc, b, startb + i))
                return 0;
        return 1;
    }
    else if (a->body.storage_type == MVM_STRING_STRAND && b->body.storage_type == MVM_STRING_STRAND) {
        MVMGraphemeIter gia, gib;
        /* Normal path, for the rest of the time. */
        MVM_string_gi_init(tc, &gia, a);
        MVM_string_gi_init(tc, &gib, b);
        /* Move the grapheme iterator if start is not 0 */
        if (starta) MVM_string_gi_move_to(tc, &gia, starta);
        if (startb) MVM_string_gi_move_to(tc, &gib, startb);
        for (i = 0; i < length; i++)
            if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib))
                return 0;
        return 1;
    }
    else {
        MVMGraphemeIter gi_y;
        MVMString *y = NULL, *z = NULL;
        MVMint64 starty, startz;
        if (a->body.storage_type == MVM_STRING_STRAND) {
                 y = a;           z = b;
            starty = starta; startz = startb;
        }
        else {
                 y = b;           z = a;
            starty = startb; startz = starta;
        }
        MVM_string_gi_init(tc, &gi_y, y);
        if (starty) MVM_string_gi_move_to(tc, &gi_y, starty);
        for (i = 0; i < length; i++)
            if (MVM_string_gi_get_grapheme(tc, &gi_y) != MVM_string_get_grapheme_at_nocheck(tc, z, startz + i))
                return 0;
        return 1;
    }
}
static MVMint64 MVM_string_memmem_grapheme32 (MVMThreadContext *tc, MVMGrapheme32 *H_blob32, MVMGrapheme32 *n_blob32, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) {
    MVMGrapheme32 * rtrn = memmem_uint32(H_blob32 + H_start, H_graphs - H_start, n_blob32, n_graphs);
    return rtrn == NULL ? -1 : rtrn - H_blob32;
}
static MVMint64 MVM_string_memmem_grapheme32str (MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_start, MVMStringIndex H_graphs, MVMStringIndex n_graphs) {
    MVMGrapheme32 *needle_buf = NULL;
    MVMint64 rtrn;
    if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) {
        MVMStringIndex i;
        MVMGraphemeIter n_gi;
        needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme32));
        if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) MVM_string_gi_init(tc, &n_gi, needle);
        for (i = 0; i < needle->body.num_graphs; i++) {
            needle_buf[i] = needle->body.storage_type == MVM_STRING_GRAPHEME_8 ? needle->body.storage.blob_8[i] : MVM_string_gi_get_grapheme(tc, &n_gi);
        }
    }
    rtrn = MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, needle_buf ? needle_buf : needle->body.storage.blob_32, H_start, H_graphs, n_graphs);
    if (needle_buf) MVM_free(needle_buf);
    return rtrn;
}
/* Returns the location of one string in another or -1  */
MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
    size_t index           = (size_t)start;
    MVMStringIndex H_graphs, n_graphs;
    MVM_string_check_arg(tc, Haystack, "index search target");
    MVM_string_check_arg(tc,   needle, "index search term");
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
    n_graphs = MVM_string_graphs_nocheck(tc, needle);

    if (!n_graphs)
        return start <= H_graphs ? start : -1; /* the empty string is in any other string */

    if (!H_graphs)
        return -1;

    if (start < 0 || H_graphs <= start)
        return -1;

    if (H_graphs < n_graphs || n_graphs < 1)
        return -1;

    /* Fast paths when storage types are identical. Uses memmem function, which
     * uses Knuth-Morris-Pratt algorithm on Linux and on others
     * Crochemore+Perrin two-way string matching */
    switch (Haystack->body.storage_type) {
        case MVM_STRING_GRAPHEME_32:
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_32 || needle->body.num_graphs < 100) {
                return MVM_string_memmem_grapheme32str(tc, Haystack, needle, start, H_graphs, n_graphs);
            }
            break;
        case MVM_STRING_GRAPHEME_8:
            if (needle->body.storage_type == MVM_STRING_GRAPHEME_8 || needle->body.num_graphs < 100) {
                void         *mm_return_8 = NULL;
                MVMGrapheme8 *needle_buf  = NULL;
                if (needle->body.storage_type != MVM_STRING_GRAPHEME_8) {
                    MVMStringIndex i;
                    MVMGraphemeIter n_gi;
                    needle_buf = MVM_malloc(needle->body.num_graphs * sizeof(MVMGrapheme8));
                    if (needle->body.storage_type != MVM_STRING_GRAPHEME_32) MVM_string_gi_init(tc, &n_gi, needle);
                    for (i = 0; i < needle->body.num_graphs; i++) {
                        MVMGrapheme32 g = needle->body.storage_type == MVM_STRING_GRAPHEME_32
                            ? needle->body.storage.blob_32[i]
                            : MVM_string_gi_get_grapheme(tc, &n_gi);
                        /* Haystack is 8 bit, needle is 32 bit. if we encounter a non8bit grapheme
                         * it's impossible to match */
                        if (!can_fit_into_8bit(g)) {
                            MVM_free(needle_buf);
                            return -1;
                        }
                        needle_buf[i] = g;
                    }
                }
                mm_return_8 = MVM_memmem(
                    Haystack->body.storage.blob_8 + start, /* start position */
                    (H_graphs - start) * sizeof(MVMGrapheme8), /* length of Haystack from start position to end */
                    needle_buf ? needle_buf : needle->body.storage.blob_8, /* needle start */
                    n_graphs * sizeof(MVMGrapheme8) /* needle length */
                );
                if (needle_buf) MVM_free(needle_buf);
                if (mm_return_8 == NULL)
                    return -1;
                else
                    return (MVMGrapheme8*)mm_return_8 -  Haystack->body.storage.blob_8;
            }
            break;
    }
    /* Minimal code version for needles of size 1 */
    if (n_graphs == 1) {
        MVMGraphemeIter H_gi;
        MVMGrapheme32 n_g = MVM_string_get_grapheme_at_nocheck(tc, needle, 0);
        MVM_string_gi_init(tc, &H_gi, Haystack);
        if (index) MVM_string_gi_move_to(tc, &H_gi, index);
        while (index < H_graphs) {
            if (n_g == MVM_string_gi_get_grapheme(tc, &H_gi))
                return (MVMint64)index;
            index++;
        }
    }
    else if (n_graphs <= MVM_string_KMP_max_pattern_length)
        return knuth_morris_pratt_string_index(tc, needle, Haystack, start);
    else {
        int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0;
        void *Hs_or_gic = Haystack;
        /* If Haystack is a strand allocate space for a MVMGraphemeIter_cached
         * and initialize it */
        if (is_gic) {
            Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached));
            MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start);
        }
        /* For needles > MVM_string_KMP_max_pattern_length we must revert to brute force for now.
         * Eventually we can implement brute force after it matches the whole needle OR
         * allocate more space for the pattern on reaching the end of the pattern */
        while (index <= H_graphs - n_graphs) {
            if (string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle, index, H_graphs, n_graphs, 0, 0, is_gic) != -1)
                return (MVMint64)index;
            index++;
        }
    }
    return -1;
}

/* Returns the location of one string in another or -1  */
MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
    MVMint64 result = -1;
    size_t index;
    MVMStringIndex H_graphs, n_graphs;

    MVM_string_check_arg(tc, Haystack, "rindex search target");
    MVM_string_check_arg(tc, needle, "rindex search term");
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
    n_graphs = MVM_string_graphs_nocheck(tc, needle);

    if (!n_graphs) {
        if (0 <= start)
            return start <= H_graphs ? start : -1; /* the empty string is in any other string */
        else
            return H_graphs; /* no start, so return end */
    }

    if (!H_graphs)
        return -1;

    if (H_graphs < n_graphs || n_graphs < 1)
        return -1;

    if (start == -1)
        start = H_graphs - n_graphs;

    if (start < 0 || H_graphs <= start)
        /* maybe return -1 instead? */
        MVM_exception_throw_adhoc(tc, "index start offset out of range");

    index = start;

    if (H_graphs < index + n_graphs) {
        index = H_graphs - n_graphs;
    }

    /* brute force for now. horrible, yes. halp. */
    do {
        if (MVM_string_substrings_equal_nocheck(tc, needle, 0, n_graphs, Haystack, index)) {
            result = (MVMint64)index;
            break;
        }
    } while (0 < index--);
    return result;
}

/* Returns a substring of the given string */
MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) {
    MVMString *result;
    MVMint64   start_pos, end_pos;

    MVMint64 agraphs;

    MVM_string_check_arg(tc, a, "substring");
    /* convert to signed to avoid implicit arithmetic conversions */
    agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a);

    /* -1 signifies go to the end of the string; anything less is a bug */
    if (length < -1)
        MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length);

    /* negative offsets count from the end */
    start_pos = offset < 0 ? offset + agraphs : offset;
    end_pos   = length == -1 ? agraphs : start_pos + length;

    /* return an empty string if start_pos is out of bounds but positive */
    if (agraphs < start_pos) {
        start_pos = 0;
        end_pos   = 0;
    }

    if (end_pos < 0)
        MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos);

    /* Ensure we're within bounds. */
    if (start_pos < 0)
        start_pos = 0;
    if (agraphs < end_pos)
        end_pos = agraphs;

    /* Check trivial cases: empty string and whole string. */
    if (start_pos == end_pos)
        return tc->instance->str_consts.empty;
    if (start_pos == 0 && end_pos == agraphs)
        return a;

    /* Construct a result; how we efficiently do so will vary based on the
     * input string. */
    MVMROOT(tc, a, {
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
        result->body.num_graphs = end_pos - start_pos;
        if (a->body.storage_type != MVM_STRING_STRAND) {
            /* It's some kind of buffer. Construct a strand view into it. */
            result->body.storage_type    = MVM_STRING_STRAND;
            result->body.storage.strands = allocate_strands(tc, 1);
            result->body.num_strands     = 1;
            result->body.storage.strands[0].blob_string = a;
            result->body.storage.strands[0].start       = start_pos;
            result->body.storage.strands[0].end         = end_pos;
            result->body.storage.strands[0].repetitions = 0;
        }
        else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
            /* Single strand string; quite possibly already a substring. We'll
             * just produce an updated view. */
            MVMStringStrand *orig_strand = &(a->body.storage.strands[0]);
            result->body.storage_type    = MVM_STRING_STRAND;
            result->body.storage.strands = allocate_strands(tc, 1);
            result->body.num_strands     = 1;
            result->body.storage.strands[0].blob_string = orig_strand->blob_string;
            result->body.storage.strands[0].start       = orig_strand->start + start_pos;
            result->body.storage.strands[0].end         = orig_strand->start + end_pos;
            result->body.storage.strands[0].repetitions = 0;
        }
        else {
            /* Produce a new blob string, collapsing the strands. */
            MVMGraphemeIter gi;
            MVM_string_gi_init(tc, &gi, a);
            MVM_string_gi_move_to(tc, &gi, start_pos);
            iterate_gi_into_string(tc, &gi, result, a, start_pos);
        }
    });

    STRAND_CHECK(tc, result);
    return result;
}

MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) {
    /* XXX this could probably be done more efficiently directly. */
    MVMString *first_part = NULL;
    MVMString *rest_part  = NULL;
    MVMString *result     = NULL;

    MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement);
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&original);
    MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part);
    first_part = MVM_string_substring(tc, original, 0, start);

    rest_part  = MVM_string_substring(tc, original, start + count, -1);
    rest_part  = MVM_string_concatenate(tc, replacement, rest_part);
    result     = MVM_string_concatenate(tc, first_part, rest_part);


    STRAND_CHECK(tc, result);
    NFG_CHECK(tc, result, "MVM_string_replace");
    MVM_gc_root_temp_pop_n(tc, 3);
    return result;
}

static MVMString * string_from_strand_at_index(MVMThreadContext *tc, MVMString *a, MVMuint16 index) {
    MVMStringStrand *ss = &(a->body.storage.strands[index]);
    return MVM_string_substring(tc, ss->blob_string, ss->start, ss->end - ss->start);
}

static MVMuint16 final_strand_match_with_repetition_count(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    if (a->body.storage_type == MVM_STRING_STRAND) {
        MVMStringStrand *sa = &(a->body.storage.strands[a->body.num_strands - 1]);
        /* If the final strand of a eq b, we'll just increment the final strand of a's repetitions. */
        if (sa->end - sa->start == MVM_string_graphs_nocheck(tc, b)) {
            if (MVM_string_equal_at(tc, sa->blob_string, b, sa->start))
                return 1;
        }
        /* If the final strand of a eq the first (and only) strand of b, we'll just add b's repetitions
   * (plus 1 for the strand itself) to the final strand of a's repetitions. */
        else if (b->body.storage_type == MVM_STRING_STRAND && b->body.num_strands == 1) {
            MVMStringStrand *sb = &(b->body.storage.strands[0]);
            if (sa->end - sa->start == sb->end - sb->start)
                if (MVM_string_equal(tc, string_from_strand_at_index(tc, a, a->body.num_strands - 1), string_from_strand_at_index(tc, b, 0)))
                    return b->body.storage.strands[0].repetitions + 1;
  }
    }
    return 0;
}

/* Append one string to another. */
MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMString *result = NULL, *renormalized_section = NULL;
    int renormalized_section_graphs = 0, consumed_a = 0, consumed_b = 0;
    MVMuint32  agraphs, bgraphs;
    MVMuint64  total_graphs;
    int lost_strands          = 0;
    int is_concat_stable      = 0;
    int index_ss_b;
    MVMuint16 matching_repetition_count;
    MVM_string_check_arg(tc, a, "concatenate");
    MVM_string_check_arg(tc, b, "concatenate");

    /* Simple empty-string cases. */
    agraphs = MVM_string_graphs_nocheck(tc, a);
    if (agraphs == 0)
        return b;
    bgraphs = MVM_string_graphs_nocheck(tc, b);
    if (bgraphs == 0)
        return a;

    is_concat_stable = MVM_nfg_is_concat_stable(tc, a, b);

    /* If is_concat_stable equals 0 and a and b are not repetitions. */
    if (is_concat_stable == 0 && !(a->body.storage_type == MVM_STRING_STRAND && a->body.storage.strands[a->body.num_strands - 1].repetitions)
    && !(b->body.storage_type == MVM_STRING_STRAND && b->body.storage.strands[0].repetitions)) {
        MVMCodepoint last_a_first_b[2] = {
            MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1),
            MVM_string_get_grapheme_at_nocheck(tc, b, 0)
        };
        MVMROOT2(tc, a, b, {
        /* If both are not synthetics, we can can pass those values unchanged
         * instead of iterating by codepoint */
        if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) {
            renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2);
            consumed_a = 1; consumed_b = 1;
        }
        else {
            MVMCodepointIter last_a_ci;
            MVMCodepointIter first_b_ci;
            MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci,  last_a_first_b[0], 1);
            MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1], 1);
            /* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */
            MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint));
            MVMuint32 i = 0;
            for (; MVM_string_grapheme_ci_has_more(tc, &last_a_ci); i++) {
                last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &last_a_ci);
            }
            for (; MVM_string_grapheme_ci_has_more(tc, &first_b_ci); i++) {
                last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &first_b_ci);
            }
            renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b_codes, a_codes + b_codes);
            consumed_a = 1; consumed_b = 1;
        }
        });
        if (renormalized_section) {
            if (agraphs == consumed_a && bgraphs == consumed_b) {
                NFG_CHECK_CONCAT(tc, renormalized_section, a, b, "renormalized_section");
                return renormalized_section;
            }
            renormalized_section_graphs = MVM_string_graphs_nocheck(tc, renormalized_section);
        }
    }
    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
    total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs;
    if (MAX_GRAPHEMES < total_graphs)
        MVM_exception_throw_adhoc(tc,
            "Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %lld",
             total_graphs, MAX_GRAPHEMES);

    /* Otherwise, we'll assemble a result string. */
    MVMROOT4(tc, a, b, renormalized_section, result, {

        /* Allocate it. */
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);

        /* Total graphemes is trivial; just total up inputs. */
        result->body.num_graphs = (MVMuint32)total_graphs;

        /* Result string will be made of strands. */
        result->body.storage_type = MVM_STRING_STRAND;

        /* Detect the wonderful case where we're repeatedly concating the same
         * string again and again, and thus can just bump a repetition. */
        if (is_concat_stable == 1 && (matching_repetition_count = final_strand_match_with_repetition_count(tc, a, b))) {
            /* We have it; just copy the strands to a new string and bump the
             * repetitions count of the last one. */
            result->body.storage.strands = allocate_strands(tc, a->body.num_strands);
            copy_strands(tc, a, 0, result, 0, a->body.num_strands);
            result->body.storage.strands[a->body.num_strands - 1].repetitions += matching_repetition_count;
            result->body.num_strands = a->body.num_strands;
        }

        /* Otherwise, construct a new strand string. */
        else {
            /* See if we have too many strands between the two. If so, we will
             * collapse the biggest side. */
            MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND
                ? a->body.num_strands
                : 1;
            MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND
                ? b->body.num_strands
                : 1;
            MVMString *effective_a = a;
            MVMString *effective_b = b;
            if (MVM_STRING_MAX_STRANDS < strands_a + strands_b) {
                MVMROOT(tc, result, {
                    if (strands_b <= strands_a) {
                        effective_a = collapse_strands(tc, effective_a);
                        strands_a   = 1;
                    }
                    else {
                        effective_b = collapse_strands(tc, effective_b);
                        strands_b   = 1;
                    }
                });
            }
            /* Assemble the result. */
            result->body.num_strands = strands_a + strands_b + (renormalized_section_graphs ? 1 : 0);
            result->body.storage.strands = allocate_strands(tc, result->body.num_strands);
            /* START 1 */
            if (effective_a->body.storage_type == MVM_STRING_STRAND) {
                copy_strands(tc, effective_a, 0, result, 0, strands_a);
            }
            else {
                int index_ss_a = 0;
                MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
                ss_a->blob_string = effective_a;
                ss_a->start       = 0;
                ss_a->end         = effective_a->body.num_graphs;
                ss_a->repetitions = 0;
            }
            if (renormalized_section) {
                int index_ss_re;
                int index_ss_a = strands_a - 1;
                /* Tweak the end of the last strand of string a. Since if a is made up of multiple strands, we can't just refer to index 0 and instead erfer to strands_a - 1 */
                MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
                MVMStringStrand *ss_re = NULL;
                ss_a->end -= consumed_a;
                /* If the strands ends up to be zero length we need to reduce the number of strand_index and also incease lost_strands so the next operation writes over it */
                if (ss_a->start == ss_a->end)
                    lost_strands++;
            /* END 1 */
            /* START 1.5 (only triggered in some cases) */
                index_ss_re = strands_a - lost_strands;
                ss_re = &(result->body.storage.strands[index_ss_re]);

                /* Add the renormalized section in as a strand */
                ss_re->blob_string = renormalized_section;
                ss_re->start       = 0;
                ss_re->end         = renormalized_section->body.num_graphs;
                ss_re->repetitions = 0;
                if (ss_re->start == ss_re->end) {
                    MVM_exception_throw_adhoc(tc, "Unexpected error in concatenation: renormalized_section is 0 graphemes.\n");
                    /* renormalized_section should always be at least one grapheme
                     * in length so throw if it does not (zero length is an error
                     * we shouldn't lost_strands++ unlike the other strands */
                }
            /* END 1.5 */
            }
            /* START 2 */
            index_ss_b = strands_a - lost_strands + (renormalized_section_graphs ? 1 : 0 );
            if (effective_b->body.storage_type == MVM_STRING_STRAND) {
                copy_strands(tc, effective_b, 0, result, index_ss_b, strands_b);
            }
            else {
                MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
                ss_b->blob_string = effective_b;
                ss_b->start       = 0;
                ss_b->end         = effective_b->body.num_graphs;
                ss_b->repetitions = 0;
            }
            if (renormalized_section_graphs) {
                /* Tweak the beginning of the first strand of string b */
                MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
                ss_b->start += consumed_b;
                if (ss_b->start == ss_b->end) {
                    lost_strands++;
                    move_strands(tc, result, index_ss_b + 1, result, index_ss_b, strands_b - 1);
                }
            /* END 2 */
            /* Adjust result->num_strands */
                if (lost_strands)
                    result->body.num_strands -= lost_strands;
                /* Adjust result->num_graphs */
                result->body.num_graphs += renormalized_section_graphs - consumed_b - consumed_a;
            }
        }
    STRAND_CHECK(tc, result);
    if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section))
        NFG_CHECK_CONCAT(tc, result, a, b, "'result'");
    });
    if (is_concat_stable == 1 || (is_concat_stable == 0 && renormalized_section))
        return result;
    /* If it's regional indicator (is_concat_stable == 2) */
    return re_nfg(tc, result);
}

MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) {
    MVMString *result = NULL;
    MVMuint32  agraphs;
    MVMuint64  total_graphs;

    MVM_string_check_arg(tc, a, "repeat");

    /* Validate count; handle common cases. */
    if (count == 0)
        return tc->instance->str_consts.empty;
    if (count == 1)
        return a;
    if (count < 0)
        MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be negative", count);
    if (MAX_GRAPHEMES < count)
        MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be greater than max allowed number of graphemes %lld", count, MAX_GRAPHEMES);

    /* If input string is empty, repeating it is empty. */
    agraphs = MVM_string_graphs_nocheck(tc, a);
    if (agraphs == 0)
        return tc->instance->str_consts.empty;

    /* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
    total_graphs = (MVMuint64)agraphs * (MVMuint64)count;
    if (MAX_GRAPHEMES < total_graphs)
        MVM_exception_throw_adhoc(tc,
            "Can't repeat string, required number of graphemes (%"PRIu32" * %"PRIu64") greater than max allowed of %lld",
             agraphs, count, MAX_GRAPHEMES);

    /* Now build a result string with the repetition set. */
    MVMROOT(tc, a, {
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
        result->body.num_graphs      = agraphs * count;
        result->body.storage_type    = MVM_STRING_STRAND;
        result->body.storage.strands = allocate_strands(tc, 1);
        if (a->body.storage_type == MVM_STRING_STRAND) {
            if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
                copy_strands(tc, a, 0, result, 0, 1);
            }
            else {
                MVMROOT(tc, result, {
                    a = collapse_strands(tc, a);
                });
                result->body.storage.strands[0].blob_string = a;
                result->body.storage.strands[0].start       = 0;
                result->body.storage.strands[0].end         = agraphs;
            }
        }
        else {
            result->body.storage.strands[0].blob_string = a;
            result->body.storage.strands[0].start       = 0;
            result->body.storage.strands[0].end         = agraphs;
        }
        result->body.storage.strands[0].repetitions = count - 1;
        result->body.num_strands = 1;
    });
    /* If string a is not stable under concatenation, we need to create a flat
     * string and ensure it is normalized */
    if (!MVM_nfg_is_concat_stable(tc, a, a))
        result = re_nfg(tc, result);
    STRAND_CHECK(tc, result);
    return result;
}

void MVM_string_say(MVMThreadContext *tc, MVMString *a) {
    MVM_string_check_arg(tc, a, "say");
    MVM_string_print(tc, MVM_string_concatenate(tc, a,
        tc->instance->str_consts.platform_newline));
}

void MVM_string_print(MVMThreadContext *tc, MVMString *a) {
    MVMOSHandle *handle = (MVMOSHandle *)tc->instance->stdout_handle;
    MVMuint64 encoded_size;
    char *encoded;
    MVM_string_check_arg(tc, a, "print");
    encoded = MVM_string_utf8_encode(tc, a, &encoded_size, MVM_TRANSLATE_NEWLINE_OUTPUT);
    MVM_io_write_bytes_c(tc, tc->instance->stdout_handle, encoded, encoded_size);
    MVM_free(encoded);
}
/* Meant to be pased in a MVMNormalizer of type MVM_NORMALIZE_NFD */
static MVMGrapheme32 ord_getbasechar (MVMThreadContext *tc, MVMGrapheme32 g) {
    /* If we get a synthetic, extract the base codepoint and call ord_getbasechar again */
    if (g < 0) {
        MVMNFGSynthetic *synth = MVM_nfg_get_synthetic_info(tc, g);
        return ord_getbasechar(tc, synth->codes[synth->base_index]);
    }
    else {
        MVMGrapheme32 return_g;
        MVMint32 ready;
        MVMNormalizer norm;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);

        ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &return_g);
        MVM_unicode_normalizer_eof(tc, &norm);
        if (!ready)
            return_g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
        return return_g;
    }
}
/* Tests whether one string a has the other string b as a substring at that index */
MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) {

    MVMStringIndex agraphs, bgraphs;

    MVM_string_check_arg(tc, a, "equal_at");
    MVM_string_check_arg(tc, b, "equal_at");

    agraphs = MVM_string_graphs_nocheck(tc, a);
    bgraphs = MVM_string_graphs_nocheck(tc, b);

    if (offset < 0) {
        offset += agraphs;
        if (offset < 0)
            offset = 0; /* XXX I think this is the right behavior here */
    }
    if (agraphs - offset < bgraphs)
        return 0;
    return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0);
}
/* Ensure return value can hold numbers at least 3x higher than MVMStringIndex.
 * Theoretically if the string has all ﬃ ligatures and 1/3 the max size of
 * MVMStringIndex in length, we could have some weird results. */

/* ignoremark is 0 for normal operation and 1 for ignoring diacritics */
MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, void *Hs_or_gic, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase, int is_gic) {
    MVMuint32 H_fc_cps;
    /* An additional needle offset which is used only when codepoints expand
     * when casefolded. The offset is the number of additional codepoints that
     * have been seen so Haystack and needle stay aligned */
    MVMint64 n_offset = 0;
    MVMint64 i, j;
    MVMGrapheme32 H_g, n_g;
    for (i = 0; i + H_start < H_graphs && i + n_offset < n_fc_graphs; i++) {
        const MVMCodepoint* H_result_cps;
        H_g = is_gic ? MVM_string_gi_cached_get_grapheme(tc, Hs_or_gic, H_start + i) : MVM_string_get_grapheme_at_nocheck(tc, Hs_or_gic, H_start + i);
        if (!ignorecase) {
            H_fc_cps = 0;
        }
        else if (0 <= H_g) {
            /* For codeponits we can get the case change directly */
            H_fc_cps = MVM_unicode_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, &H_result_cps);
        }
        else {
            /* Synthetics must use this function */
            H_fc_cps = MVM_nfg_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &H_result_cps);
        }
        /* If we get 0 for the number that means the cp doesn't change when casefolded */
        if (H_fc_cps == 0) {
            n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
            if (ignoremark) {
                H_g = ord_getbasechar(tc, H_g);
                n_g = ord_getbasechar(tc, n_g);
            }
            if (H_g != n_g)
                return -1;
        }
        else if (1 <= H_fc_cps) {
            for (j = 0; j < H_fc_cps; j++) {
                n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
                H_g = H_result_cps[j];
                if (ignoremark) {
                    H_g = ord_getbasechar(tc, H_g);
                    n_g = ord_getbasechar(tc, n_g);
                }
                if (H_g != n_g)
                    return -1;
                n_offset++;
            }
            n_offset--;
        }
    }
    return n_offset;
    /* We return -1 if the strings are not equal and 0 or more if they are equal
     * The return values from 0, 1 etc designate how many Haystack graphemes
     * were expanded.
     * This may seem like an odd arangement, but this extra information is needed
     * to determine the length of the Haystack which was traversed, as it can
     * differ from the length of the needle if there are expansions. */
}
/* Checks if needle exists at the offset, but ignores case.
 * Sometimes there is a difference in length of a string before and after foldcase,
 * because of this we must compare this differently than just foldcasing both
 * strings to ensure the offset is correct */
static MVMint64 string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset, int ignoremark, int ignorecase) {
    /* Foldcase version of needle */
    MVMString *needle_fc = NULL;
    MVMStringIndex H_graphs = MVM_string_graphs(tc, Haystack);
    MVMStringIndex n_graphs = MVM_string_graphs(tc, needle);
    MVMStringIndex n_fc_graphs;
    /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
    MVMint64 H_expansion;

    if (H_offset < 0) {
        H_offset += H_graphs;
        if (H_offset < 0)
            H_offset = 0; /* XXX I think this is the right behavior here */
    }
    /* If the offset is greater or equal to the number of Haystack graphemes
     * return 0. Since size of graphemes could change under casefolding, we
     * can't assume too much. If optimizing this be careful */
    if (H_graphs <= H_offset)
        return 0;
    MVMROOT(tc, Haystack, {
        needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle;
    });
    n_fc_graphs = MVM_string_graphs(tc, needle_fc);
    if (Haystack->body.storage_type == MVM_STRING_STRAND) {
        MVMGraphemeIter_cached H_gic;
        MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, &H_gic, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 1);
    }
    else {
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 0);
    }
    if (0 <= H_expansion)
        return n_fc_graphs <= H_graphs + H_expansion - H_offset ? 1 : 0;
    return 0;
}
/* Processes the pattern. The pattern must be able to store negative and positive
 * numbers. It must be able to store at least 1/2 the length of the needle,
 * though possibly more (though I am not sure it's possible for it to be more than
 * 1/2). */
static void knuth_morris_pratt_process_pattern (MVMThreadContext *tc, MVMString *pat, MVMint16 *next, MVMStringIndex pat_graphs) {
    MVMint64 i = 0;
    MVMint64 j = next[0] = -1;
    while (i < pat_graphs) {
        if (j == -1 || MVM_string_get_grapheme_at_nocheck(tc, pat, i)
                    == MVM_string_get_grapheme_at_nocheck(tc, pat, j)) {
            i++; j++;
            next[i] = (i < pat_graphs
            && MVM_string_get_grapheme_at_nocheck(tc, pat, j)
            == MVM_string_get_grapheme_at_nocheck(tc, pat, i))
                ? next[j]
                : j;
        }
        else j = next[j];
    }
}

static MVMint64 knuth_morris_pratt_string_index (MVMThreadContext *tc, MVMString *needle, MVMString *Haystack, MVMint64 H_offset) {
    MVMint64 needle_offset = 0;
    MVMint64 text_offset   = H_offset;
    MVMStringIndex Haystack_graphs = MVM_string_graphs_nocheck(tc, Haystack);
    MVMStringIndex needle_graphs   = MVM_string_graphs_nocheck(tc, needle);
    MVMint16         *next = NULL;
    MVMString *flat_needle = NULL;
    size_t next_size = (1 + needle_graphs) * sizeof(MVMint16);
    int    next_is_malloced = 0;
    assert(needle_graphs <= MVM_string_KMP_max_pattern_length);
    /* Empty string is found at start of string */
    if (needle_graphs == 0)
        return 0;
    /* Allocate max 8K onto the stack, otherwise malloc */
    if (next_size < 3000)
        next = alloca(next_size);
    else {
        next = MVM_malloc(next_size);
        next_is_malloced = 1;
    }
    /* If the needle is a strand, flatten it, otherwise use the original string */
    flat_needle = needle->body.storage_type == MVM_STRING_STRAND
        ? collapse_strands(tc, needle)
        : needle;
    /* Process the needle into a jump table put into variable 'next' */
    knuth_morris_pratt_process_pattern(tc, flat_needle, next, needle_graphs);
    /* If the Haystack is a strand, use MVM_string_gi_cached_get_grapheme
     * since it retains its grapheme iterator over invocations unlike
     * MVM_string_get_grapheme_at_nocheck and caches the previous grapheme. It
     * is slower for flat Haystacks though. */
    #define MVM_kmp_loop(Haystack_function) {\
        while (text_offset < Haystack_graphs && needle_offset < needle_graphs) {\
            if (needle_offset == -1 || MVM_string_get_grapheme_at_nocheck(tc, flat_needle, needle_offset)\
                                    == (Haystack_function)) {\
                text_offset++; needle_offset++;\
                if (needle_offset == needle_graphs) {\
                    if (next_is_malloced) MVM_free(next);\
                    return text_offset - needle_offset;\
                }\
            }\
            else needle_offset = next[needle_offset];\
        }\
    }
    if (Haystack->body.storage_type == MVM_STRING_STRAND) {
        MVMGraphemeIter_cached H_gic;
        MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
        MVM_kmp_loop(MVM_string_gi_cached_get_grapheme(tc, &H_gic, text_offset));
    }
    else {
        MVM_kmp_loop(MVM_string_get_grapheme_at_nocheck(tc, Haystack, text_offset));
    }
    if (next_is_malloced) MVM_free(next);
    return -1;
}
static MVMint64 string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start, int ignoremark, int ignorecase) {
    /* Foldcase version of needle */
    MVMString *needle_fc = NULL;
    MVMStringIndex n_fc_graphs;

    size_t index           = (size_t)start;
    MVMStringIndex H_graphs, n_graphs;
    /* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
    MVMint64 H_expansion;
    MVMint64 return_val = -1;
    int is_gic = Haystack->body.storage_type == MVM_STRING_STRAND ? 1 : 0;
    void *Hs_or_gic = Haystack;
    MVM_string_check_arg(tc, Haystack, ignoremark ? "index ignore case ignore mark search target" : "index ignore case search target");
    MVM_string_check_arg(tc, needle,   ignoremark ? "index ignore case ignore mark search term"   : "index ignore case search term");
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
    n_graphs = MVM_string_graphs_nocheck(tc, needle);
    if (!n_graphs)
        return start <= H_graphs ? start : -1; /* Empty string is in any other string */
    if (!H_graphs)
        return -1;
    if (start < 0 || H_graphs <= start)
        return -1;
    /* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check
     * checks if it is at all possible for the needle grapheme number to be higher
     * than the Haystack */
    if (H_graphs * 3 < n_graphs)
        return -1;

    if (n_graphs < 1)
        return -1;

    MVMROOT(tc, Haystack, {
        needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle;
    });
    n_fc_graphs = MVM_string_graphs(tc, needle_fc);
    /* brute force for now. horrible, yes. halp. */
    if (is_gic) {
        Hs_or_gic = alloca(sizeof(MVMGraphemeIter_cached));
        MVM_string_gi_cached_init(tc, Hs_or_gic, Haystack, start);
    }
    while (index <= H_graphs) {
        H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Hs_or_gic, needle_fc, index, H_graphs, n_fc_graphs, ignoremark, ignorecase, is_gic);
        if (0 <= H_expansion)
            return n_fc_graphs <= H_graphs + H_expansion - index ? (MVMint64)index : -1;
        index++;
    }
    return -1;
}

MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0, 1);
}
MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
    return string_index_ignore_case(tc, Haystack, needle, start, 0, 1);
}
MVMint64 MVM_string_equal_at_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 0);
}
MVMint64 MVM_string_index_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
    return string_index_ignore_case(tc, Haystack, needle, start, 1, 0);
}
MVMint64 MVM_string_equal_at_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
    return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1, 1);
}
MVMint64 MVM_string_index_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
    return string_index_ignore_case(tc, Haystack, needle, start, 1, 1);
}

MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
    MVMStringIndex agraphs;
    MVMGrapheme32 g;

    MVM_string_check_arg(tc, s, "grapheme_at");

    agraphs = MVM_string_graphs(tc, s);
    if (offset < 0 || agraphs <= offset)
        return -1;

    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);

    return 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
}

/* Gets the base character at a grapheme position, ignoring things like diacritics */
MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
    MVMStringIndex agraphs;
    MVMint32 ready;

    MVM_string_check_arg(tc, s, "ord_basechar_at");

    agraphs = MVM_string_graphs_nocheck(tc, s);
    if (offset < 0 || agraphs <= offset)
        return -1;  /* fixes RT #126771 */

    return ord_getbasechar(tc, MVM_string_get_grapheme_at_nocheck(tc, s, offset));
}


/* Compares two strings for equality. */
MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMStringIndex agraphs, bgraphs;

    MVM_string_check_arg(tc, a, "equal");
    MVM_string_check_arg(tc, b, "equal");

    if (a == b)
        return 1;

    agraphs = MVM_string_graphs_nocheck(tc, a);
    bgraphs = MVM_string_graphs_nocheck(tc, b);

    if (agraphs != bgraphs)
        return 0;
    /* If we have cached hash codes from both a and b we can compare if they are identical.
     * If they don't match then we already know the two strings are not equal. */
    if (a->body.cached_hash_code && b->body.cached_hash_code && a->body.cached_hash_code != b->body.cached_hash_code)
        return 0;

    return MVM_string_substrings_equal_nocheck(tc, a, 0, bgraphs, b, 0);
}

/* more general form of has_at; compares two substrings for equality */
MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a,
        MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {

    MVM_string_check_arg(tc, a, "have_at");
    MVM_string_check_arg(tc, b, "have_at");

    if (starta < 0 || startb < 0)
        return 0;
    if (length == 0)
        return 1;
    if (MVM_string_graphs_nocheck(tc, a) < starta + length || MVM_string_graphs_nocheck(tc, b) < startb + length)
        return 0;

    return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb);
}

/* Returns the grapheme at a given index of the string */
MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
    MVMStringIndex agraphs;

    MVM_string_check_arg(tc, a, "grapheme_at");

    agraphs = MVM_string_graphs_nocheck(tc, a);

    if (index < 0 || agraphs <= index)
        MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64,
            agraphs - 1, index);

    return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index);
}

/* Finds the location of a grapheme in a string.  Useful for small character class lookup */
MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) {
    size_t index = -1;
    MVMGraphemeIter gi;

    MVM_string_check_arg(tc, a, "string_index_of_grapheme");

    MVM_string_gi_init(tc, &gi, a);
    while (MVM_string_gi_has_more(tc, &gi)) {
        index++;
        if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme)
            return index;
    }
    return -1;
}

/* Case change functions. */
MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g);
static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) {
    MVMint64 sgraphs;
    MVM_string_check_arg(tc, s, error);
    sgraphs = MVM_string_graphs_nocheck(tc, s);
    if (sgraphs) {
        MVMString *result;
        MVMGraphemeIter gi;
        MVMint64 result_graphs = sgraphs;
        MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32));
        MVMint32 changed = 0;
        MVMint64 i = 0;
        MVM_string_gi_init(tc, &gi, s);
        while (MVM_string_gi_has_more(tc, &gi)) {
            MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
          peeked:
            if (g == 0x03A3) {
                /* Greek sigma needs special handling when lowercased. */
                switch (type) {
                    case MVM_unicode_case_change_type_upper:
                    case MVM_unicode_case_change_type_title:
                        result_buf[i++] = g;
                        break;
                    case MVM_unicode_case_change_type_lower:
                        changed = 1;
                        if (i == 0) {
                            /* Start of string, so not final. */
                            result_buf[i++] = 0x03C3;
                        }
                        else if (!MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) {
                            /* Previous char is not a letter; not final (as has
                             * to be at end of a word and not only thing in a
                             * word). */
                            result_buf[i++] = 0x03C3;
                        }
                        else if (!MVM_string_gi_has_more(tc, &gi)) {
                            /* End of string. We only reach here if we have a
                             * letter before us, so it must be final. */
                            result_buf[i++] = 0x03C2;
                        }
                        else {
                            /* Letter before us, something ahead of us. Need to
                             * peek ahead to see if it's a letter, to decide if
                             * we have final sigma or not. */
                            g = MVM_string_gi_get_grapheme(tc, &gi);
                            if (MVM_string_grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g))
                                result_buf[i++] = 0x03C3;
                            else
                                result_buf[i++] = 0x03C2;
                            goto peeked;
                        }
                        break;
                    case MVM_unicode_case_change_type_fold:
                        result_buf[i++] = 0x03C3;
                        changed = 1;
                        break;
                }
            }
            else if (0 <= g) {
                const MVMCodepoint *result_cps;
                MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
                    g, type, &result_cps);
                if (num_result_cps == 0) {
                    result_buf[i++] = g;
                }
                else if (num_result_cps == 1) {
                    result_buf[i++] = *result_cps;
                    changed = 1;
                }
                else {
                    /* To maintain NFG, we need to re-normalize when we get an
                     * expansion. */
                    MVMNormalizer norm;
                    MVMint32 num_result_graphs;
                    MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
                    MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps);
                    MVM_unicode_normalizer_eof(tc, &norm);
                    num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);

                    /* Make space for any extra graphemes. */
                    if (1 < num_result_graphs) {
                        result_graphs += num_result_graphs - 1;
                        result_buf = MVM_realloc(result_buf,
                            result_graphs * sizeof(MVMGrapheme32));
                    }

                    /* Copy resulting graphemes. */
                    while (0 < num_result_graphs) {
                        result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
                        num_result_graphs--;
                    }
                    changed = 1;

                    /* Clean up normalizer (we could init one per transform
                     * and keep it around in the future, if we find it's a
                     * worthwhile gain). */
                    MVM_unicode_normalizer_cleanup(tc, &norm);
                }
            }
            else {
                MVMGrapheme32 *transformed;
                MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed);
                if (num_transformed == 0) {
                    result_buf[i++] = g;
                }
                else if (num_transformed == 1) {
                    result_buf[i++] = *transformed;
                    changed = 1;
                }
                else {
                    MVMuint32 j;
                    result_graphs += num_transformed - 1;
                    result_buf = MVM_realloc(result_buf,
                        result_graphs * sizeof(MVMGrapheme32));
                    MVM_VECTORIZE_LOOP
                    for (j = 0; j < num_transformed; j++)
                        result_buf[i++] = transformed[j];
                    changed = 1;
                }
            }
        }
        if (changed) {
            result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
            result->body.num_graphs      = result_graphs;
            result->body.storage_type    = MVM_STRING_GRAPHEME_32;
            result->body.storage.blob_32 = result_buf;
            return result;
        }
        else {
            MVM_free(result_buf);
        }
    }
    STRAND_CHECK(tc, s);
    return s;
}
MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) {
    return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc");
}
MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) {
    return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc");
}
MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) {
    return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc");
}
MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) {
    return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc");
}

/* "Strict"ly (if possible) decodes a C buffer to an MVMString, dependent on the
 * encoding type flag. Unlike MVM_string_decode, it will not pass through
 * codepoints which have no official mapping. `config` can be set to 1 to indicate
 * that you want to decode non-strict ("permissive"), which will try and decode
 * as long as it's possible (For example codepoint 129 in windows-1252 is invalid,
 * but is technically possible to use Unicode codepoint 129 instead (though it's
 * most likely this means the input is actually *not* windows-1252).
 * For now windows-1252 and windows-1251 are the only ones this makes a difference
 * on. And it is mostly irrelevant for utf8/utf8-c8 encodings since they can
 * already represent all codepoints below 0x10FFFF */
MVMString * MVM_string_decode_config(MVMThreadContext *tc,
        const MVMObject *type_object, char *Cbuf, MVMint64 byte_length,
        MVMint64 encoding_flag, MVMString *replacement, MVMint64 config) {
    switch(encoding_flag) {
        case MVM_encoding_type_utf8:
            return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length);
        case MVM_encoding_type_ascii:
            return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length);
        case MVM_encoding_type_latin1:
            return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length);
        case MVM_encoding_type_utf16:
            return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length);
        case MVM_encoding_type_windows1252:
            return MVM_string_windows1252_decode_config(tc, type_object, Cbuf, byte_length, replacement, config);
        case MVM_encoding_type_windows1251:
            return MVM_string_windows1251_decode_config(tc, type_object, Cbuf, byte_length, replacement, config);
        case MVM_encoding_type_shiftjis:
            return MVM_string_shiftjis_decode(tc, type_object, Cbuf, byte_length, replacement, config);
        case MVM_encoding_type_utf8_c8:
            return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length);
        default:
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
    }
}
/* Strictly decodes a C buffer to an MVMString, dependent on the encoding type flag.
 * See the comments above MVM_string_decode_config() above for more details. */
MVMString * MVM_string_decode(MVMThreadContext *tc,
        const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) {
    return MVM_string_decode_config(tc, type_object, Cbuf, byte_length, encoding_flag, NULL, MVM_ENCODING_PERMISSIVE);
}

/* Strictly encodes an MVMString to a C buffer, dependent on the encoding type flag.
 * See comments for MVM_string_decode_config() above for more details. */
char * MVM_string_encode_config(MVMThreadContext *tc, MVMString *s, MVMint64 start,
        MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
        MVMString *replacement, MVMint32 translate_newlines, MVMuint8 config) {
    switch(encoding_flag) {
        case MVM_encoding_type_utf8:
            return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
        case MVM_encoding_type_ascii:
            return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
        case MVM_encoding_type_latin1:
            return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
        case MVM_encoding_type_utf16:
            return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
        case MVM_encoding_type_windows1252:
            return MVM_string_windows1252_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config);
        case MVM_encoding_type_windows1251:
            return MVM_string_windows1251_encode_substr_config(tc, s, output_size, start, length, replacement, translate_newlines, config);
        case MVM_encoding_type_shiftjis:
            return MVM_string_shiftjis_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines, config);
        case MVM_encoding_type_utf8_c8:
            return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement);
        default:
            MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
    }
}
char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start,
        MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
        MVMString *replacement, MVMint32 translate_newlines) {
    return MVM_string_encode_config(tc, s, start, length, output_size, encoding_flag, replacement, translate_newlines, MVM_ENCODING_PERMISSIVE);
}

/* Strictly encodes a string, and writes the encoding string into the supplied Buf
 * instance, which should be an integer array with MVMArray REPR. */
MVMObject * MVM_string_encode_to_buf_config(MVMThreadContext *tc, MVMString *s, MVMString *enc_name,
        MVMObject *buf, MVMString *replacement, MVMint64 config) {
    MVMuint64 output_size;
    MVMuint8 *encoded;
    MVMArrayREPRData *buf_rd;
    MVMuint8 elem_size = 0;

    /* Ensure the target is in the correct form. */
    MVM_string_check_arg(tc, s, "encode");
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
        MVM_exception_throw_adhoc(tc, "encode requires a native array to write into");
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
    if (buf_rd) {
        switch (buf_rd->slot_type) {
            case MVM_ARRAY_I64: elem_size = 8; break;
            case MVM_ARRAY_I32: elem_size = 4; break;
            case MVM_ARRAY_I16: elem_size = 2; break;
            case MVM_ARRAY_I8:  elem_size = 1; break;
            case MVM_ARRAY_U64: elem_size = 8; break;
            case MVM_ARRAY_U32: elem_size = 4; break;
            case MVM_ARRAY_U16: elem_size = 2; break;
            case MVM_ARRAY_U8:  elem_size = 1; break;
        }
    }
    if (!elem_size)
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");
    if (((MVMArray *)buf)->body.slots.any)
        MVM_exception_throw_adhoc(tc, "encode requires an empty array");

    /* At least find_encoding may allocate on first call, so root just
     * in case. */
    MVMROOT2(tc, buf, s, {
        const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name);
        encoded = (MVMuint8 *)MVM_string_encode_config(tc, s, 0, MVM_string_graphs_nocheck(tc, s), &output_size,
            encoding_flag, replacement, 0, config);
    });

    /* Stash the encoded data in the VMArray. */
    ((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded;
    ((MVMArray *)buf)->body.start    = 0;
    ((MVMArray *)buf)->body.ssize    = output_size / elem_size;
    ((MVMArray *)buf)->body.elems    = output_size / elem_size;
    return buf;
}
MVMObject * MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name,
        MVMObject *buf, MVMString *replacement) {
    return MVM_string_encode_to_buf_config(tc, s, enc_name, buf, replacement, MVM_ENCODING_PERMISSIVE);
}
/* Decodes a string using the data from the specified Buf. Decodes "strict" by
 * default, but optionally can be "permissive". */
MVMString * MVM_string_decode_from_buf_config(MVMThreadContext *tc, MVMObject *buf,
        MVMString *enc_name, MVMString *replacement, MVMint64 config) {
    MVMArrayREPRData *buf_rd;
    MVMuint8 encoding_flag;
    MVMuint8 elem_size = 0;

    /* Ensure the source is in the correct form. */
    if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
        MVM_exception_throw_adhoc(tc, "decode requires a native array to read from");
    buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
    if (buf_rd) {
        switch (buf_rd->slot_type) {
            case MVM_ARRAY_I64: elem_size = 8; break;
            case MVM_ARRAY_I32: elem_size = 4; break;
            case MVM_ARRAY_I16: elem_size = 2; break;
            case MVM_ARRAY_I8:  elem_size = 1; break;
            case MVM_ARRAY_U64: elem_size = 8; break;
            case MVM_ARRAY_U32: elem_size = 4; break;
            case MVM_ARRAY_U16: elem_size = 2; break;
            case MVM_ARRAY_U8:  elem_size = 1; break;
        }
    }
    if (!elem_size)
        MVM_exception_throw_adhoc(tc, "encode requires a native int array");

    /* Decode. */
    MVMROOT(tc, buf, {
        encoding_flag = MVM_string_find_encoding(tc, enc_name);
    });
    return MVM_string_decode_config(tc, tc->instance->VMString,
        (char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start),
        ((MVMArray *)buf)->body.elems * elem_size,
        encoding_flag, replacement, config);
}
MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) {
    return MVM_string_decode_from_buf_config(tc, buf, enc_name, NULL, MVM_ENCODING_PERMISSIVE);
}

MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) {
    MVMObject *result = NULL;
    MVMStringIndex start, end, sep_length;
    MVMHLLConfig *hll = MVM_hll_current(tc);

    MVM_string_check_arg(tc, separator, "split separator");
    MVM_string_check_arg(tc, input, "split input");

    MVMROOT3(tc, input, separator, result, {
        result = MVM_repr_alloc_init(tc, hll->slurpy_array_type);
        start = 0;
        end = MVM_string_graphs_nocheck(tc, input);
        sep_length = MVM_string_graphs_nocheck(tc, separator);

        while (start < end) {
            MVMString *portion;
            MVMStringIndex index;
            MVMStringIndex length;

            /* XXX make this use the dual-traverse iterator, but such that it
                can reset the index of what it's comparing... <!> */
            index = MVM_string_index(tc, input, separator, start);
            length = sep_length ? (index == -1 ? end : index) - start : 1;
            if (0 < length || (sep_length && length == 0)) {
                portion = MVM_string_substring(tc, input, start, length);
                MVMROOT(tc, portion, {
                    MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
                    MVM_repr_set_str(tc, pobj, portion);
                    MVM_repr_push_o(tc, result, pobj);
                });
            }
            start += length + sep_length;
            /* Gather an empty string if the delimiter is found at the end. */
            if (sep_length && start == end) {
                MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
                MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty);
                MVM_repr_push_o(tc, result, pobj);
            }
        }
    });

    return result;
}
/* Used in the MVM_string_join function. Moved here to simplify the code */
void copy_to_32bit (MVMThreadContext *tc, MVMString *source,
    MVMString *dest, MVMint64 *position, MVMGraphemeIter *gi) {
    /* Add source. */
    switch (source->body.storage_type) {
        case MVM_STRING_GRAPHEME_32: {
            memcpy(
                dest->body.storage.blob_32 + *position,
                source->body.storage.blob_32,
                source->body.num_graphs * sizeof(MVMGrapheme32));
            *position += source->body.num_graphs;
            break;
        }
        case MVM_STRING_GRAPHEME_ASCII:
        case MVM_STRING_GRAPHEME_8: {
            MVMStringIndex sindex = 0;
            while (sindex < source->body.num_graphs)
                dest->body.storage.blob_32[(*position)++] =
                    source->body.storage.blob_8[sindex++];
            break;
        }
        default:
            MVM_string_gi_init(tc, gi, source);
            while (MVM_string_gi_has_more(tc, gi))
                dest->body.storage.blob_32[(*position)++] =
                    MVM_string_gi_get_grapheme(tc, gi);
            break;
    }
}
/* Used in MVM_string_join to check stability of adding the next piece */
MVM_STATIC_INLINE void join_check_stability(MVMThreadContext *tc, MVMString *piece,
    MVMString *separator, MVMString **pieces, MVMint32 *concats_stable, MVMint64 num_pieces, MVMint64 sgraphs, MVMint64 piece_index) {
    if (!sgraphs) {
        /* If there's no separator and one piece is The Empty String we
         * have to be extra careful about concat stability */
        if (!MVM_string_graphs_nocheck(tc, piece)
                && piece_index + 1 < num_pieces
                && !MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], pieces[piece_index + 1])) {
            *concats_stable = 0;
        }
        /* Separator has no graphemes, so NFG stability check
         * should consider pieces. */
        else if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], piece))
            *concats_stable = 0;
    }
    /* If we have a separator, check concat stability */
    else {
        if (!MVM_nfg_is_concat_stable(tc, pieces[piece_index - 1], separator) /* Before */
         || !MVM_nfg_is_concat_stable(tc, separator, piece)) { /* And after separator */
            *concats_stable = 0;
        }
    }
}
MVM_STATIC_INLINE MVMString * join_get_str_from_pos(MVMThreadContext *tc, MVMObject *array, MVMint64 index, MVMint64 is_str_array) {
    if (is_str_array) {
        MVMString *piece = MVM_repr_at_pos_s(tc, array, index);
        if (piece)
            return piece;
    }
    else {
        MVMObject *item = MVM_repr_at_pos_o(tc, array, index);
        if (item && IS_CONCRETE(item))
            return MVM_repr_get_str(tc, item);
    }
    return (MVMString*)NULL;
}
MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) {
    MVMString  *result = NULL;
    MVMString **pieces = NULL;
    MVMint64    elems, num_pieces, sgraphs, i, is_str_array, total_graphs;
    MVMuint16   sstrands, total_strands;
    MVMint32    concats_stable = 1, all_strands;
    size_t      bytes;

    MVM_string_check_arg(tc, separator, "join separator");
    if (!IS_CONCRETE(input))
        MVM_exception_throw_adhoc(tc, "join needs a concrete array to join");

    /* See how many things we have to join; if the answer is "none" then we
     * can make a hasty escape. */
    elems = MVM_repr_elems(tc, input);
    if (elems == 0)
        return tc->instance->str_consts.empty;
    bytes = elems * sizeof(MVMString *);
    is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc,
        STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR;

    /* If there's only one element to join, just return it. */
    if (elems == 1) {
        MVMString *piece = join_get_str_from_pos(tc, input, 0, is_str_array);
        if (piece)
            return piece;
    }

    /* Allocate result. */
    MVMROOT2(tc, separator, input, {
        result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    });

    /* Take a first pass through the string, counting up length and the total
     * number of strands we encounter as well as building a flat array of the
     * strings (so we only have to do the indirect calls once). */
    sgraphs  = MVM_string_graphs_nocheck(tc, separator);
    if (sgraphs)
        sstrands = separator->body.storage_type == MVM_STRING_STRAND
            ? separator->body.num_strands
            : 1;
    else
        sstrands = 1;
    pieces        = MVM_fixed_size_alloc(tc, tc->instance->fsa, bytes);
    num_pieces    = 0;
    total_graphs  = 0;
    total_strands = 0;
    /* Is the separator a strand? */
    all_strands = separator->body.storage_type == MVM_STRING_STRAND;
    for (i = 0; i < elems; i++) {
        /* Get piece of the string. */
        MVMString *piece = join_get_str_from_pos(tc, input, i, is_str_array);
        MVMint64   piece_graphs;
        if (!piece)
            continue;

        /* Check that all the pieces are strands. */
        if (all_strands)
            all_strands = piece->body.storage_type == MVM_STRING_STRAND;

        /* If it wasn't the first piece, add separator here. */
        if (num_pieces) {
            total_strands += sstrands;
            total_graphs  += sgraphs;
        }

        /* Add on the piece's strands and graphs. */
        piece_graphs = MVM_string_graphs(tc, piece);
        if (piece_graphs) {
            total_strands += piece->body.storage_type == MVM_STRING_STRAND
                ? piece->body.num_strands
                : 1;
            total_graphs += piece_graphs;
        }

        /* Store piece. */
        pieces[num_pieces++] = piece;
    }
    /* This guards the joining by method of multiple concats, and will be faster
     * if we only end up with one piece after going through each element of the array */
    if (num_pieces == 1)
        return pieces[0];
    /* We now know the total eventual number of graphemes. */
    if (total_graphs == 0) {
        MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces);
        return tc->instance->str_consts.empty;
    }
    result->body.num_graphs = total_graphs;

    MVMROOT(tc, result, {
    /* If the separator and pieces are all strands, and there are
     * on average at least 16 graphemes in each of the strands. */
    if (all_strands && total_strands <  MVM_STRING_MAX_STRANDS
              &&  total_strands * 16 <= total_graphs) {
        MVMuint16 offset = 0;
        result->body.storage_type    = MVM_STRING_STRAND;
        result->body.storage.strands = allocate_strands(tc, total_strands);
        result->body.num_strands     = total_strands;
        for (i = 0; i < num_pieces; i++) {
            MVMString *piece = pieces[i];
            if (0 < i) {
                /* No more checks unless still stable */
                if (concats_stable)
                    join_check_stability(tc, piece, separator, pieces,
                        &concats_stable, num_pieces, sgraphs, i);
                copy_strands(tc, separator, 0, result, offset, separator->body.num_strands);
                offset += separator->body.num_strands;
            }
            copy_strands(tc, piece, 0, result, offset, piece->body.num_strands);
            offset += piece->body.num_strands;
        }
    }
    /* Doing multiple concats is only faster if we have about 300 graphemes per
       piece or if we have less than for pieces and more than 150 graphemes per piece */
    else if (total_strands <  MVM_STRING_MAX_STRANDS && (300 < num_pieces/total_graphs || (num_pieces < 4 && 150 < num_pieces/total_graphs))) {
        MVMString *result = NULL;
        MVMROOT(tc, result, {
            if (sgraphs) {
                i = 0;
                result = MVM_string_concatenate(tc, pieces[i++], separator);
                result = MVM_string_concatenate(tc, result, pieces[i++]);
                for (; i < num_pieces;) {
                    result = MVM_string_concatenate(tc, result, separator);
                    result = MVM_string_concatenate(tc, result, pieces[i++]);
                }

            }
            else {
                result = MVM_string_concatenate(tc, pieces[0], pieces[1]);
                i = 2;
                for (; i < num_pieces;) {
                    result = MVM_string_concatenate(tc, result, pieces[i++]);
                }
            }
        });
        return result;
    }
    else {
        /* We'll produce a single, flat string. */
        MVMint64        position = 0;
        MVMGraphemeIter gi;
        result->body.storage_type    = MVM_STRING_GRAPHEME_32;
        result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32));
        for (i = 0; i < num_pieces; i++) {
            /* Get piece. */
            MVMString *piece = pieces[i];

            /* Add separator if needed. */
            if (0 < i) {
                /* No more checks unless still stable */
                if (concats_stable)
                    join_check_stability(tc, piece, separator, pieces,
                        &concats_stable, num_pieces, sgraphs, i);
                /* Add separator */
                if (sgraphs)
                    copy_to_32bit(tc, separator, result, &position, &gi);
            }
            /* Add piece */
            copy_to_32bit(tc, piece, result, &position, &gi);
        }
    }

    MVM_fixed_size_free(tc, tc->instance->fsa, bytes, pieces);
    STRAND_CHECK(tc, result);
    /* if concat is stable and NFG_CHECK on, run a NFG_CHECK on it since it
     * should be properly constructed now */
    if (concats_stable)
        NFG_CHECK(tc, result, "MVM_string_join");
    });
    return concats_stable ? result : re_nfg(tc, result);
}

/* Returning nonzero means it found the char at the position specified in 'a' in 'Haystack'.
 * For character enumerations in regexes. */
MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *Haystack) {
    MVMuint32     H_graphs;
    MVMGrapheme32 search;

    MVM_string_check_arg(tc, a, "char_at_in_string");
    MVM_string_check_arg(tc, Haystack, "char_at_in_string");

    /* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */
    if (offset < 0 || MVM_string_graphs_nocheck(tc, a) <= offset)
        return -2;

    search  = MVM_string_get_grapheme_at_nocheck(tc, a, offset);
    H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
    switch (Haystack->body.storage_type) {
    case MVM_STRING_GRAPHEME_32:
        return MVM_string_memmem_grapheme32(tc, Haystack->body.storage.blob_32, &search, 0, H_graphs, 1);

    case MVM_STRING_GRAPHEME_ASCII:
        if (can_fit_into_ascii(search)) {
            MVMStringIndex i;
            for (i = 0; i < H_graphs; i++)
                if (Haystack->body.storage.blob_ascii[i] == search)
                    return i;
        }
        break;
    case MVM_STRING_GRAPHEME_8:
        if (can_fit_into_8bit(search)) {
            MVMStringIndex i;
            for (i = 0; i < H_graphs; i++)
                if (Haystack->body.storage.blob_8[i] == search)
                    return i;
        }
        break;
    case MVM_STRING_STRAND: {
        MVMGraphemeIter gi;
        MVMStringIndex  i;
        MVM_string_gi_init(tc, &gi, Haystack);
        for (i = 0; i < H_graphs; i++)
            if (MVM_string_gi_get_grapheme(tc, &gi) == search)
                return i;
    }
    }
    return -1;
}

MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) {
    MVMGrapheme32 g;
    MVMCodepoint  cp;

    MVM_string_check_arg(tc, s, "uniprop");

    if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
        return 0;

    g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
    if (g >= 0)
        cp = (MVMCodepoint)g;
    else
        cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0];
    return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code);
}

/* If the string is made up of strands, then produces a flattend string
 * representing the exact same graphemes but without strands. Otherwise,
 * returns the input string. Intended for strings that will be indexed
 * into heavily (when evaluating regexes, for example). */
MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) {
    MVM_string_check_arg(tc, s, "indexingoptimized");
    if (s->body.storage_type == MVM_STRING_STRAND)
        return collapse_strands(tc, s);
    else
        return s;
}

/* Escapes a string, replacing various chars like \n with \\n. Can no doubt be
 * further optimized. */
MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) {
    MVMString      *res     = NULL;
    MVMStringIndex  spos    = 0;
    MVMStringIndex  bpos    = 0;
    MVMStringIndex  sgraphs, balloc;
    MVMGrapheme32  *buffer  = NULL;
    MVMGrapheme32   crlf;
    MVMint8         string_can_fit_into_8bit = 1;

    MVM_string_check_arg(tc, s, "escape");

    sgraphs = MVM_string_graphs_nocheck(tc, s);
    balloc  = sgraphs;
    buffer  = MVM_malloc(sizeof(MVMGrapheme32) * balloc);

    crlf = MVM_nfg_crlf_grapheme(tc);

    for (; spos < sgraphs; spos++) {
        MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
        MVMGrapheme32 esc   = 0;
        switch (graph) {
            case '\\': esc = '\\'; break;
            case 7:    esc = 'a';  break;
            case '\b': esc = 'b';  break;
            case '\n': esc = 'n';  break;
            case '\r': esc = 'r';  break;
            case '\t': esc = 't';  break;
            case '\f': esc = 'f';  break;
            case '"':  esc = '"';  break;
            case 27:   esc = 'e';  break;
        }
        if (esc) {
            if (bpos + 2 > balloc) {
                balloc += 32;
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
            }
            buffer[bpos++] = '\\';
            buffer[bpos++] = esc;
        }
        else if (graph == crlf) {
            if (bpos + 4 > balloc) {
                balloc += 32;
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
            }
            buffer[bpos++] = '\\';
            buffer[bpos++] = 'r';
            buffer[bpos++] = '\\';
            buffer[bpos++] = 'n';
        }
        else {
            if (bpos + 1 > balloc) {
                balloc += 32;
                buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
            }
            if (!can_fit_into_8bit(graph))
                string_can_fit_into_8bit = 0;
            buffer[bpos++] = graph;
        }
    }

    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;
    res->body.storage.blob_32 = buffer;
    res->body.num_graphs      = bpos;

    if (string_can_fit_into_8bit)
        turn_32bit_into_8bit_unchecked(tc, res);

    STRAND_CHECK(tc, res);
    return res;
}

/* Takes a string and reverses its characters. */
MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) {
    MVMString      *res     = NULL;
    MVMStringIndex  spos    = 0;
    MVMStringIndex  sgraphs;
    MVMStringIndex  rpos;

    MVM_string_check_arg(tc, s, "flip");
    sgraphs = MVM_string_graphs_nocheck(tc, s);
    rpos    = sgraphs;

    switch (s->body.storage_type) {
    case MVM_STRING_GRAPHEME_ASCII:
    case MVM_STRING_GRAPHEME_8: {
        MVMGrapheme8   *rbuffer;
        /* Copy the variables, so we can use them just for the loop. This way
         * the loop will vectorize since we won't refer to their values except
         * in the loop. Use size_t to coerce vectorization.  */
        size_t spos_l = spos, rpos_l = rpos;
        rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs);
        MVM_VECTORIZE_LOOP
        while (spos_l < s->body.num_graphs)
            rbuffer[--rpos_l] = s->body.storage.blob_8[spos_l++];

        spos += sgraphs - spos;
        rpos -= sgraphs - spos;

        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
        res->body.storage_type    = s->body.storage_type;
        res->body.storage.blob_8  = rbuffer;
        break;
    }
    default: {
        MVMGrapheme32  *rbuffer;
        rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);

        if (s->body.storage_type == MVM_STRING_GRAPHEME_32) {
            size_t spos_l = spos, rpos_l = rpos;
            MVM_VECTORIZE_LOOP
            while (spos_l < s->body.num_graphs)
                rbuffer[--rpos_l] = s->body.storage.blob_32[spos_l++];

            spos += sgraphs - spos;
            rpos -= sgraphs - spos;
        }
        else
            for (; spos < sgraphs; spos++)
                rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos);

        res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
        res->body.storage_type    = MVM_STRING_GRAPHEME_32;
        res->body.storage.blob_32 = rbuffer;
    }}

    res->body.num_graphs      = sgraphs;

    STRAND_CHECK(tc, res);
    return res;
}

/* Compares two strings, returning -1, 0 or 1 to indicate less than,
 * equal or greater than. */
MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMStringIndex alen, blen, i = 0, scanlen;
    MVMGraphemeIter gi_a, gi_b;

    MVM_string_check_arg(tc, a, "compare");
    MVM_string_check_arg(tc, b, "compare");

    /* Simple cases when one or both are zero length. */
    alen = MVM_string_graphs_nocheck(tc, a);
    blen = MVM_string_graphs_nocheck(tc, b);
    if (alen == 0)
        return blen == 0 ? 0 : -1;
    if (blen == 0)
        return 1;

    /* Otherwise, need to scan them. */
    scanlen = blen < alen ? blen : alen;

    /* Short circuit a case where the other conditionals won't speed it up */
    if (a->body.storage_type == MVM_STRING_STRAND || b->body.storage_type == MVM_STRING_STRAND) {

    }
    else if ((a->body.storage_type == MVM_STRING_GRAPHEME_8 || a->body.storage_type == MVM_STRING_GRAPHEME_ASCII)
          && (b->body.storage_type == MVM_STRING_GRAPHEME_8 || b->body.storage_type == MVM_STRING_GRAPHEME_ASCII)) {
        MVMGrapheme8  *a_blob8 = a->body.storage.blob_8;
        MVMGrapheme8  *b_blob8 = b->body.storage.blob_8;
        while (i < scanlen && a_blob8[i] == b_blob8[i]) {
            i++;
        }
    }
    else if (a->body.storage_type == MVM_STRING_GRAPHEME_32 && b->body.storage_type == MVM_STRING_GRAPHEME_32) {
        MVMGrapheme32  *a_blob32 = a->body.storage.blob_32;
        MVMGrapheme32  *b_blob32 = b->body.storage.blob_32;
        while (i < scanlen && a_blob32[i] == b_blob32[i]) {
            i++;
        }
    }
    else {
        MVMGrapheme32 *blob32 = NULL;
        MVMGrapheme8  *blob8  = NULL;
        switch (a->body.storage_type) {
            case MVM_STRING_GRAPHEME_8:
            case MVM_STRING_GRAPHEME_ASCII:
                blob8 = a->body.storage.blob_8;
                break;
            case MVM_STRING_GRAPHEME_32:
                blob32 = a->body.storage.blob_32;
                break;
            default:
                MVM_exception_throw_adhoc(tc,
                    "String corruption in string compare. Unknown string type.");
        }
        switch (b->body.storage_type) {
            case MVM_STRING_GRAPHEME_8:
            case MVM_STRING_GRAPHEME_ASCII:
                blob8 = b->body.storage.blob_8;
                break;
            case MVM_STRING_GRAPHEME_32:
                blob32 = b->body.storage.blob_32;
                break;
            default:
                MVM_exception_throw_adhoc(tc,
                    "String corruption in string compare. Unknown string type.");
        }
        while (i < scanlen && blob32[i] == blob8[i]) {
            i++;
        }
    }
    /* If one of the strings was a strand or we encountered a differing character
     * while scanning in the loops above. */
    if (i < scanlen) {
        MVM_string_gi_init(tc, &gi_a, a);
        MVM_string_gi_init(tc, &gi_b, b);
        if (i) {
            MVM_string_gi_move_to(tc, &gi_a, i);
            MVM_string_gi_move_to(tc, &gi_b, i);
        }
    }
    for (; i < scanlen; i++) {
        MVMGrapheme32 g_a = MVM_string_gi_get_grapheme(tc, &gi_a);
        MVMGrapheme32 g_b = MVM_string_gi_get_grapheme(tc, &gi_b);
        if (g_a != g_b) {
            MVMint64 rtrn;
            /* If one of the deciding graphemes is a synthetic then we need to
             * iterate the codepoints inside it */
            if (g_a < 0 || g_b < 0) {
                MVMCodepointIter ci_a, ci_b;
                MVM_string_grapheme_ci_init(tc, &ci_a, g_a, 0);
                MVM_string_grapheme_ci_init(tc, &ci_b, g_b, 0);
                while (MVM_string_grapheme_ci_has_more(tc, &ci_a) && MVM_string_grapheme_ci_has_more(tc, &ci_b)) {
                    g_a = MVM_string_grapheme_ci_get_codepoint(tc, &ci_a);
                    g_b = MVM_string_grapheme_ci_get_codepoint(tc, &ci_b);
                    if (g_a != g_b)
                        break;
                }
                rtrn = g_a < g_b ? -1 :
                       g_b < g_a ?  1 :
                                    0 ;
                /* If we get here, all the codepoints in the synthetics have matched
                 * so go based on which has more codepoints left in that grapheme */
                if (!rtrn) {
                    MVMint32 a_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_a),
                             b_has_more = MVM_string_grapheme_ci_has_more(tc, &ci_b);

                    return a_has_more < b_has_more ? -1 :
                           b_has_more < a_has_more ?  1 :
                                                      0 ;
                }
                return rtrn;
            }
            return g_a < g_b ? -1 :
                   g_b < g_a ?  1 :
                                0 ;
        }
    }

    /* All shared chars equal, so go on length. */
    return alen < blen ? -1 :
           blen < alen ?  1 :
                          0 ;
}
#define nfg_ok(cp) ((cp) < MVM_NORMALIZE_FIRST_SIG_NFC)
/* BITOP is the operator that is done to the two strings. GO_FULL_LEN determines
 * if we do the operation for the longest strong. If GO_FULL_LEN == 0 then
 * it only goes to the end of the shortest string (we do this for the string AND
 * op, but the other ones go for the length of the longest string. */
#define MVM_STRING_BITOP(BITOP, GO_FULL_LEN, OP_DESC) \
    MVMString      *res    = NULL;\
    MVMGrapheme32  *buffer = NULL;\
    MVMStringIndex  alen, blen, sgraphs = 0;\
    size_t buf_size;\
    MVMCodepointIter ci_a, ci_b;\
    int nfg_is_safe = 1;\
    MVM_string_check_arg(tc, a, (OP_DESC));\
    MVM_string_check_arg(tc, b, (OP_DESC));\
\
    alen = MVM_string_graphs_nocheck(tc, a);\
    blen = MVM_string_graphs_nocheck(tc, b);\
    buf_size = blen < alen ? alen : blen;\
    buffer = MVM_malloc(sizeof(MVMGrapheme32) * buf_size);\
    MVM_string_ci_init(tc, &ci_a, a, 0, 0);\
    MVM_string_ci_init(tc, &ci_b, b, 0, 0);\
\
    /* First, binary-or up to the length of the shortest string. */\
    while (MVM_string_ci_has_more(tc, &ci_a) && MVM_string_ci_has_more(tc, &ci_b)) {\
        const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\
        const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\
        buffer[sgraphs++] = g_a BITOP g_b;\
        if (nfg_is_safe && (!nfg_ok(g_a) || !nfg_ok(g_b)))\
            nfg_is_safe = 0;\
        if (sgraphs == buf_size) {\
            buf_size += 16;\
            buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
        }\
    }\
    if (GO_FULL_LEN) {\
        while (MVM_string_ci_has_more(tc, &ci_a)) {\
            const MVMGrapheme32 g_a = MVM_string_ci_get_codepoint(tc, &ci_a);\
            buffer[sgraphs++] = g_a;\
            if (nfg_is_safe && !nfg_ok(g_a))\
                nfg_is_safe = 0;\
            if (sgraphs == buf_size) {\
                buf_size += 16;\
                buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
            }\
        }\
        while (MVM_string_ci_has_more(tc, &ci_b)) {\
            const MVMGrapheme32 g_b = MVM_string_ci_get_codepoint(tc, &ci_b);\
            buffer[sgraphs++] = g_b;\
            if (nfg_is_safe && !nfg_ok(g_b))\
                nfg_is_safe = 0;\
            if (sgraphs == buf_size) {\
                buf_size += 16;\
                buffer = MVM_realloc(buffer, buf_size * sizeof(MVMGrapheme32));\
            }\
        }\
    }\
\
    res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);\
    res->body.storage_type    = MVM_STRING_GRAPHEME_32;\
    res->body.storage.blob_32 = buffer;\
    res->body.num_graphs      = sgraphs;\
\
    res = nfg_is_safe ? res : re_nfg(tc, res);\
    STRAND_CHECK(tc, res);\
    return res;\

/* Takes two strings and AND's their characters. */
MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVM_STRING_BITOP( & , 0, "bitwise and")
}

/* Takes two strings and OR's their characters. */
MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVM_STRING_BITOP( | , 1, "bitwise or")
}
/* Takes two strings and XOR's their characters. */
MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVM_STRING_BITOP( ^ , 1, "bitwise xor");
}

/* Shortcuts for some unicode general category pvalues */
#define UPV_Nd MVM_UNICODE_PVALUE_GC_ND
#define UPV_Lu MVM_UNICODE_PVALUE_GC_LU
#define UPV_Ll MVM_UNICODE_PVALUE_GC_LL
#define UPV_Lt MVM_UNICODE_PVALUE_GC_LT
#define UPV_Lm MVM_UNICODE_PVALUE_GC_LM
#define UPV_Lo MVM_UNICODE_PVALUE_GC_LO
#define UPV_Zs MVM_UNICODE_PVALUE_GC_ZS
#define UPV_Zl MVM_UNICODE_PVALUE_GC_ZL
#define UPV_Pc MVM_UNICODE_PVALUE_GC_PC
#define UPV_Pd MVM_UNICODE_PVALUE_GC_PD
#define UPV_Ps MVM_UNICODE_PVALUE_GC_PS
#define UPV_Pe MVM_UNICODE_PVALUE_GC_PE
#define UPV_Pi MVM_UNICODE_PVALUE_GC_PI
#define UPV_Pf MVM_UNICODE_PVALUE_GC_PF
#define UPV_Po MVM_UNICODE_PVALUE_GC_PO

/* concatenating with "" ensures that only literal strings are accepted as argument. */
#define STR_WITH_LEN(str)  ("" str ""), (sizeof(str) - 1)

#include "strings/unicode_prop_macros.h"
/* Checks if the specified grapheme is in the given character class. */
MVMint64 MVM_string_grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) {
    /* If it's a synthetic, then grab the base codepoint. */
    MVMCodepoint cp;
    if (0 <= g)
        cp = (MVMCodepoint)g;
    else
        cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0];

    switch (cclass) {
        case MVM_CCLASS_ANY:
            return 1;

        case MVM_CCLASS_UPPERCASE:
            return MVM_unicode_codepoint_has_property_value(tc, cp,
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu);

        case MVM_CCLASS_LOWERCASE:
            return MVM_unicode_codepoint_has_property_value(tc, cp,
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll);

        case MVM_CCLASS_WORD:
            if (cp <= 'z') {  /* short circuit common case */
                if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9'))
                    return 1;
                else
                    return 0;
            }
            /* Deliberate fall-through; word is _ or digit or alphabetic. */

        case MVM_CCLASS_ALPHANUMERIC:
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
                return 1;
            if (MVM_unicode_codepoint_has_property_value(tc, cp,
                    MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd))
                return 1;
            /* Deliberate fall-through; alphanumeric is digit or alphabetic. */

        case MVM_CCLASS_ALPHABETIC:
            if (cp <= 'z') {  /* short circuit common case */
                if (cp >= 'a' || (cp >= 'A' && cp <= 'Z'))
                    return 1;
                else
                    return 0;
            }
            /* Property L covers Lo, Ll, Lu, Lt, Lm */
            return !!MVM_unicode_codepoint_get_property_int(tc, cp,
                MVM_UNICODE_PROPERTY_L);
            /* TODO: Maybe we want MVM_UNICODE_PROPERTY_ALPHABETIC instead? */

        case MVM_CCLASS_NUMERIC:
            if (cp <= '9' && cp >= '0')  /* short circuit common case */
                return 1;
            return MVM_unicode_codepoint_has_property_value(tc, cp,
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd);

        case MVM_CCLASS_HEXADECIMAL:
            return MVM_unicode_codepoint_has_property_value(tc, cp,
                MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1);

        case MVM_CCLASS_WHITESPACE:
            return MVM_CP_is_White_Space(cp);

        case MVM_CCLASS_BLANK:
            if (cp == '\t')
                return 1;
            return MVM_unicode_codepoint_has_property_value(tc, cp,
                MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs);

        case MVM_CCLASS_CONTROL: {
            return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160);
        }

        case MVM_CCLASS_PRINTING: {
            return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160));
        }

        case MVM_CCLASS_PUNCTUATION:
            return !!MVM_unicode_codepoint_get_property_int(tc, cp,
                MVM_UNICODE_PROPERTY_P);

        case MVM_CCLASS_NEWLINE: {
            if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
                cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
                return 1;
        }

        default:
            return 0;
    }
}

/* Checks if the character at the specified offset is a member of the
 * indicated character class. */
MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) {
    MVM_string_check_arg(tc, s, "is_cclass");
    if (MVM_UNLIKELY(offset < 0 || MVM_string_graphs_nocheck(tc, s) <= offset))
        return 0;
    return MVM_string_grapheme_is_cclass(tc, cclass, MVM_string_get_grapheme_at_nocheck(tc, s, offset));
}

/* Searches for the next char that is in the specified character class. */
MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
    MVMGraphemeIter gi;
    MVMint64        length, end, pos;

    MVM_string_check_arg(tc, s, "find_cclass");

    length = MVM_string_graphs_nocheck(tc, s);
    end    = offset + count;
    end = length < end ? length : end;
    if (offset < 0 || offset >= length)
        return end;

    MVM_string_gi_init(tc, &gi, s);
    MVM_string_gi_move_to(tc, &gi, offset);
    switch (cclass) {
        case MVM_CCLASS_WHITESPACE:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
                if (MVM_CP_is_White_Space(cp))
                    return pos;
            }
            break;
        case MVM_CCLASS_NEWLINE:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
                if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
                    cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
                    return pos;
            }
            break;
        default:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                if (MVM_string_grapheme_is_cclass(tc, cclass, g) > 0)
                    return pos;
            }
    }

    return end;
}

/* Searches for the next char that is not in the specified character class. */
MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
    MVMGraphemeIter gi;
    MVMint64        length, end, pos;

    MVM_string_check_arg(tc, s, "find_not_cclass");

    length = MVM_string_graphs_nocheck(tc, s);
    end    = offset + count;
    end = length < end ? length : end;
    if (offset < 0 || offset >= length)
        return end;

    MVM_string_gi_init(tc, &gi, s);
    MVM_string_gi_move_to(tc, &gi, offset);
    switch (cclass) {
        case MVM_CCLASS_WHITESPACE:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
                if (!MVM_CP_is_White_Space(cp))
                    return pos;
            }
            break;
        case MVM_CCLASS_NEWLINE:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
                if (!(cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
                    cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp)))
                    return pos;
            }
            break;
        default:
            for (pos = offset; pos < end; pos++) {
                MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
                if (!MVM_string_grapheme_is_cclass(tc, cclass, g))
                    return pos;
            }
    }

    return end;
}

static MVMint16   encoding_name_init         = 0;
static MVMString *encoding_utf8_name         = NULL;
static MVMString *encoding_ascii_name        = NULL;
static MVMString *encoding_latin1_name       = NULL;
static MVMString *encoding_utf16_name        = NULL;
static MVMString *encoding_windows1252_name  = NULL;
static MVMString *encoding_windows1251_name  = NULL;
static MVMString *encoding_shiftjis_name     = NULL;
static MVMString *encoding_utf8_c8_name      = NULL;
MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) {
    MVM_string_check_arg(tc, name, "find encoding");
    if (!encoding_name_init) {
        MVM_gc_allocate_gen2_default_set(tc);
        encoding_utf8_name        = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name");
        encoding_ascii_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name");
        encoding_latin1_name      = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name");
        encoding_utf16_name       = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name");
        encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name");
        encoding_windows1251_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1251");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1251_name, "Encoding name");
        encoding_shiftjis_name     = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-932");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_shiftjis_name, "Encoding name");
        encoding_utf8_c8_name     = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8");
        MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name");
        encoding_name_init   = 1;
        MVM_gc_allocate_gen2_default_clear(tc);
    }

    if (MVM_string_equal(tc, name, encoding_utf8_name)) {
        return MVM_encoding_type_utf8;
    }
    else if (MVM_string_equal(tc, name, encoding_ascii_name)) {
        return MVM_encoding_type_ascii;
    }
    else if (MVM_string_equal(tc, name, encoding_latin1_name)) {
        return MVM_encoding_type_latin1;
    }
    else if (MVM_string_equal(tc, name, encoding_windows1252_name)) {
        return MVM_encoding_type_windows1252;
    }
    else if (MVM_string_equal(tc, name, encoding_windows1251_name)) {
        return MVM_encoding_type_windows1251;
    }
    else if (MVM_string_equal(tc, name, encoding_utf16_name)) {
        return MVM_encoding_type_utf16;
    }
    else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) {
        return MVM_encoding_type_utf8_c8;
    }
    else if (MVM_string_equal(tc, name, encoding_shiftjis_name)) {
        return MVM_encoding_type_shiftjis;
    }
    else {
        char *c_name = MVM_string_utf8_encode_C_string(tc, name);
        char *waste[] = { c_name, NULL };
        MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'",
            c_name);
    }
}

/* Turns a codepoint into a string. If required uses the normalizer to ensure
 * that we get a valid NFG string (NFG is a superset of NFC, and singleton
 * decompositions exist). */
MVMString * MVM_string_chr(MVMThreadContext *tc, MVMint64 cp) {
    MVMString *s = NULL;
    MVMGrapheme32 g;

    if (cp < 0)
        MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative");
    /* If the codepoint decomposes we may need to normalize it.
     * The first cp that decomposes is U+0340, but to be on the safe side
     * for now we go with the first significant character which at the time
     * of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */
    if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC
        && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE)
        != MVM_UNICODE_PVALUE_DT_NONE) {

        MVMNormalizer norm;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) {
            MVM_unicode_normalizer_eof(tc, &norm);
            g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
        }
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        g = (MVMGrapheme32) cp;
    }

    s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString));
    if (can_fit_into_8bit(g)) {
        s->body.storage_type       = MVM_STRING_GRAPHEME_8;
        s->body.storage.blob_8     = MVM_malloc(sizeof(MVMGrapheme8));
        s->body.storage.blob_8[0]  = g;
    } else {
        s->body.storage_type       = MVM_STRING_GRAPHEME_32;
        s->body.storage.blob_32    = MVM_malloc(sizeof(MVMGrapheme32));
        s->body.storage.blob_32[0] = g;
    }
    s->body.num_graphs         = 1;
    return s;
}

/* Takes a string and computes a hash code for it, storing it in the hash code
 * cache field of the string. Hashing code is derived from the Jenkins hash
 * implementation in uthash.h. */
typedef union {
    MVMint32 graphs[3];
    unsigned char bytes[12];
} MVMJenHashGraphemeView;

MVM_STATIC_INLINE void MVM_hash_add_three (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv) {
    *hj_i  += hash_block->graphs[0];
    *hj_j  += hash_block->graphs[1];
    *hashv += hash_block->graphs[2];
    HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
}
MVM_STATIC_INLINE void MVM_hash_finish (MVMJenHashGraphemeView *hash_block, MVMuint32 *hj_i, MVMuint32 *hj_j, MVMuint32 *hashv, MVMStringIndex sgraphs, MVMStringIndex graphs_remaining) {
    /* Mix in key length (in bytes, not graphemes). */
    *hashv += sgraphs * sizeof(MVMGrapheme32);

    /* Now handle trailing graphemes (must be 2, 1, or 0). */
    /* NOTE: this is weird since it changes the order in different cases. This
     * is just replicating old functionality. */
    switch (graphs_remaining) {
        case 2:
            *hj_j += hash_block->graphs[0];
            *hj_i += hash_block->graphs[1];
            break;
        /* Fallthrough */
        case 1:
            *hj_i += hash_block->graphs[0];
    }
    HASH_JEN_MIX(*hj_i, *hj_j, *hashv);
    /* Because we check if MVMString->body.cached_hash_code == 0 to tell if
     * we have not yet computed the hash code, ensure that hashv is never 0
     * by adding the length of the string to hashv iff hashv == 0. Since both
     * the hashv and MVMStringIndex are both uint32, there should never be any
     * overflow. Only problematic case is if the string is of length 0 and
     * hashv is zero, though this is very very unlikely (if possible at all)
     * and it should be very fast to calculate the hash so as to be negligible. */
    if (*hashv == 0) {
        *hashv += sgraphs;
    }
}
void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) {
    /* The hash algorithm works in bytes. Since we can represent strings in a
     * number of ways, and we want consistent hashing, then we'll read the
     * strings using the grapheme iterator in groups of 3, using 32-bit ints
     * for the graphemes no matter what the string really holds them as. Then
     * we'll use the bytes view of that in the hashing function. */

    MVMStringIndex graphs_remaining, sgraphs;

    /* Initialize hash state. */
    MVMhashv hashv = tc->instance->hashSecret;
    MVMuint32 hj_i, hj_j;
    hj_i = hj_j = 0x9e3779b9;
    graphs_remaining = sgraphs = MVM_string_graphs(tc, s);

    switch (s->body.storage_type) {
        case MVM_STRING_GRAPHEME_ASCII:
        case MVM_STRING_GRAPHEME_8: {
            int i;
            MVMJenHashGraphemeView hash_block;
            for (i = 0; 3 <= sgraphs - i; i += 3) {
                hash_block.graphs[0] = s->body.storage.blob_8[i];
                hash_block.graphs[1] = s->body.storage.blob_8[i+1];
                hash_block.graphs[2] = s->body.storage.blob_8[i+2];
                MVM_hash_add_three(
                    &hash_block,
                    &hj_i, &hj_j, &hashv);
            }
            graphs_remaining = sgraphs - i;
            switch (graphs_remaining) {
                case 1:
                    hash_block.graphs[0] = s->body.storage.blob_8[i];
                    break;
                case 2:
                    hash_block.graphs[0] = s->body.storage.blob_8[i];
                    hash_block.graphs[1] = s->body.storage.blob_8[i+1];
                    break;
            }
            MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
            break;
        }
        case MVM_STRING_GRAPHEME_32: {
            int i;
            for (i = 0; 3 <= sgraphs - i; i += 3) {
                MVM_hash_add_three(
                    (MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i),
                    &hj_i, &hj_j, &hashv);
            }
            graphs_remaining = sgraphs - i;
            MVM_hash_finish((MVMJenHashGraphemeView*)(s->body.storage.blob_32 + i), &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
            break;
        }
        default: {
            MVMGraphemeIter gi;
            MVMJenHashGraphemeView hash_block;
            /* Work through the string 3 graphemes at a time. */
            MVM_string_gi_init(tc, &gi, s);
            while (3 <= graphs_remaining) {
                hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
                hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
                hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi);
                MVM_hash_add_three(
                    &hash_block,
                    &hj_i, &hj_j, &hashv);
                graphs_remaining -= 3;
            }
            /* Now handle trailing graphemes (must be 2, 1, or 0). */
            switch (graphs_remaining) {
                case 1:
                    hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
                    break;
                case 2:
                    hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
                    hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
                    break;

            }
            MVM_hash_finish(&hash_block, &hj_i, &hj_j, &hashv, sgraphs, graphs_remaining);
        }
    }
    /* Store computed hash value. */
    s->body.cached_hash_code = hashv;
}

Coverage Report

Created: 2018-07-03 15:31