/home/travis/build/MoarVM/MoarVM/src/strings/normalize.h

Source (jump to first uncovered line)
/* Normalization modes. Numbers picked so that:
 *  - The LSB tells us whether to do compatibility (set) or canonical (clear)
 *    decomposition; tested with MVM_NORMALIZE_COMPAT_DECOMP.
 *  - The second bit tells us whether to do composition; tested with
 *    MVM_NORMALIZE_COMPOSE.
 *  - The third bit tells us to go a step further and create synthetic codes
 *    for graphemes; tested with MVM_NORMALIZE_GRAPHEME.
 */
typedef enum {
    MVM_NORMALIZE_NFD   = 0,    /* no bits set */
    MVM_NORMALIZE_NFKD  = 1,    /* compatibility bit */
    MVM_NORMALIZE_NFC   = 2,    /* compose bit */
    MVM_NORMALIZE_NFKC  = 3,    /* compatibility + compose bits */
    MVM_NORMALIZE_NFG   = 6     /* compose + grapheme bits */
} MVMNormalization;

/* Ways of checking various properties of the normalization form. Each macro
 * evaluates to a non-zero value when the corresponding bit of "form" is set
 * and to zero otherwise. The argument is parenthesized so that an expression
 * such as (a | b) is masked as a whole rather than being torn apart by
 * operator precedence. */
#define MVM_NORMALIZE_COMPAT_DECOMP(form) ((form) & 1)
#define MVM_NORMALIZE_COMPOSE(form)       ((form) & 2)
#define MVM_NORMALIZE_GRAPHEME(form)      ((form) & 4)

/* First codepoint where we have to actually do a real check and maybe some
 * work when normalizing; there is one threshold per normalization form.
 * NOTE(review): presumably the threshold matching the chosen form is what
 * ends up in MVMNormalizer.first_significant - confirm against the
 * normalizer setup code. */
#define MVM_NORMALIZE_FIRST_SIG_NFD     0x00C0
#define MVM_NORMALIZE_FIRST_SIG_NFC     0x0300
#define MVM_NORMALIZE_FIRST_SIG_NFKD    0x00A0
#define MVM_NORMALIZE_FIRST_SIG_NFKC    0x00A0

/* First codepoint with a non-zero canonical combining class. */
#define MVM_NORMALIZE_FIRST_NONZERO_CCC 0x300

/* Streaming Unicode normalizer structure. Codepoints are appended at
 * buffer_end; everything in [buffer_start, buffer_norm_end) has already been
 * normalized and may be handed out, which advances buffer_start. */
struct MVMNormalizer {
    /* What form of normalization are we doing? */
    MVMNormalization form;

    /* Current buffer of codepoints we're working to normalize. */
    MVMCodepoint *buffer;

    /* Size of the normalization buffer. */
    MVMint32 buffer_size;

    /* Start offset in the buffer where we're still processing. */
    MVMint32 buffer_start;

    /* End offset in the buffer, and where to add the next thing to process. */
    MVMint32 buffer_end;

    /* End offset in the buffer for things we've normalized and so can return. */
    MVMint32 buffer_norm_end;

    /* The first significant codepoint in this normalization form that we may
     * have to do something with. If we see two things beneath the limit in a
     * row then we know the first one below it is good to spit out. */
    MVMCodepoint first_significant;

    /* The quickcheck property for the normalization form in question. */
    MVMint32 quick_check_property;

    /* If we should translate the \r\n grapheme to \n (only applicable when
     * normalizing to NFG). */
    MVMint32 translate_newlines;

    /* While this is non-zero, MVM_unicode_normalizer_process_codepoint skips
     * its fast paths and always takes the full path.
     * NOTE(review): presumably records pending Prepend characters seen by
     * the full path - confirm against normalize.c. */
    MVMint32 prepend_buffer;

    /* Not read anywhere in this header.
     * NOTE(review): presumably tracks Regional Indicator state for grapheme
     * segmentation - confirm against normalize.c. */
    MVMint32 regional_indicator;

};

/* Guts-y functions, called by the API level ones below.
 *  - ..._process_codepoint_full is the slow path that
 *    MVM_unicode_normalizer_process_codepoint falls back to when none of its
 *    fast paths apply.
 *  - ..._process_codepoint_norm_terminator is what
 *    MVM_unicode_normalizer_process_codepoint calls for codepoints it treats
 *    as normalization terminators (Latin-1 control characters, 0xAD, and
 *    utf8-c8 synthetics).
 * Both share the in/out/return convention of
 * MVM_unicode_normalizer_process_codepoint, which returns their result
 * unchanged. */
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);
MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);

/* Feeds the codepoint "in" to the normalizer. When at least one normalized
 * codepoint can be produced immediately, the first of them is stored through
 * "out" and the return value is the number of codepoints now available,
 * counting the one just stored. When nothing can be produced yet, 0 is
 * returned. */
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
    /* \r only needs special care when producing NFG, where it may go on to
     * form a single grapheme with a following \n. */
    const MVMint32 cr_under_nfg = in == 0x0D && MVM_NORMALIZE_GRAPHEME(n->form);

    /* Latin-1 control characters (and 0xAD) terminate normalization: all that
     * was seen before them can be emitted in normalized form without taking
     * them into account. Negative values also land in this branch. */
    if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD) {
        if (MVM_UNLIKELY(in < 0)) {
            /* A negative value is a synthetic; only utf8-c8 synthetics are
             * expected to reach the normalizer. */
            if (MVM_LIKELY(MVM_nfg_get_synthetic_info(tc, in)->is_utf8_c8))
                return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
            MVM_exception_throw_adhoc(tc, "Internal error: encountered non-utf8-c8 synthetic during normalization");
        }
        if (!cr_under_nfg)
            return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
    }

    /* No fast path is possible for a codepoint at or above the first
     * significant one, nor for one that follows a prepend character. */
    if (in >= n->first_significant || n->prepend_buffer)
        return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);

    if (MVM_LIKELY(MVM_NORMALIZE_COMPOSE(n->form))) {
        /* Composition fast path: exactly one codepoint is buffered and both
         * it and the new one are below the threshold, so the buffered one can
         * be emitted and the new one takes its slot. Not applicable to \r
         * when normalizing to graphemes. */
        if (MVM_LIKELY(!cr_under_nfg)
                && n->buffer_end - n->buffer_start == 1
                && n->buffer[n->buffer_start] < n->first_significant) {
            *out = n->buffer[n->buffer_start];
            n->buffer[n->buffer_start] = in;
            return 1;
        }
    }
    else if (n->buffer_start == n->buffer_end) {
        /* Decomposition fast path: with nothing buffered, the codepoint is
         * passed straight through. */
        *out = in;
        return 1;
    }

    /* None of the shortcuts applied; take the slow path. */
    return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);
}

/* Grapheme version of MVM_unicode_normalizer_process_codepoint. Note that
 * this exists mostly for API clarity rather than adding any semantics; the
 * normalizer must be configured to produce NFG to get synthetics out.
 *
 * "in" is the codepoint to process; the first available grapheme, if any, is
 * written through "out". The return value is whatever
 * MVM_unicode_normalizer_process_codepoint returns: the number of items now
 * available including the one written, or 0 if none is ready yet. */
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint_to_grapheme(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMGrapheme32 *out) {
    /* The pointer cast below is only sound when both types have one size. */
    assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));
    /* Cast to the pointer type the callee actually declares for "out". */
    return MVM_unicode_normalizer_process_codepoint(tc, n, in, (MVMCodepoint *)out);
}

/* Push a number of codepoints into the "to normalize" buffer. "in" points to
 * the codepoints and "num_codepoints" says how many there are. Nothing is
 * handed back directly (the function returns void).
 * NOTE(review): presumably results are then fetched via
 * MVM_unicode_normalizer_available and the get functions - confirm. */
void MVM_unicode_normalizer_push_codepoints(MVMThreadContext *tc, MVMNormalizer *n, const MVMCodepoint *in, MVMint32 num_codepoints);

/* Reports how many normalized codepoints/graphemes can be fetched right now:
 * the distance from the read position to the end of the normalized region. */
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_available(MVMThreadContext *tc, MVMNormalizer *n) {
    const MVMint32 ready = n->buffer_norm_end - n->buffer_start;
    return ready;
}

/* Tells whether the normalizer's buffer holds nothing at all, normalized or
 * not: returns 1 when the read position has reached the end of the buffered
 * codepoints and 0 otherwise. */
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_empty(MVMThreadContext *tc, MVMNormalizer *n) {
    return n->buffer_start == n->buffer_end ? 1 : 0;
}

/* Indicate that we've reached the end of the input stream. Any codepoints
 * left to normalize now can be. Returns nothing itself.
 * NOTE(review): presumably the remaining output is then drained with
 * MVM_unicode_normalizer_available and the get functions - confirm. */
void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n);

/* Fetches the next normalized codepoint and advances the read position past
 * it. Only call this when output is known to be waiting, i.e. after
 * MVM_unicode_normalizer_process_codepoint returned more than 1 or
 * MVM_unicode_normalizer_available returned non-zero. Throws an ad-hoc
 * exception when the normalized region is empty. */
MVM_STATIC_INLINE MVMCodepoint MVM_unicode_normalizer_get_codepoint(MVMThreadContext *tc, MVMNormalizer *n) {
    MVMCodepoint next;
    if (n->buffer_start == n->buffer_norm_end)
        MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get codepoint");
    next = n->buffer[n->buffer_start];
    n->buffer_start++;
    return next;
}

/* Fetches the next normalized item as a grapheme and advances the read
 * position past it. This mirrors the codepoint getter and exists mostly for
 * API clarity; synthetics only come out if the normalizer was configured for
 * NFG. Throws an ad-hoc exception when the normalized region is empty. */
MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_get_grapheme(MVMThreadContext *tc, MVMNormalizer *n) {
    MVMGrapheme32 next;
    /* Buffer slots are MVMCodepoint; reading them as graphemes relies on the
     * two types having the same size. */
    assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));
    if (n->buffer_start == n->buffer_norm_end)
        MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get grapheme");
    next = (MVMGrapheme32)n->buffer[n->buffer_start];
    n->buffer_start += 1;
    return next;
}

/* Setup and teardown of the MVMNormalizer struct.
 * NOTE(review): MVM_unicode_normalizer_form presumably validates an integer
 * form code and maps it to an MVMNormalization, and
 * MVM_unicode_normalizer_translate_newlines presumably switches on the
 * translate_newlines flag - confirm against normalize.c. */
MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in);
void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNormalization norm);
void MVM_unicode_normalizer_translate_newlines(MVMThreadContext *tc, MVMNormalizer *n);
void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n);

/* High-level normalize implementation, working from an input array of
 * codepoints and producing an output array of codepoints. */
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, const MVMObject *in, MVMObject *out, MVMNormalization form);

/* High-level functions to produce an NFG string from an input array of
 * codepoints: the first takes the codepoints as an MVMObject, the second as
 * a C array "cp_v" together with its length "cp_count". */
MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext *tc,const MVMObject *codes);
MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext *tc, MVMCodepoint * cp_v, MVMint64 cp_count);

/* High-level function to produce an array of codepoints from a string. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out);

/* Minimal, fast decimal string to integer conversion. Every character up to
 * the terminating NUL is folded in as if it were a decimal digit: there is
 * no sign handling, no whitespace skipping, no validation and no overflow
 * check, so callers must only pass short strings of '0'..'9'. An empty
 * string yields 0. */
MVM_STATIC_INLINE MVMint32 fast_atoi( const char * dec_str ) {
    MVMint32 acc = 0;
    const char *cursor;
    for (cursor = dec_str; *cursor != '\0'; cursor++)
        acc = acc * 10 + (*cursor - '0');
    return acc;
}
/* Further helpers implemented outside this header. Each is declared exactly
 * once here.
 * NOTE(review): judging by the names only, MVM_unicode_relative_ccc reports
 * a canonical-combining-class-derived value for "cp",
 * MVM_unicode_normalize_should_break decides whether a grapheme break falls
 * between "a" and "b", and MVM_string_is_control_full tests whether "in" is
 * a control character - confirm against the implementations. */
MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp);
MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext *tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer *norm);
MVMint32 MVM_string_is_control_full(MVMThreadContext *tc, MVMCodepoint in);
/* Picks the grapheme to use for a \r\n line ending: a plain '\n' when the
 * normalizer has newline translation switched on, and the CRLF grapheme
 * supplied by MVM_nfg_crlf_grapheme otherwise. */
MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_translated_crlf(MVMThreadContext *tc, MVMNormalizer *n) {
    if (n->translate_newlines)
        return '\n';
    return MVM_nfg_crlf_grapheme(tc);
}

Coverage Report

Created: 2018-07-03 15:31

Line	Count	Source (jump to first uncovered line)
1		/* Normalization modes. Numbers picked so that:
2		* - The LSB tells us whether to do canonical or compatibility normalization
3		* - The second bit tells us whether to do canonical normalization
4		* - The third bit tells us to go a step further and create synthetic codes
5		* for graphemes.
6		*/
7		typedef enum {
8		MVM_NORMALIZE_NFD = 0,
9		MVM_NORMALIZE_NFKD = 1,
10		MVM_NORMALIZE_NFC = 2,
11		MVM_NORMALIZE_NFKC = 3,
12		MVM_NORMALIZE_NFG = 6
13		} MVMNormalization;
14
15		/* Ways of checking various properties of the normalization form. */
16		#define MVM_NORMALIZE_COMPAT_DECOMP(form) (form & 1)
17		#define MVM_NORMALIZE_COMPOSE(form) (form & 2)
18		#define MVM_NORMALIZE_GRAPHEME(form) (form & 4)
19
20		/* First codepoint where we have to actually do a real check and maybe some
21		* work when normalizing. */
22		#define MVM_NORMALIZE_FIRST_SIG_NFD 0x00C0
23		#define MVM_NORMALIZE_FIRST_SIG_NFC 0x0300
24		#define MVM_NORMALIZE_FIRST_SIG_NFKD 0x00A0
25		#define MVM_NORMALIZE_FIRST_SIG_NFKC 0x00A0
26
27		/* First codepoint with a non-zero canonical combining class. */
28		#define MVM_NORMALIZE_FIRST_NONZERO_CCC 0x300
29
30		/* Streaming Unicode normalizer structure. */
31		struct MVMNormalizer {
32		/* What form of normalization are we doing? */
33		MVMNormalization form;
34
35		/* Current buffer of codepoints we're working to normalize. */
36		MVMCodepoint *buffer;
37
38		/* Size of the normalization buffer. */
39		MVMint32 buffer_size;
40
41		/* Start offset in the buffer where we're still processing. */
42		MVMint32 buffer_start;
43
44		/* End offset in the buffer, and where to add the next thing to process. */
45		MVMint32 buffer_end;
46
47		/* End offset in the buffer for things we've normalized and so can return. */
48		MVMint32 buffer_norm_end;
49
50		/* The first significant codepoint in this normalization form that we may
51		* have to do something with. If we see two things beneath the limit in a
52		* row then we know the first one below it is good to spit out. */
53		MVMCodepoint first_significant;
54
55		/* The quickcheck property for the normalization form in question. */
56		MVMint32 quick_check_property;
57
58		/* If we should translate the \r\n grapheme to \n (only applicable when
59		* normalizing to NFG). */
60		MVMint32 translate_newlines;
61
62		MVMint32 prepend_buffer;
63
64		MVMint32 regional_indicator;
65
66		};
67
68		/* Guts-y functions, called by the API level ones below. */
69		MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext tc, MVMNormalizer n, MVMCodepoint in, MVMCodepoint *out);
70		MVMint32 MVM_unicode_normalizer_process_codepoint_norm_terminator(MVMThreadContext tc, MVMNormalizer n, MVMCodepoint in, MVMCodepoint *out);
71
72		/* Takes a codepoint to process for normalization as the "in" parameter. If we
73		* are able to produce one or more normalized codepoints right off, then we
74		* put it into the location pointed to by "out", and return the number of
75		* codepoints now available including the one we just passed out. If we can't
76		* produce a normalized codepoint right now, we return a 0. */
77	0	MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext tc, MVMNormalizer n, MVMCodepoint in, MVMCodepoint *out) {
78	0	/* Control characters in the Latin-1 range are normalization terminators -
79	0	* that is, we know we can spit out whatever codepoints we have seen so
80	0	* far in normalized form without having to consider them into the
81	0	* normalization process. The exception is if we're computing NFG, and
82	0	* we got \r, which can form a grapheme in the case of \r\n. */
83	0	if (in < 0x20 || (0x7F <= in && in <= 0x9F) || in == 0xAD) {
84	0	/* For utf8-c8 synthetic graphemes. May be able to be removed after
85	0	* changing and further testing of the TODO marked below. */
86	0	if (MVM_UNLIKELY(in < 0)) {
87	0	if (MVM_LIKELY(MVM_nfg_get_synthetic_info(tc, in)->is_utf8_c8))
88	0	return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
89	0	MVM_exception_throw_adhoc(tc, "Internal error: encountered non-utf8-c8 synthetic during normalization");
90	0	}
91	0	/* If in isn't \r */
92	0	if (in != 0x0D || !MVM_NORMALIZE_GRAPHEME(n->form))
93	0	return MVM_unicode_normalizer_process_codepoint_norm_terminator(tc, n, in, out);
94	0	}
95	0
96	0	/* Fast-paths apply when the codepoint to consider is too low to have any
97	0	* interesting properties in the target normalization form AND
98	0	* it doesn't follow a prepend character */
99	0	if (in < n->first_significant && !n->prepend_buffer) {
100	0	if (MVM_LIKELY(MVM_NORMALIZE_COMPOSE(n->form))) {
101	0	/* For the composition fast path we always have to know that we've
102	0	* seen two codepoints in a row that are below those needing a full
103	0	* check. Then we can spit out the first one. Exception: we are
104	0	* normalizing to graphemes and see \r. */
105	0	if (MVM_LIKELY(in != 0x0D || !MVM_NORMALIZE_GRAPHEME(n->form))) {
106	0	if (n->buffer_end - n->buffer_start == 1) {
107	0	if (n->buffer[n->buffer_start] < n->first_significant) {
108	0	*out = n->buffer[n->buffer_start];
109	0	n->buffer[n->buffer_start] = in;
110	0	return 1;
111	0	}
112	0	}
113	0	}
114	0	}
115	0	else {
116	0	/* For decomposition fast-path, the buffer should be empty. In
117	0	* that case, we just hand back what we got. */
118	0	if (n->buffer_start == n->buffer_end) {
119	0	*out = in;
120	0	return 1;
121	0	}
122	0	}
123	0	}
124	0	/* Fall back to slow path. */
125	0	return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);
126	0	}
127
128		/* Grapheme version of the above. Note that this exists mostly for API clarity
129		* rather than adding any semantics; the normalizer must be configured to
130		* produce NFG to get synthetics out. */
131	0	MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint_to_grapheme(MVMThreadContext tc, MVMNormalizer n, MVMCodepoint in, MVMGrapheme32 *out) {
132	0	assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));
133	0	return MVM_unicode_normalizer_process_codepoint(tc, n, in, (MVMGrapheme32 *)out);
134	0	}
135
136		/* Push a number of codepoints into the "to normalize" buffer. */
137		void MVM_unicode_normalizer_push_codepoints(MVMThreadContext tc, MVMNormalizer n, const MVMCodepoint *in, MVMint32 num_codepoints);
138
139		/* Get the number of codepoints/graphemes ready to fetch. */
140	0	MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_available(MVMThreadContext tc, MVMNormalizer n) {
141	0	return n->buffer_norm_end - n->buffer_start;
142	0	}
143
144		/* Get the number of codepoints/graphemes ready to fetch. */
145	0	MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_empty(MVMThreadContext tc, MVMNormalizer n) {
146	0	return n->buffer_end == n->buffer_start;
147	0	}
148
149		/* Indicate that we've reached the end of the input stream. Any codepoints
150		* left to normalize now can be. */
151		void MVM_unicode_normalizer_eof(MVMThreadContext tc, MVMNormalizer n);
152
153		/* Get a normalized codepoint; should only ever be called if there are some
154		* known to be available, either because normalize_to_codepoint returned a
155		* value greater than 1, or normalize_available returned a non-zero value. */
156	0	MVM_STATIC_INLINE MVMCodepoint MVM_unicode_normalizer_get_codepoint(MVMThreadContext tc, MVMNormalizer n) {
157	0	if (n->buffer_norm_end == n->buffer_start)
158	0	MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get codepoint");
159	0	return n->buffer[n->buffer_start++];
160	0	}
161
162		/* Grapheme version of the above. Note that this exists mostly for API clarity
163		* rather than adding any semantics; the normalizer must be configured to
164		* produce NFG to get synthetics out. */
165	0	MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_get_grapheme(MVMThreadContext tc, MVMNormalizer n) {
166	0	assert(sizeof(MVMCodepoint) == sizeof(MVMGrapheme32));
167	0	if (n->buffer_norm_end == n->buffer_start)
168	0	MVM_exception_throw_adhoc(tc, "Normalization: illegal call to get grapheme");
169	0	return (MVMGrapheme32)n->buffer[n->buffer_start++];
170	0	}
171
172		/* Setup and teardown of the MVMNormalizer struct. */
173		MVMNormalization MVM_unicode_normalizer_form(MVMThreadContext *tc, MVMint64 form_in);
174		void MVM_unicode_normalizer_init(MVMThreadContext tc, MVMNormalizer n, MVMNormalization norm);
175		void MVM_unicode_normalizer_translate_newlines(MVMThreadContext tc, MVMNormalizer n);
176		void MVM_unicode_normalizer_cleanup(MVMThreadContext tc, MVMNormalizer n);
177
178		/* High-level normalize implementation, working from an input array of
179		* codepoints and producing an output array of codepoints. */
180		void MVM_unicode_normalize_codepoints(MVMThreadContext tc, const MVMObject in, MVMObject *out, MVMNormalization form);
181
182		/* High-level function to produces an NFG string from an input array of
183		* codepoints. */
184		MVMString * MVM_unicode_codepoints_to_nfg_string(MVMThreadContext tc,const MVMObject codes);
185		MVMString * MVM_unicode_codepoints_c_array_to_nfg_string(MVMThreadContext tc, MVMCodepoint cp_v, MVMint64 cp_count);
186
187		/* High-level function to produce an array of codepoints from a string. */
188		void MVM_unicode_string_to_codepoints(MVMThreadContext tc, MVMString s, MVMNormalization form, MVMObject *out);
189
190		/* faster atoi function */
191	0	MVM_STATIC_INLINE MVMint32 fast_atoi( const char * dec_str ) {
192	0	MVMint32 value = 0;
193	0	while( *dec_str ) {
194	0	value = value*10 + (*dec_str++ - '0');
195	0	}
196	0	return value;
197	0	}
198		MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp);
199		MVMint32 MVM_unicode_normalize_should_break(MVMThreadContext tc, MVMCodepoint a, MVMCodepoint b, MVMNormalizer norm);
200		MVMint64 MVM_unicode_relative_ccc(MVMThreadContext *tc, MVMCodepoint cp);
201		MVMint32 MVM_string_is_control_full(MVMThreadContext *tc, MVMCodepoint in);
202		/* Function for choosing the appropriate line-ending grapheme depending on if
203		* newline translation is enabled. */
204	0	MVM_STATIC_INLINE MVMGrapheme32 MVM_unicode_normalizer_translated_crlf(MVMThreadContext tc, MVMNormalizer n) {
205	0	return n->translate_newlines
206	0	? '\n'
207	0	: MVM_nfg_crlf_grapheme(tc);
208	0	}