C API: 8-bit Unicode handling macros. More...

#include <stdbool.h>
#include "unicode/umachine.h"
#include "unicode/utf.h"

Macros
#define	U8_COUNT_TRAIL_BYTES(leadByte)
	Counts the trail bytes for a UTF-8 lead byte. More...

#define	U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))
	Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. More...

#define	U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
	Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. More...

#define	U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
	Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1. More...

#define	U8_IS_VALID_LEAD3_AND_T1(lead, t1) (U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))
	Internal 3-byte UTF-8 validity check. More...

#define	U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
	Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1. More...

#define	U8_IS_VALID_LEAD4_AND_T1(lead, t1) (U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))
	Internal 4-byte UTF-8 validity check. More...

#define	U8_IS_SINGLE(c) (((c)&0x80)==0)
	Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? More...

#define	U8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
	Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) More...

#define	U8_IS_TRAIL(c) ((int8_t)(c)<-0x40)
	Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF) More...

#define	U8_LENGTH(c)
	How many code units (bytes) are used for the UTF-8 encoding of this Unicode code point? More...

#define	U8_MAX_LENGTH 4
	The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). More...

#define	U8_GET_UNSAFE(s, i, c)
	Get a code point from a string at a random-access offset, without changing the offset. More...

#define	U8_GET(s, start, i, length, c)
	Get a code point from a string at a random-access offset, without changing the offset. More...

#define	U8_GET_OR_FFFD(s, start, i, length, c)
	Get a code point from a string at a random-access offset, without changing the offset. More...

#define	U8_NEXT_UNSAFE(s, i, c)
	Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary. More...

#define	U8_NEXT(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)
	Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary. More...

#define	U8_NEXT_OR_FFFD(s, i, length, c) U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)
	Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary. More...

#define	U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, sub)

#define	U8_APPEND_UNSAFE(s, i, c)
	Append a code point to a string, overwriting 1 to 4 bytes. More...

#define	U8_APPEND(s, i, capacity, c, isError)
	Append a code point to a string, overwriting 1 to 4 bytes. More...

#define	U8_FWD_1_UNSAFE(s, i)
	Advance the string offset from one code point boundary to the next. More...

#define	U8_FWD_1(s, i, length)
	Advance the string offset from one code point boundary to the next. More...

#define	U8_FWD_N_UNSAFE(s, i, n)
	Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. More...

#define	U8_FWD_N(s, i, length, n)
	Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points. More...

#define	U8_SET_CP_START_UNSAFE(s, i)
	Adjust a random-access offset to a code point boundary at the start of a code point. More...

#define	U8_SET_CP_START(s, start, i)
	Adjust a random-access offset to a code point boundary at the start of a code point. More...

#define	U8_TRUNCATE_IF_INCOMPLETE(s, start, length)
	If the string ends with a UTF-8 byte sequence that is valid so far but incomplete, then reduce the length of the string to end before the lead byte of that incomplete sequence. More...

#define	U8_PREV_UNSAFE(s, i, c)
	Move the string offset from one code point boundary to the previous one and get the code point between them. More...

#define	U8_PREV(s, start, i, c)
	Move the string offset from one code point boundary to the previous one and get the code point between them. More...

#define	U8_PREV_OR_FFFD(s, start, i, c)
	Move the string offset from one code point boundary to the previous one and get the code point between them. More...

#define	U8_BACK_1_UNSAFE(s, i)
	Move the string offset from one code point boundary to the previous one. More...

#define	U8_BACK_1(s, start, i)
	Move the string offset from one code point boundary to the previous one. More...

#define	U8_BACK_N_UNSAFE(s, i, n)
	Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. More...

#define	U8_BACK_N(s, start, i, n)
	Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points. More...

#define	U8_SET_CP_LIMIT_UNSAFE(s, i)
	Adjust a random-access offset to a code point boundary after a code point. More...

#define	U8_SET_CP_LIMIT(s, start, i, length)
	Adjust a random-access offset to a code point boundary after a code point. More...

Functions
U_CAPI UChar32	utf8_nextCharSafeBody (const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict)
	Function for handling "next code point" with error-checking. More...

U_CAPI int32_t	utf8_appendCharSafeBody (uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError)
	Function for handling "append code point" with error-checking. More...

U_CAPI UChar32	utf8_prevCharSafeBody (const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict)
	Function for handling "previous code point" with error-checking. More...

U_CAPI int32_t	utf8_back1SafeBody (const uint8_t *s, int32_t start, int32_t i)
	Function for handling "skip backward one code point" with error-checking. More...

Detailed Description

C API: 8-bit Unicode handling macros.

This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.

For more information see utf.h and the ICU User Guide Strings chapter (https://unicode-org.github.io/icu/userguide/strings).

Usage: ICU coding guidelines for if() statements should be followed when using these macros. Compound statements (curly braces {}) must be used for if-else-while... bodies, and every macro statement should be terminated with a semicolon.

Definition in file utf8.h.

Macro Definition Documentation

◆ U8_APPEND

#define U8_APPEND	(	s,
		i,
		capacity,
		c,
		isError
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else if(__uc<=0x7ff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if((__uc<=0xd7ff || (0xe000<=__uc && __uc<=0xffff)) && (i)+2<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else if(0xffff<__uc && __uc<=0x10ffff && (i)+3<(capacity)) { \
        (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } else { \
        (isError)=true; \
    } \
} UPRV_BLOCK_MACRO_END

Append a code point to a string, overwriting 1 to 4 bytes.

The offset points to the current end of the string contents and is advanced (post-increment). "Safe" macro, checks for a valid code point. If a non-ASCII code point is written, checks for sufficient space in the string. If the code point is not valid or trail bytes do not fit, then isError is set to true.

Parameters

s	uint8_t * string buffer (written to by this macro, so it must not be const)
i	int32_t string offset, must be i<capacity
capacity	int32_t size of the string buffer
c	UChar32 code point to append
isError	output UBool set to true if an error occurs, otherwise not modified

See also: U8_APPEND_UNSAFE

Stable:: ICU 2.4

Definition at line 459 of file utf8.h.

◆ U8_APPEND_UNSAFE

#define U8_APPEND_UNSAFE	(	s,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    uint32_t __uc=(c); \
    if(__uc<=0x7f) { \
        (s)[(i)++]=(uint8_t)__uc; \
    } else { \
        if(__uc<=0x7ff) { \
            (s)[(i)++]=(uint8_t)((__uc>>6)|0xc0); \
        } else { \
            if(__uc<=0xffff) { \
                (s)[(i)++]=(uint8_t)((__uc>>12)|0xe0); \
            } else { \
                (s)[(i)++]=(uint8_t)((__uc>>18)|0xf0); \
                (s)[(i)++]=(uint8_t)(((__uc>>12)&0x3f)|0x80); \
            } \
            (s)[(i)++]=(uint8_t)(((__uc>>6)&0x3f)|0x80); \
        } \
        (s)[(i)++]=(uint8_t)((__uc&0x3f)|0x80); \
    } \
} UPRV_BLOCK_MACRO_END

Append a code point to a string, overwriting 1 to 4 bytes.

The offset points to the current end of the string contents and is advanced (post-increment). "Unsafe" macro, assumes a valid code point and sufficient space in the string. Otherwise, the result is undefined.

Parameters

s	uint8_t * string buffer (written to by this macro, so it must not be const)
i	string offset
c	code point to append

See also: U8_APPEND

Stable:: ICU 2.4

Definition at line 422 of file utf8.h.

◆ U8_BACK_1

#define U8_BACK_1	(	s,
		start,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    if(U8_IS_TRAIL((s)[--(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the previous one.

(Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
i	int32_t string offset, must be start<i

See also: U8_BACK_1_UNSAFE

Stable:: ICU 2.4

Definition at line 791 of file utf8.h.

◆ U8_BACK_1_UNSAFE

#define U8_BACK_1_UNSAFE	(	s,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    while(U8_IS_TRAIL((s)[--(i)])) {} \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the previous one.

(Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset

See also: U8_BACK_1

Stable:: ICU 2.4

Definition at line 775 of file utf8.h.

◆ U8_BACK_N

#define U8_BACK_N	(	s,
		start,
		i,
		n
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U8_BACK_1(s, start, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points.

(Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.

Parameters

s	const uint8_t * string
start	int32_t index of the start of the string
i	int32_t string offset, must be start<i
n	number of code points to skip

See also: U8_BACK_N_UNSAFE

Stable:: ICU 2.4

Definition at line 832 of file utf8.h.

◆ U8_BACK_N_UNSAFE

#define U8_BACK_N_UNSAFE	(	s,
		i,
		n
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the n-th one before it, i.e., move backward by n code points.

(Pre-decrementing backward iteration.) The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset
n	number of code points to skip

See also: U8_BACK_N

Stable:: ICU 2.4

Definition at line 810 of file utf8.h.

◆ U8_COUNT_TRAIL_BYTES

#define U8_COUNT_TRAIL_BYTES ( leadByte )

Value:

(U8_IS_LEAD(leadByte) ? \
    ((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0)+1 : 0)

References: U8_IS_LEAD(c) — Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4) Definition: utf8.h:181

Counts the trail bytes for a UTF-8 lead byte.

Returns 0 for 0..0xc1 as well as for 0xf5..0xff. leadByte might be evaluated multiple times.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable.

Parameters

leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.

Internal:: Do not use. This API is for internal use only.

Definition at line 56 of file utf8.h.

◆ U8_COUNT_TRAIL_BYTES_UNSAFE

#define U8_COUNT_TRAIL_BYTES_UNSAFE ( leadByte ) (((uint8_t)(leadByte)>=0xc2)+((uint8_t)(leadByte)>=0xe0)+((uint8_t)(leadByte)>=0xf0))

Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence.

Returns 0 for 0..0xc1. Undefined for 0xf5..0xff. leadByte might be evaluated multiple times.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable.

Parameters

leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.

Internal:: Do not use. This API is for internal use only.

Definition at line 71 of file utf8.h.

◆ U8_FWD_1

#define U8_FWD_1	(	s,
		i,
		length
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    uint8_t __b=(s)[(i)++]; \
    if(U8_IS_LEAD(__b) && (i)!=(length)) { \
        uint8_t __t1=(s)[i]; \
        if((0xe0<=__b && __b<0xf0)) { \
            if(U8_IS_VALID_LEAD3_AND_T1(__b, __t1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } else if(__b<0xe0) { \
            if(U8_IS_TRAIL(__t1)) { \
                ++(i); \
            } \
        } else /* c>=0xf0 */ { \
            if(U8_IS_VALID_LEAD4_AND_T1(__b, __t1) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i]) && \
                    ++(i)!=(length) && U8_IS_TRAIL((s)[i])) { \
                ++(i); \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

Advance the string offset from one code point boundary to the next.

(Post-incrementing iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The length can be negative for a NUL-terminated string.

Parameters

s	const uint8_t * string
i	int32_t string offset, must be i<length
length	int32_t string length

See also: U8_FWD_1_UNSAFE

Stable:: ICU 2.4

Definition at line 507 of file utf8.h.

◆ U8_FWD_1_UNSAFE

#define U8_FWD_1_UNSAFE	(	s,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((s)[i]); \
} UPRV_BLOCK_MACRO_END

Advance the string offset from one code point boundary to the next.

(Post-incrementing iteration.) "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset

See also: U8_FWD_1

Stable:: ICU 2.4

Definition at line 490 of file utf8.h.

◆ U8_FWD_N

#define U8_FWD_N	(	s,
		i,
		length,
		n
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U8_FWD_1(s, i, length); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points.

(Post-incrementing iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The length can be negative for a NUL-terminated string.

Parameters

s	const uint8_t * string
i	int32_t string offset, must be i<length
length	int32_t string length
n	number of code points to skip

See also: U8_FWD_N_UNSAFE

Stable:: ICU 2.4

Definition at line 565 of file utf8.h.

◆ U8_FWD_N_UNSAFE

#define U8_FWD_N_UNSAFE	(	s,
		i,
		n
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t __N=(n); \
    while(__N>0) { \
        U8_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
} UPRV_BLOCK_MACRO_END

Advance the string offset from one code point boundary to the n-th next one, i.e., move forward by n code points.

(Post-incrementing iteration.) "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset
n	number of code points to skip

See also: U8_FWD_N

Stable:: ICU 2.4

Definition at line 542 of file utf8.h.

◆ U8_GET

#define U8_GET	(	s,
		start,
		i,
		length,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_index=(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END

Get a code point from a string at a random-access offset, without changing the offset.

The offset may point to either the lead byte or one of the trail bytes for a code point, in which case the macro will read all of the bytes for the code point.

The length can be negative for a NUL-terminated string.

If the offset points to an illegal UTF-8 byte sequence, then c is set to a negative value. Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.

Parameters

s	const uint8_t * string
start	int32_t starting string offset
i	int32_t string offset, must be start<=i<length
length	int32_t string length
c	output UChar32 variable, set to <0 in case of an error

See also: U8_GET_UNSAFE

Stable:: ICU 2.4

Definition at line 260 of file utf8.h.

◆ U8_GET_OR_FFFD

#define U8_GET_OR_FFFD	(	s,
		start,
		i,
		length,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_index=(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \
} UPRV_BLOCK_MACRO_END

Get a code point from a string at a random-access offset, without changing the offset.

The offset may point to either the lead byte or one of the trail bytes for a code point, in which case the macro will read all of the bytes for the code point.

The length can be negative for a NUL-terminated string.

If the offset points to an illegal UTF-8 byte sequence, then c is set to U+FFFD. Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD.

This macro does not distinguish between a real U+FFFD in the text and U+FFFD returned for an ill-formed sequence. Use U8_GET() if that distinction is important.

Parameters

s	const uint8_t * string
start	int32_t starting string offset
i	int32_t string offset, must be start<=i<length
length	int32_t string length
c	output UChar32 variable, set to U+FFFD in case of an error

See also: U8_GET

Stable:: ICU 51

Definition at line 291 of file utf8.h.

◆ U8_GET_UNSAFE

#define U8_GET_UNSAFE	(	s,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    int32_t _u8_get_unsafe_index=(int32_t)(i); \
    U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \
    U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \
} UPRV_BLOCK_MACRO_END

Get a code point from a string at a random-access offset, without changing the offset.

The offset may point to either the lead byte or one of the trail bytes for a code point, in which case the macro will read all of the bytes for the code point. The result is undefined if the offset points to an illegal UTF-8 byte sequence. Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.

Parameters

s	const uint8_t * string
i	string offset
c	output UChar32 variable

See also: U8_GET

Stable:: ICU 2.4

Definition at line 233 of file utf8.h.

◆ U8_INTERNAL_NEXT_OR_SUB

#define U8_INTERNAL_NEXT_OR_SUB	(	s,
		i,
		length,
		c,
		sub
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        uint8_t __t = 0; \
        if((i)!=(length) && \
            /* fetch/validate/assemble all but last trail byte */ \
            ((c)>=0xe0 ? \
                ((c)<0xf0 ?  /* U+0800..U+FFFF except surrogates */ \
                    U8_LEAD3_T1_BITS[(c)&=0xf]&(1<<((__t=(s)[i])>>5)) && \
                    (__t&=0x3f, 1) \
                :  /* U+10000..U+10FFFF */ \
                    ((c)-=0xf0)<=4 && \
                    U8_LEAD4_T1_BITS[(__t=(s)[i])>>4]&(1<<(c)) && \
                    ((c)=((c)<<6)|(__t&0x3f), ++(i)!=(length)) && \
                    (__t=(s)[i]-0x80)<=0x3f) && \
                /* valid second-to-last trail byte */ \
                ((c)=((c)<<6)|__t, ++(i)!=(length)) \
            :  /* U+0080..U+07FF */ \
                (c)>=0xc2 && ((c)&=0x1f, 1)) && \
            /* last trail byte */ \
            (__t=(s)[i]-0x80)<=0x3f && \
            ((c)=((c)<<6)|__t, ++(i), 1)) { \
        } else { \
            (c)=(sub);  /* ill-formed*/ \
        } \
    } \
} UPRV_BLOCK_MACRO_END

Internal:: Do not use. This API is for internal use only.

Definition at line 381 of file utf8.h.

◆ U8_IS_LEAD

#define U8_IS_LEAD ( c ) ((uint8_t)((c)-0xc2)<=0x32)

Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)

Parameters

c	8-bit code unit (byte)

Returns: true or false

Stable:: ICU 2.4

Definition at line 181 of file utf8.h.

◆ U8_IS_SINGLE

#define U8_IS_SINGLE ( c ) (((c)&0x80)==0)

Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?

Parameters

c	8-bit code unit (byte)

Returns: true or false

Stable:: ICU 2.4

Definition at line 173 of file utf8.h.

◆ U8_IS_TRAIL

#define U8_IS_TRAIL ( c ) ((int8_t)(c)<-0x40)

Is this code unit (byte) a UTF-8 trail byte? (0x80..0xBF)

Parameters

c	8-bit code unit (byte)

Returns: true or false

Stable:: ICU 2.4

Definition at line 190 of file utf8.h.

◆ U8_IS_VALID_LEAD3_AND_T1

#define U8_IS_VALID_LEAD3_AND_T1	(	lead,
		t1
	)	(U8_LEAD3_T1_BITS[(lead)&0xf]&(1<<((uint8_t)(t1)>>5)))

Internal 3-byte UTF-8 validity check.

Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.

Internal:: Do not use. This API is for internal use only.

Definition at line 98 of file utf8.h.

◆ U8_IS_VALID_LEAD4_AND_T1

#define U8_IS_VALID_LEAD4_AND_T1	(	lead,
		t1
	)	(U8_LEAD4_T1_BITS[(uint8_t)(t1)>>4]&(1<<((lead)&7)))

Internal 4-byte UTF-8 validity check.

Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.

Internal:: Do not use. This API is for internal use only.

Definition at line 115 of file utf8.h.

◆ U8_LEAD3_T1_BITS

#define U8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"

Internal bit vector for 3-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD3_AND_T1.

Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. Lead byte E0..EF bits 3..0 are used as byte index, first trail byte bits 7..5 are used as bit index into that byte.

See also: U8_IS_VALID_LEAD3_AND_T1

Internal:: Do not use. This API is for internal use only.

Definition at line 91 of file utf8.h.

◆ U8_LEAD4_T1_BITS

#define U8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"

Internal bit vector for 4-byte UTF-8 validity check, for use in U8_IS_VALID_LEAD4_AND_T1.

Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence. First trail byte bits 7..4 are used as byte index, lead byte F0..F4 bits 2..0 are used as bit index into that byte.

See also: U8_IS_VALID_LEAD4_AND_T1

Internal:: Do not use. This API is for internal use only.

Definition at line 108 of file utf8.h.

◆ U8_LENGTH

#define U8_LENGTH ( c )

Value:

    ((uint32_t)(c)<=0x7f ? 1 : \
        ((uint32_t)(c)<=0x7ff ? 2 : \
            ((uint32_t)(c)<=0xd7ff ? 3 : \
                ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
                    ((uint32_t)(c)<=0xffff ? 3 : 4)\
                ) \
            ) \
        ) \
    )

How many code units (bytes) are used for the UTF-8 encoding of this Unicode code point?

Parameters

c	32-bit code point

Returns: 1..4, or 0 if c is a surrogate or not a Unicode code point

Stable:: ICU 2.4

Definition at line 199 of file utf8.h.

◆ U8_MASK_LEAD_BYTE

#define U8_MASK_LEAD_BYTE	(	leadByte,
		countTrailBytes
	)	((leadByte)&=(1<<(6-(countTrailBytes)))-1)

Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable.

Internal:: Do not use. This API is for internal use only.

Definition at line 81 of file utf8.h.

◆ U8_MAX_LENGTH

#define U8_MAX_LENGTH 4

The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).

Returns: 4

Stable:: ICU 2.4

Definition at line 215 of file utf8.h.

◆ U8_NEXT

#define U8_NEXT	(	s,
		i,
		length,
		c
	)	U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, U_SENTINEL)

Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary.

(Post-incrementing forward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The length can be negative for a NUL-terminated string.

The offset may point to the lead byte of a multi-byte sequence, in which case the macro will read the whole sequence. If the offset points to a trail byte or an illegal UTF-8 sequence, then c is set to a negative value.

Parameters

s	const uint8_t * string
i	int32_t string offset, must be i<length
length	int32_t string length
c	output UChar32 variable, set to <0 in case of an error

See also: U8_NEXT_UNSAFE

Stable:: ICU 2.4

Definition at line 352 of file utf8.h.

◆ U8_NEXT_OR_FFFD

#define U8_NEXT_OR_FFFD	(	s,
		i,
		length,
		c
	)	U8_INTERNAL_NEXT_OR_SUB(s, i, length, c, 0xfffd)

Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary.

(Post-incrementing forward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The length can be negative for a NUL-terminated string.

The offset may point to the lead byte of a multi-byte sequence, in which case the macro will read the whole sequence. If the offset points to a trail byte or an illegal UTF-8 sequence, then c is set to U+FFFD.

This macro does not distinguish between a real U+FFFD in the text and U+FFFD returned for an ill-formed sequence. Use U8_NEXT() if that distinction is important.

Parameters

s	const uint8_t * string
i	int32_t string offset, must be i<length
length	int32_t string length
c	output UChar32 variable, set to U+FFFD in case of an error

See also: U8_NEXT

Stable:: ICU 51

Definition at line 378 of file utf8.h.

◆ U8_NEXT_UNSAFE

#define U8_NEXT_UNSAFE	(	s,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!U8_IS_SINGLE(c)) { \
        if((c)<0xe0) { \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } \
    } \
} UPRV_BLOCK_MACRO_END

Get a code point from a string at a code point boundary offset, and advance the offset to the next code point boundary.

(Post-incrementing forward iteration.) "Unsafe" macro, assumes well-formed UTF-8.

The offset may point to the lead byte of a multi-byte sequence, in which case the macro will read the whole sequence. The result is undefined if the offset points to a trail byte or an illegal UTF-8 sequence.

Parameters

s	const uint8_t * string
i	string offset
c	output UChar32 variable

See also: U8_NEXT

Stable:: ICU 2.4

Definition at line 316 of file utf8.h.

◆ U8_PREV

#define U8_PREV	(	s,
		start,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the previous one and get the code point between them.

(Pre-decrementing backward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The input offset may be the same as the string length. If the offset is behind a multi-byte sequence, then the macro will read the whole sequence. If the offset is behind a lead byte, then that itself will be returned as the code point. If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
i	int32_t string offset, must be start<i
c	output UChar32 variable, set to <0 in case of an error

See also: U8_PREV_UNSAFE

Stable:: ICU 2.4

Definition at line 726 of file utf8.h.

◆ U8_PREV_OR_FFFD

#define U8_PREV_OR_FFFD	(	s,
		start,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(!U8_IS_SINGLE(c)) { \
        (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the previous one and get the code point between them.

(Pre-decrementing backward iteration.) "Safe" macro, checks for illegal sequences and for string boundaries.

The input offset may be the same as the string length. If the offset is behind a multi-byte sequence, then the macro will read the whole sequence. If the offset is behind a lead byte, then that itself will be returned as the code point. If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD.

This macro does not distinguish between a real U+FFFD in the text and U+FFFD returned for an ill-formed sequence. Use U8_PREV() if that distinction is important.

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
i	int32_t string offset, must be start<i
c	output UChar32 variable, set to U+FFFD in case of an error

See also: U8_PREV

Stable:: ICU 51

Definition at line 757 of file utf8.h.

◆ U8_PREV_UNSAFE

#define U8_PREV_UNSAFE	(	s,
		i,
		c
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(U8_IS_TRAIL(c)) { \
        uint8_t __b, __count=1, __shift=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            __b=(s)[--(i)]; \
            if(__b>=0xc0) { \
                U8_MASK_LEAD_BYTE(__b, __count); \
                (c)|=(UChar32)__b<<__shift; \
                break; \
            } else { \
                (c)|=(UChar32)(__b&0x3f)<<__shift; \
                ++__count; \
                __shift+=6; \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

Move the string offset from one code point boundary to the previous one and get the code point between them.

(Pre-decrementing backward iteration.) "Unsafe" macro, assumes well-formed UTF-8.

The input offset may be the same as the string length. If the offset is behind a multi-byte sequence, then the macro will read the whole sequence. If the offset is behind a lead byte, then that itself will be returned as the code point. The result is undefined if the offset is behind an illegal UTF-8 sequence.

Parameters

s	const uint8_t * string
i	string offset
c	output UChar32 variable

See also: U8_PREV

Stable:: ICU 2.4

Definition at line 684 of file utf8.h.

◆ U8_SET_CP_LIMIT

#define U8_SET_CP_LIMIT	(	s,
		start,
		i,
		length
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        U8_BACK_1(s, start, i); \
        U8_FWD_1(s, i, length); \
    } \
} UPRV_BLOCK_MACRO_END

Adjust a random-access offset to a code point boundary after a code point.

If the offset is behind a partial multi-byte sequence (that is, it points into the middle of a sequence), then the offset is incremented to behind the whole sequence. Otherwise, it is not modified. The input offset may be the same as the string length. "Safe" macro, checks for illegal sequences and for string boundaries.

The length can be negative for a NUL-terminated string.

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
i	int32_t string offset, must be start<=i<=length
length	int32_t string length

See also: U8_SET_CP_LIMIT_UNSAFE

Stable: ICU 2.4

Definition at line 875 of file utf8.h.

◆ U8_SET_CP_LIMIT_UNSAFE

#define U8_SET_CP_LIMIT_UNSAFE	(	s,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    U8_BACK_1_UNSAFE(s, i); \
    U8_FWD_1_UNSAFE(s, i); \
} UPRV_BLOCK_MACRO_END

Adjust a random-access offset to a code point boundary after a code point.

If the offset is behind a partial multi-byte sequence (that is, it points into the middle of a sequence), then the offset is incremented to behind the whole sequence. Otherwise, it is not modified. The input offset may be the same as the string length. "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset

See also: U8_SET_CP_LIMIT

Stable: ICU 2.4

Definition at line 853 of file utf8.h.

◆ U8_SET_CP_START

#define U8_SET_CP_START	(	s,
		start,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    if(U8_IS_TRAIL((s)[(i)])) { \
        (i)=utf8_back1SafeBody(s, start, (i)); \
    } \
} UPRV_BLOCK_MACRO_END

Adjust a random-access offset to a code point boundary at the start of a code point.

If the offset points to a UTF-8 trail byte, then the offset is moved backward to the corresponding lead byte. Otherwise, it is not modified.

"Safe" macro, checks for illegal sequences and for string boundaries. Unlike U8_TRUNCATE_IF_INCOMPLETE(), this macro always reads s[i].

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
i	int32_t string offset, must be start<=i

See also: U8_SET_CP_START_UNSAFE; U8_TRUNCATE_IF_INCOMPLETE

Stable: ICU 2.4

Definition at line 607 of file utf8.h.

◆ U8_SET_CP_START_UNSAFE

#define U8_SET_CP_START_UNSAFE	(	s,
		i
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    while(U8_IS_TRAIL((s)[i])) { --(i); } \
} UPRV_BLOCK_MACRO_END

Adjust a random-access offset to a code point boundary at the start of a code point.

If the offset points to a UTF-8 trail byte, then the offset is moved backward to the corresponding lead byte. Otherwise, it is not modified. "Unsafe" macro, assumes well-formed UTF-8.

Parameters

s	const uint8_t * string
i	string offset

See also: U8_SET_CP_START

Stable: ICU 2.4

Definition at line 586 of file utf8.h.

◆ U8_TRUNCATE_IF_INCOMPLETE

#define U8_TRUNCATE_IF_INCOMPLETE	(	s,
		start,
		length
	)

Value:

    UPRV_BLOCK_MACRO_BEGIN { \
    if((length)>(start)) { \
        uint8_t __b1=s[(length)-1]; \
        if(U8_IS_SINGLE(__b1)) { \
            /* common ASCII character */ \
        } else if(U8_IS_LEAD(__b1)) { \
            --(length); \
        } else if(U8_IS_TRAIL(__b1) && ((length)-2)>=(start)) { \
            uint8_t __b2=s[(length)-2]; \
            if(0xe0<=__b2 && __b2<=0xf4) { \
                if(__b2<0xf0 ? U8_IS_VALID_LEAD3_AND_T1(__b2, __b1) : \
                        U8_IS_VALID_LEAD4_AND_T1(__b2, __b1)) { \
                    (length)-=2; \
                } \
            } else if(U8_IS_TRAIL(__b2) && ((length)-3)>=(start)) { \
                uint8_t __b3=s[(length)-3]; \
                if(0xf0<=__b3 && __b3<=0xf4 && U8_IS_VALID_LEAD4_AND_T1(__b3, __b2)) { \
                    (length)-=3; \
                } \
            } \
        } \
    } \
} UPRV_BLOCK_MACRO_END

If the string ends with a UTF-8 byte sequence that is valid so far but incomplete, then reduce the length of the string to end before the lead byte of that incomplete sequence.

For example, if the string ends with E1 80, the length is reduced by 2.

In all other cases (the string ends with a complete sequence, or it is not possible for any further trail byte to extend the trailing sequence) the length remains unchanged.

Useful for processing text split across multiple buffers (save the incomplete sequence for later) and for optimizing iteration (check for string length only once per character).

"Safe" macro, checks for illegal sequences and for string boundaries. Unlike U8_SET_CP_START(), this macro never reads s[length].

(In UTF-16, simply check for U16_IS_LEAD(last code unit).)

Parameters

s	const uint8_t * string
start	int32_t starting string offset (usually 0)
length	int32_t string length (usually start<=length)

See also: U8_SET_CP_START

Stable: ICU 61

Definition at line 639 of file utf8.h.

Function Documentation

◆ utf8_appendCharSafeBody()

U_CAPI int32_t utf8_appendCharSafeBody	(	uint8_t *	s,
		int32_t	i,
		int32_t	length,
		UChar32	c,
		UBool *	pIsError
	)

Function for handling "append code point" with error-checking.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable, and should not be hidden when other internal functions are hidden (otherwise public macros would fail to compile).

Internal: Do not use. This API is for internal use only.

◆ utf8_back1SafeBody()

U_CAPI int32_t utf8_back1SafeBody	(	const uint8_t *	s,
		int32_t	start,
		int32_t	i
	)

Function for handling "skip backward one code point" with error-checking.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable, and should not be hidden when other internal functions are hidden (otherwise public macros would fail to compile).

Internal: Do not use. This API is for internal use only.

◆ utf8_nextCharSafeBody()

U_CAPI UChar32 utf8_nextCharSafeBody	(	const uint8_t *	s,
		int32_t *	pi,
		int32_t	length,
		UChar32	c,
		UBool	strict
	)

Function for handling "next code point" with error-checking.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable, and should not be hidden when other internal functions are hidden (otherwise public macros would fail to compile).

Internal: Do not use. This API is for internal use only.

◆ utf8_prevCharSafeBody()

U_CAPI UChar32 utf8_prevCharSafeBody	(	const uint8_t *	s,
		int32_t	start,
		int32_t *	pi,
		UChar32	c,
		UBool	strict
	)

Function for handling "previous code point" with error-checking.

This is internal since it is not meant to be called directly by external clients; however it is called by public macros in this file and thus must remain stable, and should not be hidden when other internal functions are hidden (otherwise public macros would fail to compile).

Internal: Do not use. This API is for internal use only.

Macros

Functions

Detailed Description

Macro Definition Documentation

◆ U8_APPEND

◆ U8_APPEND_UNSAFE

◆ U8_BACK_1

◆ U8_BACK_1_UNSAFE

◆ U8_BACK_N

◆ U8_BACK_N_UNSAFE

◆ U8_COUNT_TRAIL_BYTES

◆ U8_COUNT_TRAIL_BYTES_UNSAFE

◆ U8_FWD_1

◆ U8_FWD_1_UNSAFE

◆ U8_FWD_N

◆ U8_FWD_N_UNSAFE

◆ U8_GET

◆ U8_GET_OR_FFFD

◆ U8_GET_UNSAFE

◆ U8_INTERNAL_NEXT_OR_SUB

◆ U8_IS_LEAD

◆ U8_IS_SINGLE

◆ U8_IS_TRAIL

◆ U8_IS_VALID_LEAD3_AND_T1

◆ U8_IS_VALID_LEAD4_AND_T1

◆ U8_LEAD3_T1_BITS

◆ U8_LEAD4_T1_BITS

◆ U8_LENGTH

◆ U8_MASK_LEAD_BYTE

◆ U8_MAX_LENGTH

◆ U8_NEXT

◆ U8_NEXT_OR_FFFD

◆ U8_NEXT_UNSAFE

◆ U8_PREV

◆ U8_PREV_OR_FFFD

◆ U8_PREV_UNSAFE

◆ U8_SET_CP_LIMIT

◆ U8_SET_CP_LIMIT_UNSAFE

◆ U8_SET_CP_START

◆ U8_SET_CP_START_UNSAFE

◆ U8_TRUNCATE_IF_INCOMPLETE

Function Documentation

◆ utf8_appendCharSafeBody()

◆ utf8_back1SafeBody()

◆ utf8_nextCharSafeBody()

◆ utf8_prevCharSafeBody()