UTF-8 Unicode encoding. More...

#include <stdint.h>

Data Structures
struct	utf8_accumulator
	A UTF-8 character accumulator. More...

Macros
#define	UTF8_MAX_LEN 4
	Maximum length of UTF-8 sequence.
#define	UTF8_MIN_TWO 0x80
	Minimum legal value for two-byte UTF-8 sequence.
#define	UTF8_MIN_THREE 0x800
	Minimum legal value for three-byte UTF-8 sequence.
#define	UTF8_MIN_FOUR 0x10000
	Minimum legal value for four-byte UTF-8 sequence.
#define	UTF8_HIGH_BIT 0x80
	High bit of UTF-8 bytes.
#define	UTF8_CONTINUATION_BITS 6
	Number of data bits in each continuation byte.
#define	UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )
	Bit mask for data bits in a continuation byte.
#define	UTF8_CONTINUATION 0x80
	Non-data bits in a continuation byte.
#define	UTF8_IS_CONTINUATION(byte)
	Check for a continuation byte.
#define	UTF8_IS_ASCII(byte)
	Check for an ASCII byte.
#define	UTF8_INVALID 0xfffd
	Invalid character returned when decoding fails.

Functions
	FILE_LICENCE (GPL2_OR_LATER_OR_UBDL)
	FILE_SECBOOT (PERMITTED)
unsigned int	utf8_accumulate (struct utf8_accumulator *utf8, uint8_t byte)
	Accumulate Unicode character from UTF-8 byte sequence.

Detailed Description

UTF-8 Unicode encoding.

Definition in file utf8.h.

Macro Definition Documentation

◆ UTF8_MAX_LEN

#define UTF8_MAX_LEN 4

Maximum length of UTF-8 sequence.

Definition at line 16 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_TWO

#define UTF8_MIN_TWO 0x80

Minimum legal value for two-byte UTF-8 sequence.

Definition at line 19 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_THREE

#define UTF8_MIN_THREE 0x800

Minimum legal value for three-byte UTF-8 sequence.

Definition at line 22 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_FOUR

#define UTF8_MIN_FOUR 0x10000

Minimum legal value for four-byte UTF-8 sequence.

Definition at line 25 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_HIGH_BIT

#define UTF8_HIGH_BIT 0x80

High bit of UTF-8 bytes.

Definition at line 28 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION_BITS

#define UTF8_CONTINUATION_BITS 6

Number of data bits in each continuation byte.

Definition at line 31 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION_MASK

#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )

Bit mask for data bits in a continuation byte.

Definition at line 34 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION

#define UTF8_CONTINUATION 0x80

Non-data bits in a continuation byte.

Definition at line 37 of file utf8.h.

◆ UTF8_IS_CONTINUATION

#define UTF8_IS_CONTINUATION ( byte )

Value:

( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )

byte

unsigned char byte

Definition smc9000.h:38

UTF8_CONTINUATION

#define UTF8_CONTINUATION

Non-data bits in a continuation byte.

Definition utf8.h:37

UTF8_CONTINUATION_MASK

#define UTF8_CONTINUATION_MASK

Bit mask for data bits in a continuation byte.

Definition utf8.h:34

Check for a continuation byte.

Parameters

byte	UTF-8 byte

Return values

is_continuation Byte is a continuation byte

Definition at line 44 of file utf8.h.

44 #define UTF8_IS_CONTINUATION( byte ) \

45 ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )

Referenced by utf8_accumulate().

◆ UTF8_IS_ASCII

#define UTF8_IS_ASCII ( byte )

Value:

( ! ( (byte) & UTF8_HIGH_BIT ) )

UTF8_HIGH_BIT

#define UTF8_HIGH_BIT

High bit of UTF-8 bytes.

Definition utf8.h:28

Check for an ASCII byte.

Parameters

byte	UTF-8 byte

Return values

is_ascii Byte is an ASCII byte

Definition at line 52 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_INVALID

#define UTF8_INVALID 0xfffd

Invalid character returned when decoding fails (U+FFFD, the Unicode replacement character).

Definition at line 55 of file utf8.h.

Referenced by efi_putchar(), and utf8_accumulate().

Function Documentation

◆ FILE_LICENCE()

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL )

◆ FILE_SECBOOT()

FILE_SECBOOT ( PERMITTED )

◆ utf8_accumulate()

unsigned int utf8_accumulate	(	struct utf8_accumulator *	utf8,
		uint8_t	byte )

extern

Accumulate Unicode character from UTF-8 byte sequence.

Parameters

utf8	UTF-8 accumulator
byte	UTF-8 byte

Return values

character Unicode character, 0 if the sequence is still incomplete, or UTF8_INVALID if the byte sequence is malformed

Definition at line 44 of file utf8.c.

                                                                             {
        /*
         * Feed one byte into the accumulator state held in *utf8.
         *
         * Possible results, as implemented below:
         *   - an ASCII byte is returned unchanged;
         *   - a lead byte or a non-final continuation byte returns 0
         *     (more input is needed);
         *   - the final continuation byte returns the assembled
         *     character, or UTF8_INVALID if the value is below the
         *     minimum recorded for the sequence length;
         *   - a continuation byte with no sequence in progress, or a
         *     lead byte announcing more than UTF8_MAX_LEN bytes,
         *     returns UTF8_INVALID.
         */

        /* Minimum legal character value, indexed by
         * ( number of continuation bytes - 1 ).
         */
        static unsigned int min[] = {
                UTF8_MIN_TWO,
                UTF8_MIN_THREE,
                UTF8_MIN_FOUR,
        };
        unsigned int shift;
        unsigned int len;
        uint8_t tmp;
 
        /* Handle continuation bytes */
        if ( UTF8_IS_CONTINUATION ( byte ) ) {
 
                /* Fail if this is an unexpected continuation byte */
                if ( utf8->remaining == 0 ) {
                        DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
                        return UTF8_INVALID;
                }
 
                /* Apply continuation byte: make room for, then append,
                 * the data bits carried by this byte.
                 */
                utf8->character <<= UTF8_CONTINUATION_BITS;
                utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
 
                /* Return 0 if more continuation bytes are expected */
                if ( --utf8->remaining != 0 )
                        return 0;
 
                /* Fail if sequence is illegal, i.e. the value could
                 * have been encoded using a shorter sequence.
                 */
                if ( utf8->character < utf8->min ) {
                        DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
                               utf8->character );
                        return UTF8_INVALID;
                }
 
                /* Sanity check: the character is at least utf8->min
                 * here, so it cannot collide with the "incomplete"
                 * return value of 0.
                 */
                assert ( utf8->character != 0 );
 
                /* Return completed character */
                DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
                        utf8, utf8->character );
                return utf8->character;
        }
 
        /* Reset state and report failure if this is an unexpected
         * non-continuation byte.  Do not return UTF8_INVALID since
         * doing so could cause us to drop a valid ASCII character.
         *
         * The failure is reported via the debug message only; the
         * byte itself is then processed below as the start of a new
         * character.
         */
        if ( utf8->remaining != 0 ) {
                /* Number of data bits that were still outstanding */
                shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
                DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
                       utf8, byte, ( utf8->character << shift ),
                       ( ( 1 << shift ) - 1 ) );
                utf8->remaining = 0;
        }
 
        /* Handle initial bytes */
        if ( ! UTF8_IS_ASCII ( byte ) ) {
 
                /* Sanity check */
                assert ( utf8->remaining == 0 );
 
                /* Count total number of bytes in sequence.  This is
                 * the number of leading one bits in the byte; since
                 * tmp is only eight bits wide, each shift discards
                 * the bit that has just been counted.
                 */
                tmp = byte;
                len = 0;
                while ( tmp & UTF8_HIGH_BIT ) {
                        tmp <<= 1;
                        len++;
                }
 
                /* Check for illegal length.  A length of one cannot
                 * occur here, since such a byte would have matched
                 * UTF8_IS_CONTINUATION() above.
                 */
                if ( len > UTF8_MAX_LEN ) {
                        DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
                               utf8, byte, len );
                        return UTF8_INVALID;
                }
 
                /* Store initial bits of character.  tmp has been
                 * shifted left by len bits, so shifting it back
                 * leaves only the data bits of the initial byte.
                 */
                utf8->character = ( tmp >> len );
 
                /* Store number of bytes remaining (the initial byte
                 * itself has now been consumed).
                 */
                len--;
                utf8->remaining = len;
                assert ( utf8->remaining > 0 );
 
                /* Store minimum legal value; len now counts only the
                 * continuation bytes, hence the index of len - 1.
                 */
                utf8->min = min[ len - 1 ];
                assert ( utf8->min > 0 );
 
                /* Await continuation bytes */
                return 0;
        }
 
        /* Handle ASCII bytes */
        return byte;
}

References assert, utf8_accumulator::character, DBGC, DBGC2, len, min, utf8_accumulator::min, utf8_accumulator::remaining, tmp, UTF8_CONTINUATION_BITS, UTF8_CONTINUATION_MASK, UTF8_HIGH_BIT, UTF8_INVALID, UTF8_IS_ASCII, UTF8_IS_CONTINUATION, UTF8_MAX_LEN, UTF8_MIN_FOUR, UTF8_MIN_THREE, and UTF8_MIN_TWO.

Referenced by efi_putchar(), fbcon_putchar(), and utf8_accumulate_okx().

Data Structures

Macros

Functions

Detailed Description

Macro Definition Documentation

◆ UTF8_MAX_LEN

◆ UTF8_MIN_TWO

◆ UTF8_MIN_THREE

◆ UTF8_MIN_FOUR

◆ UTF8_HIGH_BIT

◆ UTF8_CONTINUATION_BITS

◆ UTF8_CONTINUATION_MASK

◆ UTF8_CONTINUATION

◆ UTF8_IS_CONTINUATION

◆ UTF8_IS_ASCII

◆ UTF8_INVALID

Function Documentation

◆ FILE_LICENCE()

◆ FILE_SECBOOT()

◆ utf8_accumulate()