iPXE
utf8.h File Reference

UTF-8 Unicode encoding. More...

#include <stdint.h>

Go to the source code of this file.

Data Structures

struct  utf8_accumulator
 A UTF-8 character accumulator. More...

Macros

#define UTF8_MAX_LEN   4
 Maximum length of UTF-8 sequence.
#define UTF8_MIN_TWO   0x80
 Minimum legal value for two-byte UTF-8 sequence.
#define UTF8_MIN_THREE   0x800
 Minimum legal value for three-byte UTF-8 sequence.
#define UTF8_MIN_FOUR   0x10000
 Minimum legal value for four-byte UTF-8 sequence.
#define UTF8_HIGH_BIT   0x80
 High bit of UTF-8 bytes.
#define UTF8_CONTINUATION_BITS   6
 Number of data bits in each continuation byte.
#define UTF8_CONTINUATION_MASK   ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )
 Bit mask for data bits in a continuation byte.
#define UTF8_CONTINUATION   0x80
 Non-data bits in a continuation byte.
#define UTF8_IS_CONTINUATION(byte)
 Check for a continuation byte.
#define UTF8_IS_ASCII(byte)
 Check for an ASCII byte.
#define UTF8_INVALID   0xfffd
 Invalid character returned when decoding fails.

Functions

 FILE_LICENCE (GPL2_OR_LATER_OR_UBDL)
 FILE_SECBOOT (PERMITTED)
unsigned int utf8_accumulate (struct utf8_accumulator *utf8, uint8_t byte)
 Accumulate Unicode character from UTF-8 byte sequence.

Detailed Description

UTF-8 Unicode encoding.

Definition in file utf8.h.

Macro Definition Documentation

◆ UTF8_MAX_LEN

#define UTF8_MAX_LEN   4

Maximum length of UTF-8 sequence.

Definition at line 16 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_TWO

#define UTF8_MIN_TWO   0x80

Minimum legal value for two-byte UTF-8 sequence.

Definition at line 19 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_THREE

#define UTF8_MIN_THREE   0x800

Minimum legal value for three-byte UTF-8 sequence.

Definition at line 22 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_MIN_FOUR

#define UTF8_MIN_FOUR   0x10000

Minimum legal value for four-byte UTF-8 sequence.

Definition at line 25 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_HIGH_BIT

#define UTF8_HIGH_BIT   0x80

High bit of UTF-8 bytes.

Definition at line 28 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION_BITS

#define UTF8_CONTINUATION_BITS   6

Number of data bits in each continuation byte.

Definition at line 31 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION_MASK

#define UTF8_CONTINUATION_MASK   ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )

Bit mask for data bits in a continuation byte.

Definition at line 34 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_CONTINUATION

#define UTF8_CONTINUATION   0x80

Non-data bits in a continuation byte.

Definition at line 37 of file utf8.h.

◆ UTF8_IS_CONTINUATION

#define UTF8_IS_CONTINUATION( byte )
Value:
( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )
unsigned char byte
Definition smc9000.h:38
#define UTF8_CONTINUATION
Non-data bits in a continuation byte.
Definition utf8.h:37
#define UTF8_CONTINUATION_MASK
Bit mask for data bits in a continuation byte.
Definition utf8.h:34

Check for a continuation byte.

Parameters
byte: UTF-8 byte
Return values
is_continuation: Byte is a continuation byte

Definition at line 44 of file utf8.h.

44 #define UTF8_IS_CONTINUATION( byte ) \
45 ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )

Referenced by utf8_accumulate().

◆ UTF8_IS_ASCII

#define UTF8_IS_ASCII ( byte)
Value:
( ! ( (byte) & UTF8_HIGH_BIT ) )
#define UTF8_HIGH_BIT
High bit of UTF-8 bytes.
Definition utf8.h:28

Check for an ASCII byte.

Parameters
byte: UTF-8 byte
Return values
is_ascii: Byte is an ASCII byte

Definition at line 52 of file utf8.h.

Referenced by utf8_accumulate().

◆ UTF8_INVALID

#define UTF8_INVALID   0xfffd

Invalid character returned when decoding fails.

Definition at line 55 of file utf8.h.

Referenced by efi_putchar(), and utf8_accumulate().

Function Documentation

◆ FILE_LICENCE()

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL )

◆ FILE_SECBOOT()

FILE_SECBOOT ( PERMITTED )

◆ utf8_accumulate()

unsigned int utf8_accumulate ( struct utf8_accumulator * utf8,
uint8_t byte )
extern

Accumulate Unicode character from UTF-8 byte sequence.

Parameters
utf8: UTF-8 accumulator
byte: UTF-8 byte
Return values
character: Unicode character, or 0 if incomplete

Definition at line 44 of file utf8.c.

44 {
45 static unsigned int min[] = {
46 UTF8_MIN_TWO,
47 UTF8_MIN_THREE,
48 UTF8_MIN_FOUR,
49 };
50 unsigned int shift;
51 unsigned int len;
52 uint8_t tmp;
53
54 /* Handle continuation bytes */
55 if ( UTF8_IS_CONTINUATION ( byte ) ) {
56
57 /* Fail if this is an unexpected continuation byte */
58 if ( utf8->remaining == 0 ) {
59 DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
60 return UTF8_INVALID;
61 }
62
63 /* Apply continuation byte */
64 utf8->character <<= UTF8_CONTINUATION_BITS;
65 utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
66
67 /* Return 0 if more continuation bytes are expected */
68 if ( --utf8->remaining != 0 )
69 return 0;
70
71 /* Fail if sequence is illegal */
72 if ( utf8->character < utf8->min ) {
73 DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
74 utf8->character );
75 return UTF8_INVALID;
76 }
77
78 /* Sanity check */
79 assert ( utf8->character != 0 );
80
81 /* Return completed character */
82 DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
83 utf8, utf8->character );
84 return utf8->character;
85 }
86
87 /* Reset state and report failure if this is an unexpected
88 * non-continuation byte. Do not return UTF8_INVALID since
89 * doing so could cause us to drop a valid ASCII character.
90 */
91 if ( utf8->remaining != 0 ) {
92 shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
93 DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
94 utf8, byte, ( utf8->character << shift ),
95 ( ( 1 << shift ) - 1 ) );
96 utf8->remaining = 0;
97 }
98
99 /* Handle initial bytes */
100 if ( ! UTF8_IS_ASCII ( byte ) ) {
101
102 /* Sanity check */
103 assert ( utf8->remaining == 0 );
104
105 /* Count total number of bytes in sequence */
106 tmp = byte;
107 len = 0;
108 while ( tmp & UTF8_HIGH_BIT ) {
109 tmp <<= 1;
110 len++;
111 }
112
113 /* Check for illegal length */
114 if ( len > UTF8_MAX_LEN ) {
115 DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
116 utf8, byte, len );
117 return UTF8_INVALID;
118 }
119
120 /* Store initial bits of character */
121 utf8->character = ( tmp >> len );
122
123 /* Store number of bytes remaining */
124 len--;
125 utf8->remaining = len;
126 assert ( utf8->remaining > 0 );
127
128 /* Store minimum legal value */
129 utf8->min = min[ len - 1 ];
130 assert ( utf8->min > 0 );
131
132 /* Await continuation bytes */
133 return 0;
134 }
135
136 /* Handle ASCII bytes */
137 return byte;
138 }
unsigned char uint8_t
Definition stdint.h:10
#define assert(condition)
Assert a condition at run-time.
Definition assert.h:50
#define min(x, y)
Definition ath.h:36
ring len
Length.
Definition dwmac.h:226
#define DBGC2(...)
Definition compiler.h:522
#define DBGC(...)
Definition compiler.h:505
unsigned long tmp
Definition linux_pci.h:65
unsigned int remaining
Number of remaining continuation bytes.
Definition utf8.h:62
unsigned int min
Minimum legal character.
Definition utf8.h:64
unsigned int character
Character in progress.
Definition utf8.h:60
#define UTF8_CONTINUATION_BITS
Number of data bits in each continuation byte.
Definition utf8.h:31
#define UTF8_MAX_LEN
Maximum length of UTF-8 sequence.
Definition utf8.h:16
#define UTF8_MIN_THREE
Minimum legal value for three-byte UTF-8 sequence.
Definition utf8.h:22
#define UTF8_IS_CONTINUATION(byte)
Check for a continuation byte.
Definition utf8.h:44
#define UTF8_MIN_FOUR
Minimum legal value for four-byte UTF-8 sequence.
Definition utf8.h:25
#define UTF8_MIN_TWO
Minimum legal value for two-byte UTF-8 sequence.
Definition utf8.h:19
#define UTF8_INVALID
Invalid character returned when decoding fails.
Definition utf8.h:55
#define UTF8_IS_ASCII(byte)
Check for an ASCII byte.
Definition utf8.h:52

References assert, utf8_accumulator::character, DBGC, DBGC2, len, min, utf8_accumulator::min, utf8_accumulator::remaining, tmp, UTF8_CONTINUATION_BITS, UTF8_CONTINUATION_MASK, UTF8_HIGH_BIT, UTF8_INVALID, UTF8_IS_ASCII, UTF8_IS_CONTINUATION, UTF8_MAX_LEN, UTF8_MIN_FOUR, UTF8_MIN_THREE, and UTF8_MIN_TWO.

Referenced by efi_putchar(), fbcon_putchar(), and utf8_accumulate_okx().