iPXE
Data Structures | Macros | Functions
utf8.h File Reference

UTF-8 Unicode encoding. More...

#include <stdint.h>

Go to the source code of this file.

Data Structures

struct  utf8_accumulator
 A UTF-8 character accumulator. More...
 

Macros

#define UTF8_MAX_LEN   4
 Maximum length of UTF-8 sequence. More...
 
#define UTF8_MIN_TWO   0x80
 Minimum legal value for two-byte UTF-8 sequence. More...
 
#define UTF8_MIN_THREE   0x800
 Minimum legal value for three-byte UTF-8 sequence. More...
 
#define UTF8_MIN_FOUR   0x10000
 Minimum legal value for four-byte UTF-8 sequence. More...
 
#define UTF8_HIGH_BIT   0x80
 High bit of UTF-8 bytes. More...
 
#define UTF8_CONTINUATION_BITS   6
 Number of data bits in each continuation byte. More...
 
#define UTF8_CONTINUATION_MASK   ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )
 Bit mask for data bits in a continuation byte. More...
 
#define UTF8_CONTINUATION   0x80
 Non-data bits in a continuation byte. More...
 
#define UTF8_IS_CONTINUATION(byte)   ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )
 Check for a continuation byte. More...
 
#define UTF8_IS_ASCII(byte)   ( ! ( (byte) & UTF8_HIGH_BIT ) )
 Check for an ASCII byte. More...
 
#define UTF8_INVALID   0xfffd
 Invalid character returned when decoding fails. More...
 

Functions

 FILE_LICENCE (GPL2_OR_LATER_OR_UBDL)
 
unsigned int utf8_accumulate (struct utf8_accumulator *utf8, uint8_t byte)
 Accumulate Unicode character from UTF-8 byte sequence. More...
 

Detailed Description

UTF-8 Unicode encoding.

Definition in file utf8.h.

Macro Definition Documentation

◆ UTF8_MAX_LEN

#define UTF8_MAX_LEN   4

Maximum length of UTF-8 sequence.

Definition at line 15 of file utf8.h.

◆ UTF8_MIN_TWO

#define UTF8_MIN_TWO   0x80

Minimum legal value for two-byte UTF-8 sequence.

Definition at line 18 of file utf8.h.

◆ UTF8_MIN_THREE

#define UTF8_MIN_THREE   0x800

Minimum legal value for three-byte UTF-8 sequence.

Definition at line 21 of file utf8.h.

◆ UTF8_MIN_FOUR

#define UTF8_MIN_FOUR   0x10000

Minimum legal value for four-byte UTF-8 sequence.

Definition at line 24 of file utf8.h.

◆ UTF8_HIGH_BIT

#define UTF8_HIGH_BIT   0x80

High bit of UTF-8 bytes.

Definition at line 27 of file utf8.h.

◆ UTF8_CONTINUATION_BITS

#define UTF8_CONTINUATION_BITS   6

Number of data bits in each continuation byte.

Definition at line 30 of file utf8.h.

◆ UTF8_CONTINUATION_MASK

#define UTF8_CONTINUATION_MASK   ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )

Bit mask for data bits in a continuation byte.

Definition at line 33 of file utf8.h.

◆ UTF8_CONTINUATION

#define UTF8_CONTINUATION   0x80

Non-data bits in a continuation byte.

Definition at line 36 of file utf8.h.

◆ UTF8_IS_CONTINUATION

#define UTF8_IS_CONTINUATION(byte)   ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )

Check for a continuation byte.

Parameters
byte   UTF-8 byte
Return values
is_continuation   Byte is a continuation byte

Definition at line 43 of file utf8.h.

◆ UTF8_IS_ASCII

#define UTF8_IS_ASCII(byte)   ( ! ( (byte) & UTF8_HIGH_BIT ) )

Check for an ASCII byte.

Parameters
byte   UTF-8 byte
Return values
is_ascii   Byte is an ASCII byte

Definition at line 51 of file utf8.h.

◆ UTF8_INVALID

#define UTF8_INVALID   0xfffd

Invalid character returned when decoding fails.

Definition at line 54 of file utf8.h.

Function Documentation

◆ FILE_LICENCE()

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL  )

◆ utf8_accumulate()

unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte )

Accumulate Unicode character from UTF-8 byte sequence.

Parameters
utf8   UTF-8 accumulator
byte   UTF-8 byte
Return values
character   Unicode character, or 0 if incomplete

Definition at line 43 of file utf8.c.

43  {
44  static unsigned int min[] = {
45  UTF8_MIN_TWO,
46  UTF8_MIN_THREE,
47  UTF8_MIN_FOUR,
48  };
49  unsigned int shift;
50  unsigned int len;
51  uint8_t tmp;
52 
53  /* Handle continuation bytes */
54  if ( UTF8_IS_CONTINUATION ( byte ) ) {
55 
56  /* Fail if this is an unexpected continuation byte */
57  if ( utf8->remaining == 0 ) {
58  DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
59  return UTF8_INVALID;
60  }
61 
62  /* Apply continuation byte */
63  utf8->character <<= UTF8_CONTINUATION_BITS;
64  utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
65 
66  /* Return 0 if more continuation bytes are expected */
67  if ( --utf8->remaining != 0 )
68  return 0;
69 
70  /* Fail if sequence is illegal */
71  if ( utf8->character < utf8->min ) {
72  DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
73  utf8->character );
74  return UTF8_INVALID;
75  }
76 
77  /* Sanity check */
78  assert ( utf8->character != 0 );
79 
80  /* Return completed character */
81  DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
82  utf8, utf8->character );
83  return utf8->character;
84  }
85 
86  /* Reset state and report failure if this is an unexpected
87  * non-continuation byte. Do not return UTF8_INVALID since
88  * doing so could cause us to drop a valid ASCII character.
89  */
90  if ( utf8->remaining != 0 ) {
91  shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
92  DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
93  utf8, byte, ( utf8->character << shift ),
94  ( ( 1 << shift ) - 1 ) );
95  utf8->remaining = 0;
96  }
97 
98  /* Handle initial bytes */
99  if ( ! UTF8_IS_ASCII ( byte ) ) {
100 
101  /* Sanity check */
102  assert ( utf8->remaining == 0 );
103 
104  /* Count total number of bytes in sequence */
105  tmp = byte;
106  len = 0;
107  while ( tmp & UTF8_HIGH_BIT ) {
108  tmp <<= 1;
109  len++;
110  }
111 
112  /* Check for illegal length */
113  if ( len > UTF8_MAX_LEN ) {
114  DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
115  utf8, byte, len );
116  return UTF8_INVALID;
117  }
118 
119  /* Store initial bits of character */
120  utf8->character = ( tmp >> len );
121 
122  /* Store number of bytes remaining */
123  len--;
124  utf8->remaining = len;
125  assert ( utf8->remaining > 0 );
126 
127  /* Store minimum legal value */
128  utf8->min = min[ len - 1 ];
129  assert ( utf8->min > 0 );
130 
131  /* Await continuation bytes */
132  return 0;
133  }
134 
135  /* Handle ASCII bytes */
136  return byte;
137 }
#define UTF8_MAX_LEN
Maximum length of UTF-8 sequence.
Definition: utf8.h:15
#define UTF8_IS_CONTINUATION(byte)
Check for a continuation byte.
Definition: utf8.h:43
#define UTF8_CONTINUATION_MASK
Bit mask for data bits in a continuation byte.
Definition: utf8.h:33
#define DBGC(...)
Definition: compiler.h:505
#define min(x, y)
Definition: ath.h:34
unsigned int remaining
Number of remaining continuation bytes.
Definition: utf8.h:61
#define UTF8_INVALID
Invalid character returned when decoding fails.
Definition: utf8.h:54
unsigned long tmp
Definition: linux_pci.h:53
#define UTF8_MIN_FOUR
Minimum legal value for four-byte UTF-8 sequence.
Definition: utf8.h:24
#define UTF8_CONTINUATION_BITS
Number of data bits in each continuation byte.
Definition: utf8.h:30
unsigned int min
Minimum legal character.
Definition: utf8.h:63
assert((readw(&hdr->flags) &(GTF_reading|GTF_writing))==0)
#define UTF8_HIGH_BIT
High bit of UTF-8 bytes.
Definition: utf8.h:27
#define UTF8_IS_ASCII(byte)
Check for an ASCII byte.
Definition: utf8.h:51
unsigned char uint8_t
Definition: stdint.h:10
unsigned char byte
Definition: smc9000.h:38
#define UTF8_MIN_THREE
Minimum legal value for three-byte UTF-8 sequence.
Definition: utf8.h:21
uint32_t len
Length.
Definition: ena.h:14
#define DBGC2(...)
Definition: compiler.h:522
#define UTF8_MIN_TWO
Minimum legal value for two-byte UTF-8 sequence.
Definition: utf8.h:18
unsigned int character
Character in progress.
Definition: utf8.h:59

References assert(), utf8_accumulator::character, DBGC, DBGC2, len, min, utf8_accumulator::min, utf8_accumulator::remaining, tmp, UTF8_CONTINUATION_BITS, UTF8_CONTINUATION_MASK, UTF8_HIGH_BIT, UTF8_INVALID, UTF8_IS_ASCII, UTF8_IS_CONTINUATION, UTF8_MAX_LEN, UTF8_MIN_FOUR, UTF8_MIN_THREE, and UTF8_MIN_TWO.

Referenced by efi_putchar(), fbcon_putchar(), and utf8_accumulate_okx().