iPXE
Functions
utf8.c File Reference

UTF-8 Unicode encoding. More...

#include <stdint.h>
#include <assert.h>
#include <ipxe/utf8.h>

Go to the source code of this file.

Functions

 FILE_LICENCE (GPL2_OR_LATER_OR_UBDL)
 
 FILE_SECBOOT (PERMITTED)
 
unsigned int utf8_accumulate (struct utf8_accumulator *utf8, uint8_t byte)
 Accumulate Unicode character from UTF-8 byte sequence. More...
 

Detailed Description

UTF-8 Unicode encoding.

Definition in file utf8.c.

Function Documentation

◆ FILE_LICENCE()

FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL  )

◆ FILE_SECBOOT()

FILE_SECBOOT ( PERMITTED  )

◆ utf8_accumulate()

unsigned int utf8_accumulate ( struct utf8_accumulator * utf8,
uint8_t  byte 
)

Accumulate Unicode character from UTF-8 byte sequence.

Parameters
utf8 - UTF-8 accumulator
byte - UTF-8 byte
Return values
character - Unicode character, or 0 if incomplete

Definition at line 44 of file utf8.c.

44  {
45  static unsigned int min[] = {
46  UTF8_MIN_TWO,
47  UTF8_MIN_THREE,
48  UTF8_MIN_FOUR,
49  };
50  unsigned int shift;
51  unsigned int len;
52  uint8_t tmp;
53 
54  /* Handle continuation bytes */
55  if ( UTF8_IS_CONTINUATION ( byte ) ) {
56 
57  /* Fail if this is an unexpected continuation byte */
58  if ( utf8->remaining == 0 ) {
59  DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
60  return UTF8_INVALID;
61  }
62 
63  /* Apply continuation byte */
64  utf8->character <<= UTF8_CONTINUATION_BITS;
65  utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
66 
67  /* Return 0 if more continuation bytes are expected */
68  if ( --utf8->remaining != 0 )
69  return 0;
70 
71  /* Fail if sequence is illegal */
72  if ( utf8->character < utf8->min ) {
73  DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
74  utf8->character );
75  return UTF8_INVALID;
76  }
77 
78  /* Sanity check */
79  assert ( utf8->character != 0 );
80 
81  /* Return completed character */
82  DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
83  utf8, utf8->character );
84  return utf8->character;
85  }
86 
87  /* Reset state and report failure if this is an unexpected
88  * non-continuation byte. Do not return UTF8_INVALID since
89  * doing so could cause us to drop a valid ASCII character.
90  */
91  if ( utf8->remaining != 0 ) {
92  shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
93  DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
94  utf8, byte, ( utf8->character << shift ),
95  ( ( 1 << shift ) - 1 ) );
96  utf8->remaining = 0;
97  }
98 
99  /* Handle initial bytes */
100  if ( ! UTF8_IS_ASCII ( byte ) ) {
101 
102  /* Sanity check */
103  assert ( utf8->remaining == 0 );
104 
105  /* Count total number of bytes in sequence */
106  tmp = byte;
107  len = 0;
108  while ( tmp & UTF8_HIGH_BIT ) {
109  tmp <<= 1;
110  len++;
111  }
112 
113  /* Check for illegal length */
114  if ( len > UTF8_MAX_LEN ) {
115  DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
116  utf8, byte, len );
117  return UTF8_INVALID;
118  }
119 
120  /* Store initial bits of character */
121  utf8->character = ( tmp >> len );
122 
123  /* Store number of bytes remaining */
124  len--;
125  utf8->remaining = len;
126  assert ( utf8->remaining > 0 );
127 
128  /* Store minimum legal value */
129  utf8->min = min[ len - 1 ];
130  assert ( utf8->min > 0 );
131 
132  /* Await continuation bytes */
133  return 0;
134  }
135 
136  /* Handle ASCII bytes */
137  return byte;
138 }
#define UTF8_MAX_LEN
Maximum length of UTF-8 sequence.
Definition: utf8.h:16
#define UTF8_IS_CONTINUATION(byte)
Check for a continuation byte.
Definition: utf8.h:44
#define UTF8_CONTINUATION_MASK
Bit mask for data bits in a continuation byte.
Definition: utf8.h:34
#define DBGC(...)
Definition: compiler.h:505
#define min(x, y)
Definition: ath.h:36
unsigned int remaining
Number of remaining continuation bytes.
Definition: utf8.h:62
#define UTF8_INVALID
Invalid character returned when decoding fails.
Definition: utf8.h:55
unsigned long tmp
Definition: linux_pci.h:65
#define UTF8_MIN_FOUR
Minimum legal value for four-byte UTF-8 sequence.
Definition: utf8.h:25
#define UTF8_CONTINUATION_BITS
Number of data bits in each continuation byte.
Definition: utf8.h:31
unsigned int min
Minimum legal character.
Definition: utf8.h:64
assert((readw(&hdr->flags) &(GTF_reading|GTF_writing))==0)
ring len
Length.
Definition: dwmac.h:231
#define UTF8_HIGH_BIT
High bit of UTF-8 bytes.
Definition: utf8.h:28
#define UTF8_IS_ASCII(byte)
Check for an ASCII byte.
Definition: utf8.h:52
unsigned char uint8_t
Definition: stdint.h:10
unsigned char byte
Definition: smc9000.h:38
#define UTF8_MIN_THREE
Minimum legal value for three-byte UTF-8 sequence.
Definition: utf8.h:22
#define DBGC2(...)
Definition: compiler.h:522
#define UTF8_MIN_TWO
Minimum legal value for two-byte UTF-8 sequence.
Definition: utf8.h:19
unsigned int character
Character in progress.
Definition: utf8.h:60

References assert(), utf8_accumulator::character, DBGC, DBGC2, len, min, utf8_accumulator::min, utf8_accumulator::remaining, tmp, UTF8_CONTINUATION_BITS, UTF8_CONTINUATION_MASK, UTF8_HIGH_BIT, UTF8_INVALID, UTF8_IS_ASCII, UTF8_IS_CONTINUATION, UTF8_MAX_LEN, UTF8_MIN_FOUR, UTF8_MIN_THREE, and UTF8_MIN_TWO.

Referenced by efi_putchar(), fbcon_putchar(), and utf8_accumulate_okx().