iPXE
utf8.c
Go to the documentation of this file.
/*
 * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA.
 *
 * You can also choose to distribute this program under the terms of
 * the Unmodified Binary Distribution Licence (as given in the file
 * COPYING.UBDL), provided that you have satisfied its requirements.
 */

24FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
25FILE_SECBOOT ( PERMITTED );
26
27#include <stdint.h>
28#include <assert.h>
29#include <ipxe/utf8.h>
30
/** @file
 *
 * UTF-8 Unicode encoding
 *
 */

37/**
38 * Accumulate Unicode character from UTF-8 byte sequence
39 *
40 * @v utf8 UTF-8 accumulator
41 * @v byte UTF-8 byte
42 * @ret character Unicode character, or 0 if incomplete
43 */
44unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
45 static unsigned int min[] = {
49 };
50 unsigned int shift;
51 unsigned int len;
53
54 /* Handle continuation bytes */
55 if ( UTF8_IS_CONTINUATION ( byte ) ) {
56
57 /* Fail if this is an unexpected continuation byte */
58 if ( utf8->remaining == 0 ) {
59 DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
60 return UTF8_INVALID;
61 }
62
63 /* Apply continuation byte */
65 utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
66
67 /* Return 0 if more continuation bytes are expected */
68 if ( --utf8->remaining != 0 )
69 return 0;
70
71 /* Fail if sequence is illegal */
72 if ( utf8->character < utf8->min ) {
73 DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
74 utf8->character );
75 return UTF8_INVALID;
76 }
77
78 /* Sanity check */
79 assert ( utf8->character != 0 );
80
81 /* Return completed character */
82 DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
83 utf8, utf8->character );
84 return utf8->character;
85 }
86
87 /* Reset state and report failure if this is an unexpected
88 * non-continuation byte. Do not return UTF8_INVALID since
89 * doing so could cause us to drop a valid ASCII character.
90 */
91 if ( utf8->remaining != 0 ) {
92 shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
93 DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
94 utf8, byte, ( utf8->character << shift ),
95 ( ( 1 << shift ) - 1 ) );
96 utf8->remaining = 0;
97 }
98
99 /* Handle initial bytes */
100 if ( ! UTF8_IS_ASCII ( byte ) ) {
101
102 /* Sanity check */
103 assert ( utf8->remaining == 0 );
104
105 /* Count total number of bytes in sequence */
106 tmp = byte;
107 len = 0;
108 while ( tmp & UTF8_HIGH_BIT ) {
109 tmp <<= 1;
110 len++;
111 }
112
113 /* Check for illegal length */
114 if ( len > UTF8_MAX_LEN ) {
115 DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
116 utf8, byte, len );
117 return UTF8_INVALID;
118 }
119
120 /* Store initial bits of character */
121 utf8->character = ( tmp >> len );
122
123 /* Store number of bytes remaining */
124 len--;
125 utf8->remaining = len;
126 assert ( utf8->remaining > 0 );
127
128 /* Store minimum legal value */
129 utf8->min = min[ len - 1 ];
130 assert ( utf8->min > 0 );
131
132 /* Await continuation bytes */
133 return 0;
134 }
135
136 /* Handle ASCII bytes */
137 return byte;
138}
unsigned char uint8_t
Definition stdint.h:10
Assertions.
#define assert(condition)
Assert a condition at run-time.
Definition assert.h:50
#define min(x, y)
Definition ath.h:36
ring len
Length.
Definition dwmac.h:226
#define DBGC2(...)
Definition compiler.h:522
#define DBGC(...)
Definition compiler.h:505
#define FILE_LICENCE(_licence)
Declare a particular licence as applying to a file.
Definition compiler.h:896
#define FILE_SECBOOT(_status)
Declare a file's UEFI Secure Boot permission status.
Definition compiler.h:926
unsigned long tmp
Definition linux_pci.h:65
unsigned char byte
Definition smc9000.h:38
A UTF-8 character accumulator.
Definition utf8.h:58
unsigned int remaining
Number of remaining continuation bytes.
Definition utf8.h:62
unsigned int min
Minimum legal character.
Definition utf8.h:64
unsigned int character
Character in progress.
Definition utf8.h:60
unsigned int utf8_accumulate(struct utf8_accumulator *utf8, uint8_t byte)
Accumulate Unicode character from UTF-8 byte sequence.
Definition utf8.c:44
UTF-8 Unicode encoding.
#define UTF8_CONTINUATION_BITS
Number of data bits in each continuation byte.
Definition utf8.h:31
#define UTF8_MAX_LEN
Maximum length of UTF-8 sequence.
Definition utf8.h:16
#define UTF8_MIN_THREE
Minimum legal value for three-byte UTF-8 sequence.
Definition utf8.h:22
#define UTF8_IS_CONTINUATION(byte)
Check for a continuation byte.
Definition utf8.h:44
#define UTF8_MIN_FOUR
Minimum legal value for four-byte UTF-8 sequence.
Definition utf8.h:25
#define UTF8_MIN_TWO
Minimum legal value for two-byte UTF-8 sequence.
Definition utf8.h:19
#define UTF8_HIGH_BIT
High bit of UTF-8 bytes.
Definition utf8.h:28
#define UTF8_INVALID
Invalid character returned when decoding fails.
Definition utf8.h:55
#define UTF8_CONTINUATION_MASK
Bit mask for data bits in a continuation byte.
Definition utf8.h:34
#define UTF8_IS_ASCII(byte)
Check for an ASCII byte.
Definition utf8.h:52