utf8_8c_source.html

 /*
  * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2 of the
  * License, or any later version.
  *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  * 02110-1301, USA.
  *
  * You can also choose to distribute this program under the terms of
  * the Unmodified Binary Distribution Licence (as given in the file
  * COPYING.UBDL), provided that you have satisfied its requirements.
  */

 FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );

 #include <stdint.h>
 #include <assert.h>
 #include <ipxe/utf8.h>

 /** @file
  *
  * UTF-8 Unicode encoding
  *
  */

 /**
  * Accumulate Unicode character from UTF-8 byte sequence
  *
  * Bytes are fed in one at a time; the accumulator carries the
  * partially decoded character between calls.
  *
  * The following are all reported as UTF8_INVALID: unexpected
  * continuation bytes, initial bytes announcing a sequence longer
  * than UTF8_MAX_LEN, overlong encodings, UTF-16 surrogate code
  * points (U+D800-U+DFFF), and code points above U+10FFFF.
  *
  * A non-continuation byte arriving in the middle of a sequence
  * silently abandons the partial sequence and is then processed as
  * the start of a new character.
  *
  * @v utf8              UTF-8 accumulator
  * @v byte              UTF-8 byte
  * @ret character       Unicode character, 0 if incomplete, or
  *                      UTF8_INVALID on a decoding error
  */
 unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
         /* Minimum legal character for each multi-byte sequence
          * length, indexed by ( number of continuation bytes - 1 ).
          * Anything smaller is an overlong encoding.
          */
         static const unsigned int min[] = {
                 UTF8_MIN_TWO,
                 UTF8_MIN_THREE,
                 UTF8_MIN_FOUR,
         };
         /* Code point limits imposed on UTF-8 by RFC 3629 */
         static const unsigned int max_character = 0x10ffff;
         static const unsigned int surrogate_first = 0xd800;
         static const unsigned int surrogate_last = 0xdfff;
         unsigned int shift;
         unsigned int len;
         uint8_t tmp;

         /* Handle continuation bytes */
         if ( UTF8_IS_CONTINUATION ( byte ) ) {

                 /* Fail if this is an unexpected continuation byte */
                 if ( utf8->remaining == 0 ) {
                         DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
                         return UTF8_INVALID;
                 }

                 /* Apply continuation byte: make room for its data
                  * bits and merge them in below the bits so far.
                  */
                 utf8->character <<= UTF8_CONTINUATION_BITS;
                 utf8->character |= ( byte & UTF8_CONTINUATION_MASK );

                 /* Return 0 if more continuation bytes are expected */
                 if ( --utf8->remaining != 0 )
                         return 0;

                 /* Fail if sequence is illegal (overlong encoding) */
                 if ( utf8->character < utf8->min ) {
                         DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
                                utf8->character );
                         return UTF8_INVALID;
                 }

                 /* Fail if character is a UTF-16 surrogate or lies
                  * beyond the end of the Unicode code space: neither
                  * may legally be encoded as UTF-8.
                  */
                 if ( ( utf8->character > max_character ) ||
                      ( ( utf8->character >= surrogate_first ) &&
                        ( utf8->character <= surrogate_last ) ) ) {
                         DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
                                utf8->character );
                         return UTF8_INVALID;
                 }

                 /* Sanity check */
                 assert ( utf8->character != 0 );

                 /* Return completed character */
                 DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
                         utf8, utf8->character );
                 return utf8->character;
         }

         /* Reset state and report failure if this is an unexpected
          * non-continuation byte.  Do not return UTF8_INVALID since
          * doing so could cause us to drop a valid ASCII character.
          *
          * The debug message shows the bits accumulated so far
          * (shifted to their final position) and a mask of the bits
          * that were still missing.
          */
         if ( utf8->remaining != 0 ) {
                 shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
                 DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
                        utf8, byte, ( utf8->character << shift ),
                        ( ( 1 << shift ) - 1 ) );
                 utf8->remaining = 0;
         }

         /* Handle initial bytes */
         if ( ! UTF8_IS_ASCII ( byte ) ) {

                 /* Sanity check */
                 assert ( utf8->remaining == 0 );

                 /* Count total number of bytes in sequence.  Each
                  * leading set bit is shifted out of the 8-bit
                  * temporary, so that on exit it holds only the data
                  * bits, displaced left by the sequence length.
                  */
                 tmp = byte;
                 len = 0;
                 while ( tmp & UTF8_HIGH_BIT ) {
                         tmp <<= 1;
                         len++;
                 }

                 /* Check for illegal length */
                 if ( len > UTF8_MAX_LEN ) {
                         DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
                                utf8, byte, len );
                         return UTF8_INVALID;
                 }

                 /* Store initial bits of character (undo the
                  * displacement introduced while counting).
                  */
                 utf8->character = ( tmp >> len );

                 /* Store number of bytes remaining */
                 len--;
                 utf8->remaining = len;
                 assert ( utf8->remaining > 0 );

                 /* Store minimum legal value */
                 utf8->min = min[ len - 1 ];
                 assert ( utf8->min > 0 );

                 /* Await continuation bytes */
                 return 0;
         }

         /* Handle ASCII bytes */
         return byte;
 }
UTF8_MAX_LEN
#define UTF8_MAX_LEN
Maximum length of UTF-8 sequence.
Definition: utf8.h:15

UTF8_IS_CONTINUATION
#define UTF8_IS_CONTINUATION(byte)
Check for a continuation byte.
Definition: utf8.h:43

UTF8_CONTINUATION_MASK
#define UTF8_CONTINUATION_MASK
Bit mask for data bits in a continuation byte.
Definition: utf8.h:33

DBGC
#define DBGC(...)
Definition: compiler.h:505

min
#define min(x, y)
Definition: ath.h:34

utf8_accumulator::remaining
unsigned int remaining
Number of remaining continuation bytes.
Definition: utf8.h:61

UTF8_INVALID
#define UTF8_INVALID
Invalid character returned when decoding fails.
Definition: utf8.h:54

utf8.h
UTF-8 Unicode encoding.

tmp
unsigned long tmp
Definition: linux_pci.h:63

UTF8_MIN_FOUR
#define UTF8_MIN_FOUR
Minimum legal value for four-byte UTF-8 sequence.
Definition: utf8.h:24

UTF8_CONTINUATION_BITS
#define UTF8_CONTINUATION_BITS
Number of data bits in each continuation byte.
Definition: utf8.h:30

utf8_accumulator::min
unsigned int min
Minimum legal character.
Definition: utf8.h:63

assert.h
Assertions.

assert
assert((readw(&hdr->flags) &(GTF_reading|GTF_writing))==0)

UTF8_HIGH_BIT
#define UTF8_HIGH_BIT
High bit of UTF-8 bytes.
Definition: utf8.h:27

UTF8_IS_ASCII
#define UTF8_IS_ASCII(byte)
Check for an ASCII byte.
Definition: utf8.h:51

uint8_t
unsigned char uint8_t
Definition: stdint.h:10

FILE_LICENCE
FILE_LICENCE(GPL2_OR_LATER_OR_UBDL)

byte
unsigned char byte
Definition: smc9000.h:38

UTF8_MIN_THREE
#define UTF8_MIN_THREE
Minimum legal value for three-byte UTF-8 sequence.
Definition: utf8.h:21

utf8_accumulator
A UTF-8 character accumulator.
Definition: utf8.h:57

DBGC2
#define DBGC2(...)
Definition: compiler.h:522

utf8_accumulate
unsigned int utf8_accumulate(struct utf8_accumulator *utf8, uint8_t byte)
Accumulate Unicode character from UTF-8 byte sequence.
Definition: utf8.c:43

UTF8_MIN_TWO
#define UTF8_MIN_TWO
Minimum legal value for two-byte UTF-8 sequence.
Definition: utf8.h:18

stdint.h

utf8_accumulator::character
unsigned int character
Character in progress.
Definition: utf8.h:59

len
uint32_t len
Length.
Definition: ena.h:14