Line data Source code
1 : /* File: utf8stringbuf.c; Copyright and License: see below */ 2 : 3 : #include <inttypes.h> 4 : #include "utf8stringbuf/utf8stringbuf.h" 5 : 6 : /* utf8stringbuf_private_empty_buf is constantly 0, but may be overwritten by a 0 - therefore stored in a read-writeable memory page */ 7 : char utf8stringbuf_private_empty_buf[1] = ""; 8 : 9 : const char *utf8stringbuf_private_format_signed_64_bit_int = "%" PRIi64; 10 : 11 : const char *utf8stringbuf_private_format_64_bit_hex = "%" PRIx64; 12 : 13 : /*! 14 : * \fn utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source ) 15 : * \private 16 : */ 17 : /* function to write a code point as utf8, returns the number of bytes written and an error code */ 18 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len ); 19 : 20 : /* utf8 sequences longer or equal 2 bytes start with a byte with 2 highest bits set: 0xc0 */ 21 : /* utf8 sequences longer or equal 3 bytes start with a byte with 3 highest bits set: 0xe0 */ 22 : /* utf8 sequences equal 4 bytes start with a byte with 4 highest bits set: 0xf0 */ 23 : static const unsigned char utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[5] = { 0, 0, 0xc0, 0xe0, 0xf0 }; 24 : 25 : /* Note: There is some magic in the design of utf8 which makes the implementation of this function quite short */ 26 85 : unsigned int utf8_string_buf_private_make_null_termination( utf8stringbuf_t *this_ ) { 27 : unsigned int truncatedLength; 28 85 : int clearAtEnd = 1; 29 : 30 309 : for ( int searchBackwards = 2; searchBackwards <= 4; searchBackwards ++ ) { 31 237 : if ( searchBackwards > (*this_).size ) { 32 4 : break; 33 : } 34 233 : const char pattern = utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[searchBackwards]; 35 233 : if ( ( (*this_).buf[(*this_).size-searchBackwards] & pattern ) == pattern ) { 36 9 : clearAtEnd = searchBackwards; 37 9 : break; 38 : } 39 : } 40 : 41 85 : truncatedLength = (*this_).size - clearAtEnd; 42 : /* (*this_).buf[truncatedLength] = '\0'; */ /* Note: some functions like splitIn2 require complete zeroed trailings */ 43 85 : memset( &((*this_).buf[truncatedLength]), '\0', clearAtEnd ); 44 85 : return truncatedLength; 45 : } 46 : 47 14 : utf8error_t utf8stringbuf_append_char( utf8stringbuf_t *this_, const uint32_t appendix ) { 48 : utf8error_t result; 49 14 : const unsigned int start = utf8stringbuf_get_length( this_ ); 50 : int appendLen; 51 14 : result = utf8stringbuf_private_write_char( &((*this_).buf[start]), (*this_).size - start - 1, appendix, &appendLen ); 52 14 : if ( result == UTF8ERROR_SUCCESS ) { 53 11 : (*this_).buf[start+appendLen] = '\0'; 54 : } 55 14 : return result; 56 : } 57 : 58 5 : utf8error_t utf8stringbuf_append_wstr( utf8stringbuf_t *this_, const wchar_t *appendix ) { 59 5 : utf8error_t result = UTF8ERROR_NULL_PARAM; 60 5 : if ( appendix != NULL ) { 61 4 : unsigned int start = utf8stringbuf_get_length( this_ ); 62 4 : result = UTF8ERROR_SUCCESS; 63 9 : for( ; appendix[0]!=L'\0'; appendix = &(appendix[1]) ) { 64 : int appendLen; 65 7 : result |= utf8stringbuf_private_write_char( &((*this_).buf[start]), (*this_).size - start - 1, appendix[0], &appendLen ); 66 7 : if ( result != UTF8ERROR_SUCCESS ) { 67 2 : if ( ( result & UTF8ERROR_TRUNCATED ) != 0 ) { 68 2 : break; 69 : } 70 : } 71 5 : start = start + appendLen; 72 : } 73 4 : (*this_).buf[start] = '\0'; 74 : } 75 5 : return result; 76 : } 77 : 78 21 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len ) { 79 21 : *out_len = 0; 80 21 : utf8error_t result = UTF8ERROR_TRUNCATED; 81 21 : if ( source <= 0x7ff ) { 82 12 : if ( source <= 0x7f ) { 83 : /* 1 byte character */ 84 : /* check if there is enough space for the character */ 85 10 : if ( max_size >= 1 ) { 86 8 : destination[0] = source; 87 8 : *out_len = 1; 88 8 : result = UTF8ERROR_SUCCESS; 89 : } 90 : } 91 : else { 92 : /* 2 byte character */ 93 2 : if ( max_size >= 2 ) { 94 2 : destination[0] = 0xc0 | ( source >> 6 ); 95 2 : destination[1] = 0x80 | ( source & 0x3f ); 96 2 : *out_len = 2; 97 2 : result = UTF8ERROR_SUCCESS; 98 : } 99 : } 100 : } 101 : else { 102 9 : if ( source <= 0x10ffff ) { 103 8 : if ( source <= 0xffff ) { 104 : /* 3 byte character */ 105 6 : if ( max_size >= 3 ) { 106 4 : destination[0] = 0xe0 | ( source >> 12 ); 107 4 : destination[1] = 0x80 | (( source >> 6 ) & 0x3f ); 108 4 : destination[2] = 0x80 | ( source & 0x3f ); 109 4 : *out_len = 3; 110 4 : result = UTF8ERROR_SUCCESS; 111 : } 112 : } 113 : else { 114 : /* 4 byte character */ 115 2 : if ( max_size >= 4 ) { 116 2 : destination[0] = 0xf0 | ( source >> 18 ); 117 2 : destination[1] = 0x80 | (( source >> 12 ) & 0x3f ); 118 2 : destination[2] = 0x80 | (( source >> 6 ) & 0x3f ); 119 2 : destination[3] = 0x80 | ( source & 0x3f ); 120 2 : *out_len = 4; 121 2 : result = UTF8ERROR_SUCCESS; 122 : } 123 : } 124 : } 125 : else { 126 : /* note: utf8 can not encode more than 21 bits per character, and even there only 0-0x10ffff is allowed. */ 127 1 : result = UTF8ERROR_NOT_A_CODEPOINT; 128 : } 129 : } 130 21 : return result; 131 : } 132 : 133 : 134 : /* 135 : * Copyright 2012-2025 Andreas Warnke 136 : * 137 : * Licensed under the Apache License, Version 2.0 (the "License"); 138 : * you may not use this file except in compliance with the License. 139 : * You may obtain a copy of the License at 140 : * 141 : * http://www.apache.org/licenses/LICENSE-2.0 142 : * 143 : * Unless required by applicable law or agreed to in writing, software 144 : * distributed under the License is distributed on an "AS IS" BASIS, 145 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 146 : * See the License for the specific language governing permissions and 147 : * limitations under the License. 148 : */