Line data Source code
1 : /* File: utf8stringbuf.c; Copyright and License: see below */
2 :
3 : #include <inttypes.h>
4 : #include "utf8stringbuf/utf8stringbuf.h"
5 :
6 : /* utf8stringbuf_private_empty_buf always holds a single 0 byte; it is deliberately not const because it may be overwritten by a 0 - therefore it is stored in a read-writeable memory page */
7 : char utf8stringbuf_private_empty_buf[1] = "";
8 :
9 : const char *utf8stringbuf_private_format_signed_64_bit_int = "%" PRIi64; /* printf-style conversion for an int64_t in decimal notation */
10 :
11 : const char *utf8stringbuf_private_format_64_bit_hex = "%" PRIx64; /* printf-style conversion for a uint64_t in lower-case hexadecimal notation */
12 :
13 : /*!
14 : * \fn utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len )
15 : * \private
16 : */
17 : /* function to write a code point as utf8; returns an error code and stores the number of bytes written in *out_len (0 if nothing was written) */
18 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len );
19 :
20 : /* the table below is indexed by the distance (in bytes) from the end of the buffer; indices 0 and 1 are unused placeholders. */ /* utf8 sequences of 2 or more bytes start with a byte that has its 2 highest bits set: 0xc0 */
21 : /* utf8 sequences of 3 or more bytes start with a byte that has its 3 highest bits set: 0xe0 */
22 : /* utf8 sequences of exactly 4 bytes start with a byte that has its 4 highest bits set: 0xf0 */
23 : static const unsigned char utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[5] = { 0, 0, 0xc0, 0xe0, 0xf0 };
24 :
25 : /* Note: There is some magic in the design of utf8 which makes the implementation of this function quite short */
26 85 : unsigned int utf8_string_buf_private_make_null_termination( utf8stringbuf_t *this_ ) {
27 : unsigned int truncatedLength;
28 85 : int clearAtEnd = 1;
29 :
30 309 : for ( int searchBackwards = 2; searchBackwards <= 4; searchBackwards ++ ) {
31 237 : if ( searchBackwards > (*this_).size ) {
32 4 : break;
33 : }
34 233 : const char pattern = utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[searchBackwards];
35 233 : if ( ( (*this_).buf[(*this_).size-searchBackwards] & pattern ) == pattern ) {
36 9 : clearAtEnd = searchBackwards;
37 9 : break;
38 : }
39 : }
40 :
41 85 : truncatedLength = (*this_).size - clearAtEnd;
42 : /* (*this_).buf[truncatedLength] = '\0'; */ /* Note: some functions like splitIn2 require complete zeroed trailings */
43 85 : memset( &((*this_).buf[truncatedLength]), '\0', clearAtEnd );
44 85 : return truncatedLength;
45 : }
46 :
47 14 : utf8error_t utf8stringbuf_append_char( utf8stringbuf_t *this_, const uint32_t appendix ) {
48 : utf8error_t result;
49 14 : const unsigned int start = utf8stringbuf_get_length( this_ );
50 : int appendLen;
51 14 : result = utf8stringbuf_private_write_char( &((*this_).buf[start]), (*this_).size - start - 1, appendix, &appendLen );
52 14 : if ( result == UTF8ERROR_SUCCESS ) {
53 11 : (*this_).buf[start+appendLen] = '\0';
54 : }
55 14 : return result;
56 : }
57 :
58 5 : utf8error_t utf8stringbuf_append_wstr( utf8stringbuf_t *this_, const wchar_t *appendix ) {
59 5 : utf8error_t result = UTF8ERROR_NULL_PARAM;
60 5 : if ( appendix != NULL ) {
61 4 : unsigned int start = utf8stringbuf_get_length( this_ );
62 4 : result = UTF8ERROR_SUCCESS;
63 9 : for( ; appendix[0]!=L'\0'; appendix = &(appendix[1]) ) {
64 : int appendLen;
65 7 : result |= utf8stringbuf_private_write_char( &((*this_).buf[start]), (*this_).size - start - 1, appendix[0], &appendLen );
66 7 : if ( result != UTF8ERROR_SUCCESS ) {
67 2 : if ( ( result & UTF8ERROR_TRUNCATED ) != 0 ) {
68 2 : break;
69 : }
70 : }
71 5 : start = start + appendLen;
72 : }
73 4 : (*this_).buf[start] = '\0';
74 : }
75 5 : return result;
76 : }
77 :
78 21 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len ) {
79 21 : *out_len = 0;
80 21 : utf8error_t result = UTF8ERROR_TRUNCATED;
81 21 : if ( source <= 0x7ff ) {
82 12 : if ( source <= 0x7f ) {
83 : /* 1 byte character */
84 : /* check if there is enough space for the character */
85 10 : if ( max_size >= 1 ) {
86 8 : destination[0] = source;
87 8 : *out_len = 1;
88 8 : result = UTF8ERROR_SUCCESS;
89 : }
90 : }
91 : else {
92 : /* 2 byte character */
93 2 : if ( max_size >= 2 ) {
94 2 : destination[0] = 0xc0 | ( source >> 6 );
95 2 : destination[1] = 0x80 | ( source & 0x3f );
96 2 : *out_len = 2;
97 2 : result = UTF8ERROR_SUCCESS;
98 : }
99 : }
100 : }
101 : else {
102 9 : if ( source <= 0x10ffff ) {
103 8 : if ( source <= 0xffff ) {
104 : /* 3 byte character */
105 6 : if ( max_size >= 3 ) {
106 4 : destination[0] = 0xe0 | ( source >> 12 );
107 4 : destination[1] = 0x80 | (( source >> 6 ) & 0x3f );
108 4 : destination[2] = 0x80 | ( source & 0x3f );
109 4 : *out_len = 3;
110 4 : result = UTF8ERROR_SUCCESS;
111 : }
112 : }
113 : else {
114 : /* 4 byte character */
115 2 : if ( max_size >= 4 ) {
116 2 : destination[0] = 0xf0 | ( source >> 18 );
117 2 : destination[1] = 0x80 | (( source >> 12 ) & 0x3f );
118 2 : destination[2] = 0x80 | (( source >> 6 ) & 0x3f );
119 2 : destination[3] = 0x80 | ( source & 0x3f );
120 2 : *out_len = 4;
121 2 : result = UTF8ERROR_SUCCESS;
122 : }
123 : }
124 : }
125 : else {
126 : /* note: utf8 can not encode more than 21 bits per character, and even there only 0-0x10ffff is allowed. */
127 1 : result = UTF8ERROR_NOT_A_CODEPOINT;
128 : }
129 : }
130 21 : return result;
131 : }
132 :
133 :
134 : /*
135 : * Copyright 2012-2025 Andreas Warnke
136 : *
137 : * Licensed under the Apache License, Version 2.0 (the "License");
138 : * you may not use this file except in compliance with the License.
139 : * You may obtain a copy of the License at
140 : *
141 : * http://www.apache.org/licenses/LICENSE-2.0
142 : *
143 : * Unless required by applicable law or agreed to in writing, software
144 : * distributed under the License is distributed on an "AS IS" BASIS,
145 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
146 : * See the License for the specific language governing permissions and
147 : * limitations under the License.
148 : */
|