Line data Source code
1 : /* File: utf8stringbuf.c; Copyright and License: see below */
2 :
3 : #include <inttypes.h>
4 : #include "utf8stringbuf/utf8stringbuf.h"
5 :
/*
 *  Buffer of size 1 that represents the empty string.
 *  It always contains a single zero byte. It is intentionally not const:
 *  a terminating zero may be written to it again, therefore it has to be
 *  located in a writeable memory page.
 */
char utf8stringbuf_private_empty_buf[1] = { '\0' };

/* printf-style conversion specification for a signed 64 bit integer in decimal notation */
const char *utf8stringbuf_private_format_signed_64_bit_int = "%" PRIi64;

/* printf-style conversion specification for a 64 bit integer in lower-case hexadecimal notation */
const char *utf8stringbuf_private_format_64_bit_hex = "%" PRIx64;
12 :
13 : /*!
14 : * \fn utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source )
15 : * \private
16 : */
17 : /* function to write a code point as utf8, returns the number of bytes written and an error code */
18 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len );
19 :
20 : /* utf8 sequences longer or equal 2 bytes start with a byte with 2 highest bits set: 0xc0 */
21 : /* utf8 sequences longer or equal 3 bytes start with a byte with 3 highest bits set: 0xe0 */
22 : /* utf8 sequences equal 4 bytes start with a byte with 4 highest bits set: 0xf0 */
23 : static const unsigned char utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[5] = { 0, 0, 0xc0, 0xe0, 0xf0 };
24 :
25 : /* Note: There is some magic in the design of utf8 which makes the implementation of this function quite short */
26 95 : unsigned int utf8_string_buf_private_make_null_termination( utf8stringbuf_t this_ ) {
27 : unsigned int truncatedLength;
28 95 : int clearAtEnd = 1;
29 :
30 334 : for ( int searchBackwards = 2; searchBackwards <= 4; searchBackwards ++ ) {
31 258 : if ( searchBackwards > this_.size ) {
32 4 : break;
33 : }
34 254 : const char pattern = utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[searchBackwards];
35 254 : if ( ( this_.buf[this_.size-searchBackwards] & pattern ) == pattern ) {
36 15 : clearAtEnd = searchBackwards;
37 15 : break;
38 : }
39 : }
40 :
41 95 : truncatedLength = this_.size - clearAtEnd;
42 : /* this_.buf[truncatedLength] = '\0'; */ /* Note: some functions like splitIn2 require complete zeroed trailings */
43 95 : memset( &(this_.buf[truncatedLength]), '\0', clearAtEnd );
44 95 : return truncatedLength;
45 : }
46 :
47 : #ifdef UTF8STRINGBUF_UNCHECKED_RANGE
48 : utf8error_t utf8stringbuf_copy_region_from_buf( utf8stringbuf_t this_, const utf8stringbuf_t that, int start, int length ) {
49 : utf8error_t result;
50 : if (( start < 0 ) || ( length < 0 )) {
51 : this_.buf[0] = '\0';
52 : result = UTF8ERROR_OUT_OF_RANGE;
53 : }
54 : else {
55 : const int thatLen = utf8stringbuf_get_length( that );
56 : if ( start > thatLen ) {
57 : this_.buf[0] = '\0';
58 : result = UTF8ERROR_OUT_OF_RANGE;
59 : }
60 : else if ( start+length > thatLen ) {
61 : strncpy( this_.buf, &(that.buf[start]), this_.size );
62 : utf8_string_buf_private_make_null_termination( this_ );
63 : result = UTF8ERROR_OUT_OF_RANGE;
64 : }
65 : else {
66 : if ( length >= this_.size ) {
67 : memcpy( this_.buf, &(that.buf[start]), this_.size-1);
68 : utf8_string_buf_private_make_null_termination( this_ );
69 : result = UTF8ERROR_TRUNCATED;
70 : }
71 : else {
72 : memcpy( this_.buf, &(that.buf[start]), length );
73 : this_.buf[length] = '\0';
74 : result = UTF8ERROR_SUCCESS;
75 : }
76 : }
77 : }
78 : return result;
79 : }
80 : #endif /* UTF8STRINGBUF_UNCHECKED_RANGE */
81 :
82 : #ifdef UTF8STRINGBUF_UNCHECKED_RANGE
83 : utf8error_t utf8stringbuf_copy_region_from_str( utf8stringbuf_t this_, const char *that, int start, int length ) {
84 : utf8error_t result;
85 : if (( start < 0 ) || ( length < 0 )) {
86 : this_.buf[0] = '\0';
87 : result = UTF8ERROR_OUT_OF_RANGE;
88 : }
89 : else if ( that == NULL ) {
90 : this_.buf[0] = '\0';
91 : result = UTF8ERROR_NULL_PARAM;
92 : }
93 : else {
94 : const int thatLen = strlen( that );
95 : if ( start > thatLen ) {
96 : this_.buf[0] = '\0';
97 : result = UTF8ERROR_OUT_OF_RANGE;
98 : }
99 : else if ( start+length > thatLen ) {
100 : strncpy( this_.buf, &(that[start]), this_.size );
101 : utf8_string_buf_private_make_null_termination( this_ );
102 : result = UTF8ERROR_OUT_OF_RANGE;
103 : }
104 : else {
105 : if ( length >= this_.size ) {
106 : memcpy( this_.buf, &(that[start]), this_.size-1);
107 : utf8_string_buf_private_make_null_termination( this_ );
108 : result = UTF8ERROR_TRUNCATED;
109 : }
110 : else {
111 : memcpy( this_.buf, &(that[start]), length );
112 : this_.buf[length] = '\0';
113 : result = UTF8ERROR_SUCCESS;
114 : }
115 : }
116 : }
117 : return result;
118 : }
119 : #endif /* UTF8STRINGBUF_UNCHECKED_RANGE */
120 :
121 14 : utf8error_t utf8stringbuf_append_char( utf8stringbuf_t this_, const uint32_t appendix ) {
122 : utf8error_t result;
123 14 : const unsigned int start = utf8stringbuf_get_length( this_ );
124 : int appendLen;
125 14 : result = utf8stringbuf_private_write_char( &(this_.buf[start]), this_.size - start - 1, appendix, &appendLen );
126 14 : if ( result == UTF8ERROR_SUCCESS ) {
127 11 : this_.buf[start+appendLen] = '\0';
128 : }
129 14 : return result;
130 : }
131 :
132 5 : utf8error_t utf8stringbuf_append_wstr( utf8stringbuf_t this_, const wchar_t *appendix ) {
133 5 : utf8error_t result = UTF8ERROR_NULL_PARAM;
134 5 : if ( appendix != NULL ) {
135 4 : unsigned int start = utf8stringbuf_get_length( this_ );
136 4 : result = UTF8ERROR_SUCCESS;
137 9 : for( ; appendix[0]!=L'\0'; appendix = &(appendix[1]) ) {
138 : int appendLen;
139 7 : result |= utf8stringbuf_private_write_char( &(this_.buf[start]), this_.size - start - 1, appendix[0], &appendLen );
140 7 : if ( result != UTF8ERROR_SUCCESS ) {
141 2 : if ( ( result & UTF8ERROR_TRUNCATED ) != 0 ) {
142 2 : break;
143 : }
144 : }
145 5 : start = start + appendLen;
146 : }
147 4 : this_.buf[start] = '\0';
148 : }
149 5 : return result;
150 : }
151 :
152 21 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len ) {
153 21 : *out_len = 0;
154 21 : utf8error_t result = UTF8ERROR_TRUNCATED;
155 21 : if ( source <= 0x7ff ) {
156 12 : if ( source <= 0x7f ) {
157 : /* 1 byte character */
158 : /* check if there is enough space for the character */
159 10 : if ( max_size >= 1 ) {
160 8 : destination[0] = source;
161 8 : *out_len = 1;
162 8 : result = UTF8ERROR_SUCCESS;
163 : }
164 : }
165 : else {
166 : /* 2 byte character */
167 2 : if ( max_size >= 2 ) {
168 2 : destination[0] = 0xc0 | ( source >> 6 );
169 2 : destination[1] = 0x80 | ( source & 0x3f );
170 2 : *out_len = 2;
171 2 : result = UTF8ERROR_SUCCESS;
172 : }
173 : }
174 : }
175 : else {
176 9 : if ( source <= 0x10ffff ) {
177 8 : if ( source <= 0xffff ) {
178 : /* 3 byte character */
179 6 : if ( max_size >= 3 ) {
180 4 : destination[0] = 0xe0 | ( source >> 12 );
181 4 : destination[1] = 0x80 | (( source >> 6 ) & 0x3f );
182 4 : destination[2] = 0x80 | ( source & 0x3f );
183 4 : *out_len = 3;
184 4 : result = UTF8ERROR_SUCCESS;
185 : }
186 : }
187 : else {
188 : /* 4 byte character */
189 2 : if ( max_size >= 4 ) {
190 2 : destination[0] = 0xf0 | ( source >> 18 );
191 2 : destination[1] = 0x80 | (( source >> 12 ) & 0x3f );
192 2 : destination[2] = 0x80 | (( source >> 6 ) & 0x3f );
193 2 : destination[3] = 0x80 | ( source & 0x3f );
194 2 : *out_len = 4;
195 2 : result = UTF8ERROR_SUCCESS;
196 : }
197 : }
198 : }
199 : else {
200 : /* note: utf8 can not encode more than 21 bits per character, and even there only 0-0x10ffff is allowed. */
201 1 : result = UTF8ERROR_NOT_A_CODEPOINT;
202 : }
203 : }
204 21 : return result;
205 : }
206 :
207 2117172 : utf8error_t utf8_string_buf_private_replace_region_by_str( utf8stringbuf_t this_, unsigned int this_Length, int start, int length, const char *replacement ) {
208 2117172 : utf8error_t result = UTF8ERROR_OUT_OF_RANGE;
209 2117172 : if (( start >= 0 ) && ( start <= this_Length ) && ( length >= 0 ) && (( start + length ) <= this_Length )) {
210 2117172 : result = UTF8ERROR_SUCCESS;
211 2117172 : unsigned int replaceLen = ( replacement == NULL ) ? (0) : ( strlen(replacement) );
212 2117172 : int tailLen = this_Length - start - length;
213 2117172 : if ( length > replaceLen ) {
214 10006 : memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen+1 );
215 : }
216 2107166 : else if ( length < replaceLen ) {
217 10014 : if ( ( start + replaceLen + tailLen ) < this_.size ) {
218 10004 : memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen+1 );
219 : }
220 10 : else if ( ( start + replaceLen ) < this_.size ) {
221 8 : tailLen = this_.size - start - replaceLen - 1;
222 8 : memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen );
223 8 : result = UTF8ERROR_TRUNCATED;
224 : }
225 : else {
226 2 : replaceLen = this_.size - start - 1;
227 2 : result = UTF8ERROR_TRUNCATED;
228 : }
229 : }
230 : else {
231 : /* original and replacement strings have equal lengths */
232 : }
233 : /* replace string */
234 2117172 : if ( replaceLen > 0 ) {
235 2117164 : const size_t replace_len = (size_t)(replaceLen);
236 2117164 : memcpy ( &(this_.buf[start]), replacement, replace_len );
237 : }
238 : /* terminate string */
239 2117172 : if ( result != UTF8ERROR_SUCCESS ) {
240 10 : utf8_string_buf_private_make_null_termination( this_ );
241 : }
242 : }
243 2117172 : return result;
244 : }
245 :
246 20 : utf8error_t utf8stringbuf_replace_all( const utf8stringbuf_t this_, const char *const ((*patterns_and_replacements)[][2]) ) {
247 20 : utf8error_t result = UTF8ERROR_NULL_PARAM;
248 :
249 : /* count input patterns */
250 20 : int maxPatternIdx = 0;
251 20 : if ( patterns_and_replacements != NULL ) {
252 19 : result = UTF8ERROR_SUCCESS;
253 278 : for ( maxPatternIdx = 0; (*patterns_and_replacements)[maxPatternIdx][0] != NULL; maxPatternIdx ++ ) {
254 : };
255 : }
256 :
257 : /* search patterns */
258 20 : unsigned int thisLen = utf8stringbuf_get_length( this_ );
259 3165827 : for ( int index = 0; index < thisLen; index ++ ) {
260 3165807 : int matchingPatternIdx = -1;
261 3165807 : unsigned int remainingLength = thisLen-index;
262 35692462 : for ( int patternIdx = 0; ( patternIdx < maxPatternIdx )&&( matchingPatternIdx == -1 ); patternIdx ++ ) {
263 32526655 : int finished = 0;
264 67180507 : for ( int compareIdx = 0; ( compareIdx <= remainingLength )&&( finished == 0 ); compareIdx ++ ) {
265 34653852 : char cmpChar = (*patterns_and_replacements)[patternIdx][0][compareIdx];
266 34653852 : if ( cmpChar == '\0' ) {
267 2117252 : if ( compareIdx != 0 ) {
268 : /* all characters were equal (and there was at least one) */
269 2117172 : matchingPatternIdx = patternIdx;
270 : }
271 2117252 : finished = 1;
272 : }
273 34653852 : if ( this_.buf[index+compareIdx] != cmpChar ) {
274 : /* difference found */
275 32526647 : finished = 1;
276 : }
277 : }
278 : }
279 : /* replace pattern */
280 3165807 : if ( matchingPatternIdx != -1 ) {
281 2117172 : const char * pattern = (*patterns_and_replacements)[matchingPatternIdx][0];
282 2117172 : int patternLen = strlen( pattern );
283 2117172 : const char * replacement = (*patterns_and_replacements)[matchingPatternIdx][1];
284 2117172 : int replaceLen = 0;
285 2117172 : if ( replacement != NULL ) {
286 2117164 : replaceLen = strlen(replacement);
287 : }
288 : utf8error_t replaceErr;
289 2117172 : replaceErr = utf8_string_buf_private_replace_region_by_str( this_, thisLen, index, patternLen, replacement );
290 2117172 : if ( replaceErr != UTF8ERROR_SUCCESS ) {
291 10 : result = UTF8ERROR_TRUNCATED;
292 10 : thisLen = utf8stringbuf_get_length( this_ );
293 : }
294 : else {
295 2117162 : thisLen = thisLen - patternLen + replaceLen;
296 : }
297 2117172 : index = index + replaceLen - 1;
298 : }
299 : }
300 :
301 20 : return result;
302 : }
303 :
304 :
305 : /*
306 : * Copyright 2012-2024 Andreas Warnke
307 : *
308 : * Licensed under the Apache License, Version 2.0 (the "License");
309 : * you may not use this file except in compliance with the License.
310 : * You may obtain a copy of the License at
311 : *
312 : * http://www.apache.org/licenses/LICENSE-2.0
313 : *
314 : * Unless required by applicable law or agreed to in writing, software
315 : * distributed under the License is distributed on an "AS IS" BASIS,
316 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
317 : * See the License for the specific language governing permissions and
318 : * limitations under the License.
319 : */
|