Line data Source code
1 : /* File: utf8stringview.inl; Copyright and License: see below */
2 :
3 : #include "u8/u8_i32.h"
4 :
5 : #ifdef __cplusplus
6 : extern "C" {
7 : #endif
8 :
9 98 : static inline utf8error_t utf8stringview_init( utf8stringview_t *this_, const char* start, size_t length )
10 : {
11 98 : assert( start != NULL );
12 98 : utf8error_t result = UTF8ERROR_SUCCESS;
13 : /* clean type would have been: char ( *start_arr )[] = (char(*)[]) start; */
14 :
15 : /* check start */
16 98 : char start_copy[4] = {'\0','\0','\0','\0'};
17 98 : const size_t start_len = ( length >= 4 ) ? 4 : length;
18 98 : memcpy( &start_copy, start, start_len );
19 :
20 98 : if ( ( 0xc0 & (start_copy[0]) ) == 0x80 )
21 : {
22 5 : if ( ( 0xc0 & (start_copy[1]) ) == 0x80 )
23 : {
24 2 : if ( ( 0xc0 & (start_copy[2]) ) == 0x80 )
25 : {
26 1 : start += 3;
27 1 : length -= 3; /* length was greater than 2 - otherwise start_copy[2] would have been 0x0 */
28 1 : result = UTF8ERROR_OUT_OF_RANGE;
29 : }
30 : else
31 : {
32 1 : start += 2;
33 1 : length -= 2; /* length was greater than 1 - otherwise start_copy[1] would have been 0x0 */
34 1 : result = UTF8ERROR_OUT_OF_RANGE;
35 : }
36 : }
37 : else
38 : {
39 3 : start += 1;
40 3 : length -= 1; /* length was greater than 0 - otherwise start_copy[0] would have been 0x0 */
41 3 : result = UTF8ERROR_OUT_OF_RANGE;
42 : }
43 : }
44 : else
45 : {
46 : /* valid start */
47 : }
48 :
49 : /* check end */
50 98 : char end_copy[4] = {'\0','\0','\0','\0'};
51 98 : const size_t end_len = ( length >= 4 ) ? 4 : length;
52 98 : memcpy( &(end_copy[4-end_len]), &(start[length-end_len]), end_len );
53 :
54 98 : if ( ( 0x80 & (end_copy[3]) ) == 0x00 )
55 : {
56 : /* valid single-byte end */
57 : }
58 : else
59 : {
60 10 : if ( ( 0xe0 & (end_copy[2]) ) == 0xc0 )
61 : {
62 : /* valid 2 byte end */
63 : }
64 9 : else if ( ( 0x80 & (end_copy[2]) ) == 0x00 )
65 : {
66 : /* 1 byte char at end_copy[2] */
67 1 : length -= 1; /* length was greater than 0 - otherwise end_copy[3] would have been 0x0 */
68 1 : result = UTF8ERROR_OUT_OF_RANGE;
69 : }
70 : else
71 : {
72 8 : if ( ( 0xf0 & (end_copy[1]) ) == 0xe0 )
73 : {
74 : /* valid 3 byte end */
75 : }
76 7 : else if ( ( 0xe0 & (end_copy[1]) ) == 0xc0 )
77 : {
78 : /* 2 byte char at end_copy[1] */
79 1 : length -= 1; /* length was greater than 0 - otherwise end_copy[3] would have been 0x0 */
80 1 : result = UTF8ERROR_OUT_OF_RANGE;
81 : }
82 6 : else if ( ( 0x80 & (end_copy[1]) ) == 0x00 )
83 : {
84 : /* 1 byte char at end_copy[1] */
85 2 : length -= 2; /* length was greater than 1 - otherwise end_copy[2] would have been 0x0 */
86 2 : result = UTF8ERROR_OUT_OF_RANGE;
87 : }
88 : else
89 : {
90 4 : if ( ( 0xf8 & (end_copy[0]) ) == 0xf0 )
91 : {
92 : /* valid 4 byte end */
93 : }
94 : else
95 : {
96 : /* assume 1 byte char at end_copy[0] */
97 2 : length -= 3; /* length was greater than 2 - otherwise end_copy[1] would have been 0x0 */
98 2 : result = UTF8ERROR_OUT_OF_RANGE;
99 : }
100 : }
101 : }
102 : }
103 :
104 98 : *this_ = (utf8stringview_t){.start=start,.length=length};
105 98 : return result;
106 : }
107 :
108 53 : static inline void utf8stringview_init_str( utf8stringview_t *this_, const char* cstring )
109 : {
110 53 : *this_ = (utf8stringview_t){.start=cstring,.length=(cstring==NULL)?0:strlen(cstring)};
111 53 : }
112 :
113 4 : static inline utf8error_t utf8stringview_init_region( utf8stringview_t *this_, const char* cstring, size_t start_idx, size_t length )
114 : {
115 4 : assert( cstring != NULL );
116 4 : utf8error_t result = UTF8ERROR_SUCCESS;
117 4 : const size_t cstring_len = strlen( cstring );
118 4 : if ( start_idx > cstring_len )
119 : {
120 1 : *this_ = (utf8stringview_t){.start=cstring+start_idx,.length=0};
121 1 : result |= UTF8ERROR_OUT_OF_RANGE;
122 : }
123 : else
124 : {
125 3 : const size_t max_len = cstring_len - start_idx;
126 3 : result |= utf8stringview_init( this_, cstring+start_idx, u8_i32_min2( length, max_len ) );
127 3 : if ( length > max_len )
128 : {
129 : /* notify that stringview cannot exceed the cstring */
130 1 : result |= UTF8ERROR_OUT_OF_RANGE;
131 : }
132 : }
133 4 : return result;
134 : }
135 :
136 151 : static inline void utf8stringview_destroy( utf8stringview_t *this_ )
137 : {
138 151 : *this_ = (utf8stringview_t){.start=NULL,.length=0};
139 151 : }
140 :
141 28134 : static inline const char* utf8stringview_get_start( const utf8stringview_t *this_ ) {
142 28134 : return (*this_).start;
143 : }
144 :
145 25606 : static inline size_t utf8stringview_get_length( const utf8stringview_t *this_ ) {
146 25606 : return (*this_).length;
147 : }
148 :
149 3 : static inline size_t utf8stringview_count_codepoints( const utf8stringview_t *this_ ) {
150 3 : size_t result = 0;
151 3 : unsigned int skip = 0;
152 3 : if ( (*this_).start != NULL ) {
153 38 : for ( size_t pos = 0; pos < (*this_).length; pos ++ )
154 : {
155 35 : if ( skip > 0 )
156 : {
157 16 : skip --;
158 16 : if ( skip == 0 ) {
159 7 : result ++; /* This is the last byte of a multi byte code point */
160 : }
161 : }
162 : else
163 : {
164 19 : const unsigned char firstByte = (const unsigned char) ((*this_).start[pos]);
165 19 : if (( 0x80 & firstByte ) == 0x00 )
166 : {
167 10 : result ++; /* This is a 1 byte code point */
168 : }
169 9 : else if (( 0xc0 & firstByte ) == 0x80 )
170 : {
171 : /* This is not a valid first byte, skipping to the next byte... */
172 : }
173 8 : else if (( 0xe0 & firstByte ) == 0xc0 )
174 : {
175 2 : skip = 1; /* This is the start of a 2 byte code point */
176 : }
177 6 : else if (( 0xf0 & firstByte ) == 0xe0 )
178 : {
179 3 : skip = 2; /* This is the start of a 3 byte code point */
180 : }
181 3 : else if (( 0xf8 & firstByte ) == 0xf0 )
182 : {
183 3 : skip = 3; /* This is the start of a 4 byte code point */
184 : }
185 : else
186 : {
187 : /* This is not a valid first byte, skipping to the next byte... */
188 : }
189 : }
190 : }
191 : }
192 3 : return result;
193 : }
194 :
195 1228 : static inline bool utf8stringview_equals_str( const utf8stringview_t *this_, const char *that )
196 : {
197 : bool result;
198 1228 : if ( that != NULL )
199 : {
200 1227 : size_t len = strlen( that );
201 1227 : if ( len == (*this_).length )
202 : {
203 899 : if ( ( len == 0 )/*&&( this_.length == 0 )*/)
204 : {
205 1 : result = true;
206 : }
207 : else
208 : {
209 898 : result = ( 0 == memcmp ( (*this_).start, that, len ) );
210 : }
211 : }
212 : else
213 : {
214 328 : result = false;
215 : }
216 : }
217 : else
218 : {
219 1 : result = false;
220 : }
221 1228 : return result;
222 : }
223 :
224 4 : static inline bool utf8stringview_equals_view( const utf8stringview_t *this_, const utf8stringview_t *that )
225 : {
226 4 : assert( that != NULL );
227 : bool result;
228 4 : if ( (*that).length == (*this_).length )
229 : {
230 2 : if ( ( (*that).length == 0 )/*&&( this_.length == 0 )*/)
231 : {
232 1 : result = true;
233 : }
234 : else
235 : {
236 1 : result = ( 0 == memcmp ( (*this_).start, (*that).start, (*that).length ) );
237 : }
238 : }
239 : else
240 : {
241 2 : result = false;
242 : }
243 4 : return result;
244 : }
245 :
246 6 : static inline bool utf8stringview_starts_with_str( const utf8stringview_t *this_, utf8string_t *that )
247 : {
248 6 : bool result = false;
249 6 : if (( this_ != NULL )&&( that != NULL ))
250 : {
251 5 : const size_t that_len = strlen( that );
252 5 : if ( that_len <= (*this_).length )
253 : {
254 4 : result = ( 0 == memcmp( (*this_).start, that, that_len ) );
255 : }
256 : else
257 : {
258 1 : result = false;
259 : }
260 : }
261 6 : return result;
262 : }
263 :
264 5 : static inline bool utf8stringview_starts_with_view( const utf8stringview_t *this_, const utf8stringview_t *that )
265 : {
266 5 : assert( that != NULL );
267 5 : bool result = false;
268 5 : if (( this_ != NULL )&&( that != NULL ))
269 : {
270 5 : if ( (*that).length <= (*this_).length )
271 : {
272 4 : result = ( 0 == memcmp( (*this_).start, (*that).start, (*that).length ) );
273 : }
274 : else
275 : {
276 1 : result = false;
277 : }
278 : }
279 5 : return result;
280 : }
281 :
282 6 : static inline bool utf8stringview_ends_with_str( const utf8stringview_t *this_, utf8string_t *that )
283 : {
284 6 : bool result = false;
285 6 : if (( this_ != NULL )&&( that != NULL ))
286 : {
287 5 : const size_t that_len = strlen( that );
288 5 : if ( that_len <= (*this_).length )
289 : {
290 4 : result = ( 0 == memcmp( (*this_).start + (*this_).length - that_len, that, that_len ) );
291 : }
292 : else
293 : {
294 1 : result = false;
295 : }
296 : }
297 6 : return result;
298 : }
299 :
300 5 : static inline bool utf8stringview_ends_with_view( const utf8stringview_t *this_, const utf8stringview_t *that )
301 : {
302 5 : assert( that != NULL );
303 5 : bool result = false;
304 5 : if (( this_ != NULL )&&( that != NULL ))
305 : {
306 5 : if ( (*that).length <= (*this_).length )
307 : {
308 4 : result = ( 0 == memcmp( (*this_).start + (*this_).length - (*that).length, (*that).start, (*that).length ) );
309 : }
310 : else
311 : {
312 1 : result = false;
313 : }
314 : }
315 5 : return result;
316 : }
317 :
318 6 : static inline bool utf8stringview_contains_str( const utf8stringview_t *this_, utf8string_t *that )
319 : {
320 6 : bool result = false;
321 6 : if (( this_ != NULL )&&( that != NULL ))
322 : {
323 5 : const size_t that_len = strlen( that );
324 5 : if ( that_len <= (*this_).length )
325 : {
326 4 : const char *const end = (*this_).start + (*this_).length - that_len;
327 13 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == false ); pos ++ )
328 : {
329 9 : if ( 0 == memcmp( pos, that, that_len ) )
330 : {
331 3 : result = true;
332 : }
333 : }
334 : }
335 : }
336 6 : return result;
337 : }
338 :
339 5 : static inline bool utf8stringview_contains_view( const utf8stringview_t *this_, const utf8stringview_t *that )
340 : {
341 5 : assert( that != NULL );
342 5 : bool result = false;
343 5 : if (( this_ != NULL )&&( that != NULL ))
344 : {
345 5 : if ( (*that).length <= (*this_).length )
346 : {
347 4 : const char *const end = (*this_).start + (*this_).length - (*that).length;
348 13 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == false ); pos ++ )
349 : {
350 9 : if ( 0 == memcmp( pos, (*that).start, (*that).length ) )
351 : {
352 3 : result = true;
353 : }
354 : }
355 : }
356 : }
357 5 : return result;
358 : }
359 :
360 16 : static inline utf8error_t utf8stringview_split_at_first_str( const utf8stringview_t *this_,
361 : utf8string_t *pattern,
362 : utf8stringview_t *out_before,
363 : utf8stringview_t *out_after )
364 : {
365 16 : utf8error_t result = UTF8ERROR_NOT_FOUND;
366 16 : if (( pattern != NULL )&&( this_ != NULL ))
367 15 : {
368 15 : const size_t pattern_len = strlen( pattern );
369 15 : if ( pattern_len <= (*this_).length )
370 : {
371 11 : const char *const end = (*this_).start + (*this_).length - pattern_len;
372 34 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == UTF8ERROR_NOT_FOUND ); pos ++ )
373 : {
374 23 : if ( 0 == memcmp( pos, pattern, pattern_len ) )
375 : {
376 9 : result = UTF8ERROR_SUCCESS;
377 9 : if ( out_before != NULL )
378 : {
379 8 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = ( pos - (*this_).start ) };
380 : }
381 9 : if ( out_after != NULL )
382 : {
383 8 : *out_after = (utf8stringview_t){ .start = ( pos + pattern_len ), .length = ( end - pos ) };
384 : }
385 : }
386 : }
387 : }
388 : }
389 : else
390 : {
391 1 : result = UTF8ERROR_NULL_PARAM;
392 : }
393 16 : return result;
394 : }
395 :
396 5 : static inline utf8error_t utf8stringview_split_at_first_view( const utf8stringview_t *this_,
397 : const utf8stringview_t *pattern,
398 : utf8stringview_t *out_before,
399 : utf8stringview_t *out_after )
400 : {
401 5 : assert( pattern != NULL );
402 5 : utf8error_t result = UTF8ERROR_NOT_FOUND;
403 5 : if (( pattern != NULL )&&( this_ != NULL ))
404 : {
405 5 : if ( (*pattern).length <= (*this_).length )
406 : {
407 4 : const char *const end = (*this_).start + (*this_).length - (*pattern).length;
408 12 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == UTF8ERROR_NOT_FOUND ); pos ++ )
409 : {
410 8 : if ( 0 == memcmp( pos, (*pattern).start, (*pattern).length ) )
411 : {
412 3 : result = UTF8ERROR_SUCCESS;
413 3 : if ( out_before != NULL )
414 : {
415 2 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = ( pos - (*this_).start ) };
416 : }
417 3 : if ( out_after != NULL )
418 : {
419 2 : *out_after = (utf8stringview_t){ .start = ( pos + (*pattern).length ), .length = ( end - pos ) };
420 : }
421 : }
422 : }
423 : }
424 : }
425 : else
426 : {
427 0 : result = UTF8ERROR_NULL_PARAM;
428 : }
429 5 : return result;
430 : }
431 :
432 18 : static inline utf8error_t utf8stringview_split_at_last_str( const utf8stringview_t *this_,
433 : utf8string_t *pattern,
434 : utf8stringview_t *out_before,
435 : utf8stringview_t *out_after )
436 : {
437 18 : utf8error_t result = UTF8ERROR_NOT_FOUND;
438 18 : if (( pattern != NULL )&&( this_ != NULL ))
439 17 : {
440 17 : const size_t pattern_len = strlen( pattern );
441 17 : if ( pattern_len <= (*this_).length )
442 : {
443 358 : for ( ptrdiff_t pos = (*this_).length - pattern_len; ( pos >= 0 )&&( result == UTF8ERROR_NOT_FOUND ); pos -- )
444 : {
445 342 : if ( 0 == memcmp( (*this_).start + pos, pattern, pattern_len ) )
446 : {
447 7 : result = UTF8ERROR_SUCCESS;
448 7 : if ( out_before != NULL )
449 : {
450 6 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = pos };
451 : }
452 7 : if ( out_after != NULL )
453 : {
454 6 : *out_after = (utf8stringview_t){ .start = ( (*this_).start + pos + pattern_len ), .length = ( (*this_).length - pattern_len - pos ) };
455 : }
456 : }
457 : }
458 : }
459 : }
460 : else
461 : {
462 1 : result = UTF8ERROR_NULL_PARAM;
463 : }
464 18 : return result;
465 : }
466 :
467 5 : static inline utf8error_t utf8stringview_split_at_last_view( const utf8stringview_t *this_,
468 : const utf8stringview_t *pattern,
469 : utf8stringview_t *out_before,
470 : utf8stringview_t *out_after )
471 : {
472 5 : assert( pattern != NULL );
473 5 : utf8error_t result = UTF8ERROR_NOT_FOUND;
474 5 : if (( pattern != NULL )&&( this_ != NULL ))
475 : {
476 5 : if ( (*pattern).length <= (*this_).length )
477 : {
478 14 : for ( ptrdiff_t pos = (*this_).length - (*pattern).length; ( pos >= 0 )&&( result == UTF8ERROR_NOT_FOUND ); pos -- )
479 : {
480 10 : if ( 0 == memcmp( (*this_).start + pos, (*pattern).start, (*pattern).length ) )
481 : {
482 3 : result = UTF8ERROR_SUCCESS;
483 3 : if ( out_before != NULL )
484 : {
485 2 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = pos };
486 : }
487 3 : if ( out_after != NULL )
488 : {
489 2 : *out_after = (utf8stringview_t){ .start = ( (*this_).start + pos + (*pattern).length ), .length = ( (*this_).length - (*pattern).length - pos ) };
490 : }
491 : }
492 : }
493 : }
494 : }
495 : else
496 : {
497 0 : result = UTF8ERROR_NULL_PARAM;
498 : }
499 5 : return result;
500 : }
501 :
502 : #ifdef __cplusplus
503 : }
504 : #endif
505 :
506 :
507 : /*
508 : * Copyright 2021-2024 Andreas Warnke
509 : *
510 : * Licensed under the Apache License, Version 2.0 (the "License");
511 : * you may not use this file except in compliance with the License.
512 : * You may obtain a copy of the License at
513 : *
514 : * http://www.apache.org/licenses/LICENSE-2.0
515 : *
516 : * Unless required by applicable law or agreed to in writing, software
517 : * distributed under the License is distributed on an "AS IS" BASIS,
518 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
519 : * See the License for the specific language governing permissions and
520 : * limitations under the License.
521 : */
|