Line data Source code
1 : /* File: utf8stringview.inl; Copyright and License: see below */
2 :
3 : #include "u8/u8_i32.h"
4 :
5 : #ifdef __cplusplus
6 : extern "C" {
7 : #endif
8 :
9 71 : static inline utf8error_t utf8stringview_init( utf8stringview_t *this_, const char* start, size_t length )
10 : {
11 71 : assert( start != NULL );
12 71 : utf8error_t result = UTF8ERROR_SUCCESS;
13 : /* clean type would have been: char ( *start_arr )[] = (char(*)[]) start; */
14 :
15 : /* check start */
16 71 : char start_copy[4] = {'\0','\0','\0','\0'};
17 71 : const size_t start_len = ( length >= 4 ) ? 4 : length;
18 71 : memcpy( &start_copy, start, start_len );
19 :
20 71 : if ( ( 0xc0 & (start_copy[0]) ) == 0x80 )
21 : {
22 5 : if ( ( 0xc0 & (start_copy[1]) ) == 0x80 )
23 : {
24 2 : if ( ( 0xc0 & (start_copy[2]) ) == 0x80 )
25 : {
26 1 : start += 3;
27 1 : length -= 3; /* length was greater than 2 - otherwise start_copy[2] would have been 0x0 */
28 1 : result = UTF8ERROR_OUT_OF_RANGE;
29 : }
30 : else
31 : {
32 1 : start += 2;
33 1 : length -= 2; /* length was greater than 1 - otherwise start_copy[1] would have been 0x0 */
34 1 : result = UTF8ERROR_OUT_OF_RANGE;
35 : }
36 : }
37 : else
38 : {
39 3 : start += 1;
40 3 : length -= 1; /* length was greater than 0 - otherwise start_copy[0] would have been 0x0 */
41 3 : result = UTF8ERROR_OUT_OF_RANGE;
42 : }
43 : }
44 : else
45 : {
46 : /* valid start */
47 : }
48 :
49 : /* check end */
50 71 : char end_copy[4] = {'\0','\0','\0','\0'};
51 71 : const size_t end_len = ( length >= 4 ) ? 4 : length;
52 71 : memcpy( &(end_copy[4-end_len]), &(start[length-end_len]), end_len );
53 :
54 71 : if ( ( 0x80 & (end_copy[3]) ) == 0x00 )
55 : {
56 : /* valid single-byte end */
57 : }
58 : else
59 : {
60 9 : if ( ( 0xe0 & (end_copy[2]) ) == 0xc0 )
61 : {
62 : /* valid 2 byte end */
63 : }
64 8 : else if ( ( 0x80 & (end_copy[2]) ) == 0x00 )
65 : {
66 : /* 1 byte char at end_copy[2] */
67 1 : length -= 1; /* length was greater than 0 - otherwise end_copy[3] would have been 0x0 */
68 1 : result = UTF8ERROR_OUT_OF_RANGE;
69 : }
70 : else
71 : {
72 7 : if ( ( 0xf0 & (end_copy[1]) ) == 0xe0 )
73 : {
74 : /* valid 3 byte end */
75 : }
76 6 : else if ( ( 0xe0 & (end_copy[1]) ) == 0xc0 )
77 : {
78 : /* 2 byte char at end_copy[1] */
79 1 : length -= 1; /* length was greater than 0 - otherwise end_copy[3] would have been 0x0 */
80 1 : result = UTF8ERROR_OUT_OF_RANGE;
81 : }
82 5 : else if ( ( 0x80 & (end_copy[1]) ) == 0x00 )
83 : {
84 : /* 1 byte char at end_copy[1] */
85 1 : length -= 2; /* length was greater than 1 - otherwise end_copy[2] would have been 0x0 */
86 1 : result = UTF8ERROR_OUT_OF_RANGE;
87 : }
88 : else
89 : {
90 4 : if ( ( 0xf8 & (end_copy[0]) ) == 0xf0 )
91 : {
92 : /* valid 4 byte end */
93 : }
94 : else
95 : {
96 : /* assume 1 byte char at end_copy[0] */
97 2 : length -= 3; /* length was greater than 2 - otherwise end_copy[1] would have been 0x0 */
98 2 : result = UTF8ERROR_OUT_OF_RANGE;
99 : }
100 : }
101 : }
102 : }
103 :
104 71 : *this_ = (utf8stringview_t){.start=start,.length=length};
105 71 : return result;
106 : }
107 :
108 61 : static inline void utf8stringview_init_str( utf8stringview_t *this_, const char* cstring )
109 : {
110 61 : *this_ = (utf8stringview_t){.start=cstring,.length=(cstring==NULL)?0:strlen(cstring)};
111 61 : }
112 :
113 16 : static inline utf8error_t utf8stringview_init_region( utf8stringview_t *this_, const char* cstring, size_t start_idx, size_t length )
114 : {
115 16 : assert( cstring != NULL );
116 16 : utf8error_t result = UTF8ERROR_SUCCESS;
117 16 : const size_t cstring_len = strlen( cstring );
118 16 : if ( start_idx > cstring_len )
119 : {
120 1 : *this_ = (utf8stringview_t){.start=cstring+start_idx,.length=0};
121 1 : result |= UTF8ERROR_OUT_OF_RANGE;
122 : }
123 : else
124 : {
125 15 : const size_t max_len = cstring_len - start_idx;
126 15 : result |= utf8stringview_init( this_, cstring+start_idx, u8_i32_min2( length, max_len ) );
127 15 : if ( length > max_len )
128 : {
129 : /* notify that stringview cannot exceed the cstring */
130 1 : result |= UTF8ERROR_OUT_OF_RANGE;
131 : }
132 : }
133 16 : return result;
134 : }
135 :
136 128 : static inline void utf8stringview_destroy( utf8stringview_t *this_ )
137 : {
138 128 : *this_ = (utf8stringview_t){.start=NULL,.length=0};
139 128 : }
140 :
141 78552 : static inline const char* utf8stringview_get_start( const utf8stringview_t *this_ ) {
142 78552 : return (*this_).start;
143 : }
144 :
145 76104 : static inline size_t utf8stringview_get_length( const utf8stringview_t *this_ ) {
146 76104 : return (*this_).length;
147 : }
148 :
149 3 : static inline size_t utf8stringview_count_codepoints( const utf8stringview_t *this_ ) {
150 3 : size_t result = 0;
151 3 : unsigned int skip = 0;
152 3 : if ( (*this_).start != NULL ) {
153 38 : for ( size_t pos = 0; pos < (*this_).length; pos ++ )
154 : {
155 35 : if ( skip > 0 )
156 : {
157 16 : skip --;
158 16 : if ( skip == 0 ) {
159 7 : result ++; /* This is the last byte of a multi byte code point */
160 : }
161 : }
162 : else
163 : {
164 19 : const unsigned char firstByte = (const unsigned char) ((*this_).start[pos]);
165 19 : if (( 0x80 & firstByte ) == 0x00 )
166 : {
167 10 : result ++; /* This is a 1 byte code point */
168 : }
169 9 : else if (( 0xc0 & firstByte ) == 0x80 )
170 : {
171 : /* This is not a valid first byte, skipping to the next byte... */
172 : }
173 8 : else if (( 0xe0 & firstByte ) == 0xc0 )
174 : {
175 2 : skip = 1; /* This is the start of a 2 byte code point */
176 : }
177 6 : else if (( 0xf0 & firstByte ) == 0xe0 )
178 : {
179 3 : skip = 2; /* This is the start of a 3 byte code point */
180 : }
181 3 : else if (( 0xf8 & firstByte ) == 0xf0 )
182 : {
183 3 : skip = 3; /* This is the start of a 4 byte code point */
184 : }
185 : else
186 : {
187 : /* This is not a valid first byte, skipping to the next byte... */
188 : }
189 : }
190 : }
191 : }
192 3 : return result;
193 : }
194 :
195 1236 : static inline bool utf8stringview_equals_str( const utf8stringview_t *this_, const char *that )
196 : {
197 : bool result;
198 1236 : if ( that != NULL )
199 : {
200 1235 : size_t len = strlen( that );
201 1235 : if ( len == (*this_).length )
202 : {
203 905 : if ( ( len == 0 )/*&&( this_.length == 0 )*/)
204 : {
205 4 : result = true;
206 : }
207 : else
208 : {
209 901 : result = ( 0 == memcmp ( (*this_).start, that, len ) );
210 : }
211 : }
212 : else
213 : {
214 330 : result = false;
215 : }
216 : }
217 : else
218 : {
219 1 : result = false;
220 : }
221 1236 : return result;
222 : }
223 :
224 4 : static inline bool utf8stringview_equals_view( const utf8stringview_t *this_, const utf8stringview_t *that )
225 : {
226 4 : assert( that != NULL );
227 : bool result;
228 4 : if ( (*that).length == (*this_).length )
229 : {
230 2 : if ( ( (*that).length == 0 )/*&&( this_.length == 0 )*/)
231 : {
232 1 : result = true;
233 : }
234 : else
235 : {
236 1 : result = ( 0 == memcmp ( (*this_).start, (*that).start, (*that).length ) );
237 : }
238 : }
239 : else
240 : {
241 2 : result = false;
242 : }
243 4 : return result;
244 : }
245 :
246 14 : static inline bool utf8stringview_starts_with_str( const utf8stringview_t *this_, utf8string_t *that )
247 : {
248 14 : bool result = false;
249 14 : if (( this_ != NULL )&&( that != NULL ))
250 : {
251 13 : const size_t that_len = strlen( that );
252 13 : if ( that_len <= (*this_).length )
253 : {
254 9 : result = ( 0 == memcmp( (*this_).start, that, that_len ) );
255 : }
256 : else
257 : {
258 4 : result = false;
259 : }
260 : }
261 14 : return result;
262 : }
263 :
264 5 : static inline bool utf8stringview_starts_with_view( const utf8stringview_t *this_, const utf8stringview_t *that )
265 : {
266 5 : assert( that != NULL );
267 5 : bool result = false;
268 5 : if (( this_ != NULL )&&( that != NULL ))
269 : {
270 5 : if ( (*that).length <= (*this_).length )
271 : {
272 4 : result = ( 0 == memcmp( (*this_).start, (*that).start, (*that).length ) );
273 : }
274 : else
275 : {
276 1 : result = false;
277 : }
278 : }
279 5 : return result;
280 : }
281 :
282 6 : static inline bool utf8stringview_ends_with_str( const utf8stringview_t *this_, utf8string_t *that )
283 : {
284 6 : bool result = false;
285 6 : if (( this_ != NULL )&&( that != NULL ))
286 : {
287 5 : const size_t that_len = strlen( that );
288 5 : if ( that_len <= (*this_).length )
289 : {
290 4 : result = ( 0 == memcmp( (*this_).start + (*this_).length - that_len, that, that_len ) );
291 : }
292 : else
293 : {
294 1 : result = false;
295 : }
296 : }
297 6 : return result;
298 : }
299 :
300 5 : static inline bool utf8stringview_ends_with_view( const utf8stringview_t *this_, const utf8stringview_t *that )
301 : {
302 5 : assert( that != NULL );
303 5 : bool result = false;
304 5 : if (( this_ != NULL )&&( that != NULL ))
305 : {
306 5 : if ( (*that).length <= (*this_).length )
307 : {
308 4 : result = ( 0 == memcmp( (*this_).start + (*this_).length - (*that).length, (*that).start, (*that).length ) );
309 : }
310 : else
311 : {
312 1 : result = false;
313 : }
314 : }
315 5 : return result;
316 : }
317 :
318 6 : static inline bool utf8stringview_contains_str( const utf8stringview_t *this_, utf8string_t *that )
319 : {
320 6 : bool result = false;
321 6 : if (( this_ != NULL )&&( that != NULL ))
322 : {
323 5 : const size_t that_len = strlen( that );
324 5 : if ( that_len <= (*this_).length )
325 : {
326 4 : const char *const end = (*this_).start + (*this_).length - that_len;
327 13 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == false ); pos ++ )
328 : {
329 9 : if ( 0 == memcmp( pos, that, that_len ) )
330 : {
331 3 : result = true;
332 : }
333 : }
334 : }
335 : }
336 6 : return result;
337 : }
338 :
339 5 : static inline bool utf8stringview_contains_view( const utf8stringview_t *this_, const utf8stringview_t *that )
340 : {
341 5 : assert( that != NULL );
342 5 : bool result = false;
343 5 : if (( this_ != NULL )&&( that != NULL ))
344 : {
345 5 : if ( (*that).length <= (*this_).length )
346 : {
347 4 : const char *const end = (*this_).start + (*this_).length - (*that).length;
348 13 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == false ); pos ++ )
349 : {
350 9 : if ( 0 == memcmp( pos, (*that).start, (*that).length ) )
351 : {
352 3 : result = true;
353 : }
354 : }
355 : }
356 : }
357 5 : return result;
358 : }
359 :
360 16 : static inline utf8error_t utf8stringview_split_at_first_str( const utf8stringview_t *this_,
361 : utf8string_t *pattern,
362 : utf8stringview_t *out_before,
363 : utf8stringview_t *out_after )
364 : {
365 16 : utf8error_t result = UTF8ERROR_NOT_FOUND;
366 :
367 16 : if (( pattern != NULL )&&( this_ != NULL ))
368 15 : {
369 15 : const size_t pattern_len = strlen( pattern );
370 15 : if ( pattern_len <= (*this_).length )
371 : {
372 11 : const char *const end = (*this_).start + (*this_).length - pattern_len;
373 34 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == UTF8ERROR_NOT_FOUND ); pos ++ )
374 : {
375 23 : if ( 0 == memcmp( pos, pattern, pattern_len ) )
376 : {
377 9 : result = UTF8ERROR_SUCCESS;
378 9 : if ( out_before != NULL )
379 : {
380 8 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = ( pos - (*this_).start ) };
381 : }
382 9 : if ( out_after != NULL )
383 : {
384 8 : *out_after = (utf8stringview_t){ .start = ( pos + pattern_len ), .length = ( end - pos ) };
385 : }
386 : }
387 : }
388 : }
389 : }
390 : else
391 : {
392 1 : result = UTF8ERROR_NULL_PARAM;
393 : }
394 :
395 16 : return result;
396 : }
397 :
398 5 : static inline utf8error_t utf8stringview_split_at_first_view( const utf8stringview_t *this_,
399 : const utf8stringview_t *pattern,
400 : utf8stringview_t *out_before,
401 : utf8stringview_t *out_after )
402 : {
403 5 : assert( pattern != NULL );
404 5 : utf8error_t result = UTF8ERROR_NOT_FOUND;
405 :
406 5 : if ( (*pattern).length <= (*this_).length )
407 : {
408 4 : const char *const end = (*this_).start + (*this_).length - (*pattern).length;
409 12 : for ( const char* pos = (*this_).start; ( pos <= end )&&( result == UTF8ERROR_NOT_FOUND ); pos ++ )
410 : {
411 8 : if ( 0 == memcmp( pos, (*pattern).start, (*pattern).length ) )
412 : {
413 3 : result = UTF8ERROR_SUCCESS;
414 3 : if ( out_before != NULL )
415 : {
416 2 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = ( pos - (*this_).start ) };
417 : }
418 3 : if ( out_after != NULL )
419 : {
420 2 : *out_after = (utf8stringview_t){ .start = ( pos + (*pattern).length ), .length = ( end - pos ) };
421 : }
422 : }
423 : }
424 : }
425 :
426 5 : return result;
427 : }
428 :
429 18 : static inline utf8error_t utf8stringview_split_at_last_str( const utf8stringview_t *this_,
430 : utf8string_t *pattern,
431 : utf8stringview_t *out_before,
432 : utf8stringview_t *out_after )
433 : {
434 18 : utf8error_t result = UTF8ERROR_NOT_FOUND;
435 :
436 18 : if (( pattern != NULL )&&( this_ != NULL ))
437 17 : {
438 17 : const size_t pattern_len = strlen( pattern );
439 17 : if ( pattern_len <= (*this_).length )
440 : {
441 358 : for ( ptrdiff_t pos = (*this_).length - pattern_len; ( pos >= 0 )&&( result == UTF8ERROR_NOT_FOUND ); pos -- )
442 : {
443 342 : if ( 0 == memcmp( (*this_).start + pos, pattern, pattern_len ) )
444 : {
445 7 : result = UTF8ERROR_SUCCESS;
446 7 : if ( out_before != NULL )
447 : {
448 6 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = pos };
449 : }
450 7 : if ( out_after != NULL )
451 : {
452 6 : *out_after = (utf8stringview_t){ .start = ( (*this_).start + pos + pattern_len ), .length = ( (*this_).length - pattern_len - pos ) };
453 : }
454 : }
455 : }
456 : }
457 : }
458 : else
459 : {
460 1 : result = UTF8ERROR_NULL_PARAM;
461 : }
462 :
463 18 : return result;
464 : }
465 :
466 5 : static inline utf8error_t utf8stringview_split_at_last_view( const utf8stringview_t *this_,
467 : const utf8stringview_t *pattern,
468 : utf8stringview_t *out_before,
469 : utf8stringview_t *out_after )
470 : {
471 5 : assert( pattern != NULL );
472 5 : utf8error_t result = UTF8ERROR_NOT_FOUND;
473 :
474 5 : if ( (*pattern).length <= (*this_).length )
475 : {
476 14 : for ( ptrdiff_t pos = (*this_).length - (*pattern).length; ( pos >= 0 )&&( result == UTF8ERROR_NOT_FOUND ); pos -- )
477 : {
478 10 : if ( 0 == memcmp( (*this_).start + pos, (*pattern).start, (*pattern).length ) )
479 : {
480 3 : result = UTF8ERROR_SUCCESS;
481 3 : if ( out_before != NULL )
482 : {
483 2 : *out_before = (utf8stringview_t){ .start = (*this_).start, .length = pos };
484 : }
485 3 : if ( out_after != NULL )
486 : {
487 2 : *out_after = (utf8stringview_t){ .start = ( (*this_).start + pos + (*pattern).length ), .length = ( (*this_).length - (*pattern).length - pos ) };
488 : }
489 : }
490 : }
491 : }
492 :
493 5 : return result;
494 : }
495 :
496 : #ifdef __cplusplus
497 : }
498 : #endif
499 :
500 :
501 : /*
502 : * Copyright 2021-2024 Andreas Warnke
503 : *
504 : * Licensed under the Apache License, Version 2.0 (the "License");
505 : * you may not use this file except in compliance with the License.
506 : * You may obtain a copy of the License at
507 : *
508 : * http://www.apache.org/licenses/LICENSE-2.0
509 : *
510 : * Unless required by applicable law or agreed to in writing, software
511 : * distributed under the License is distributed on an "AS IS" BASIS,
512 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
513 : * See the License for the specific language governing permissions and
514 : * limitations under the License.
515 : */
|