Line data Source code
1 : /* File: utf8stringviewtokenizer.inl; Copyright and License: see below */
2 :
3 : #ifdef __cplusplus
4 : extern "C" {
5 : #endif
6 :
7 43 : static inline void utf8stringviewtokenizer_init ( utf8stringviewtokenizer_t *this_,
8 : const utf8stringview_t *input_text,
9 : utf8stringviewtokenmode_t mode )
10 : {
11 43 : assert( input_text != NULL );
12 43 : (*this_).remaining_input_text = *input_text;
13 43 : (*this_).mode = mode;
14 43 : (*this_).last_token_line = 0;
15 43 : (*this_).current_line = 1;
16 43 : utf8stringviewtokenizer_private_skip_space( this_ );
17 43 : }
18 :
19 43 : static inline void utf8stringviewtokenizer_destroy ( utf8stringviewtokenizer_t *this_ )
20 : {
21 43 : }
22 :
23 598 : static inline bool utf8stringviewtokenizer_has_next ( const utf8stringviewtokenizer_t *this_ )
24 : {
25 598 : return ( utf8stringview_get_length( &((*this_).remaining_input_text) ) != 0 );
26 : }
27 :
28 28 : static inline utf8stringviewtokenmode_t utf8stringviewtokenizer_get_mode ( utf8stringviewtokenizer_t *this_ )
29 : {
30 28 : return (*this_).mode;
31 : }
32 :
33 58 : static inline void utf8stringviewtokenizer_set_mode ( utf8stringviewtokenizer_t *this_, utf8stringviewtokenmode_t mode )
34 : {
35 58 : (*this_).mode = mode;
36 58 : }
37 :
38 621 : static inline utf8stringview_t utf8stringviewtokenizer_next ( utf8stringviewtokenizer_t *this_ )
39 : {
40 : utf8stringview_t result;
41 621 : (*this_).last_token_line = (*this_).current_line;
42 621 : const char *const tok_start = utf8stringview_get_start( &((*this_).remaining_input_text) );
43 621 : const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
44 621 : if ( len > 0 )
45 : {
46 613 : size_t tok_len = 0;
47 613 : bool end_found = false;
48 : /* check for numbers */
49 613 : if ( (*this_).mode != UTF8STRINGVIEWTOKENMODE_TEXT )
50 : {
51 415 : const size_t num_len = utf8stringviewtokenizer_private_get_number_len( this_ );
52 415 : if ( num_len != 0 )
53 : {
54 215 : tok_len = num_len;
55 215 : end_found = true;
56 : }
57 : }
58 : /* check for special characters / standalone-tokens */
59 613 : if ( ! end_found )
60 : {
61 398 : const bool is_stanalone
62 398 : = ( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ) || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[0] );
63 398 : if ( is_stanalone )
64 : {
65 319 : tok_len = 1;
66 319 : end_found = true;
67 : }
68 : }
69 : /* check for alphanumerical tokens, end when other token found */
70 823 : for ( size_t probe_idx = 1; ( probe_idx < len )&&( ! end_found ); probe_idx ++ )
71 : {
72 : end_found
73 210 : = utf8stringviewtokenizer_private_is_space( this_, tok_start[probe_idx] )
74 210 : || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[probe_idx] );
75 210 : tok_len = probe_idx;
76 : }
77 : /* determine result */
78 613 : if ( end_found )
79 : {
80 608 : result = UTF8STRINGVIEW(tok_start,tok_len);
81 608 : (*this_).remaining_input_text = UTF8STRINGVIEW(tok_start+tok_len,len-tok_len);
82 608 : utf8stringviewtokenizer_private_skip_space( this_ );
83 : }
84 : else
85 : {
86 5 : result = (*this_).remaining_input_text;
87 5 : (*this_).remaining_input_text = UTF8STRINGVIEW_EMPTY;
88 : }
89 : }
90 : else
91 : {
92 8 : result = UTF8STRINGVIEW_EMPTY;
93 : }
94 621 : return result;
95 : }
96 :
97 11 : static inline uint32_t utf8stringviewtokenizer_get_line ( const utf8stringviewtokenizer_t *this_ )
98 : {
99 11 : return (*this_).last_token_line;
100 : }
101 :
102 1052 : static inline bool utf8stringviewtokenizer_private_is_space( utf8stringviewtokenizer_t *this_, char ascii )
103 : {
104 1052 : const unsigned char u_asc = (unsigned char) ascii;
105 : /* 0x0 - 0x19 are control chars like line break and tab, 0x20 is space, 0x7f is a control character */
106 1052 : return ( u_asc <= 0x20 )||( u_asc == 0x7f );
107 : }
108 :
109 416 : static inline bool utf8stringviewtokenizer_private_is_standalone( utf8stringviewtokenizer_t *this_, char ascii )
110 : {
111 416 : const unsigned char u_asc = (unsigned char) ascii;
112 416 : return (( u_asc >= 0x21 )&&( u_asc <= 0x2f )) || (( u_asc >= 0x3a )&&( u_asc <= 0x40 )) || (( u_asc >= 0x5b )&&( u_asc <= 0x5e ))
113 832 : || (( u_asc == 0x60 )) || (( u_asc >= 0x7b )&&( u_asc <= 0x7e ));
114 : }
115 :
/*
 *  States of the number scanner in utf8stringviewtokenizer_private_get_number_len.
 *
 *  The numeric order is relevant: the scanner continues only while the current state
 *  is smaller than its end state, which is either UTF8STRINGVIEWTOKENIZER_END_INT
 *  or UTF8STRINGVIEWTOKENIZER_END_FLOAT.
 */
enum utf8stringviewtokenizer_private_number_passed_enum {
    /*! nothing passed yet */
    UTF8STRINGVIEWTOKENIZER_INIT = 0,
    /*! sign of mantissa passed */
    UTF8STRINGVIEWTOKENIZER_MANT_SIGN = 1,
    /*! some integer portion of mantissa passed */
    UTF8STRINGVIEWTOKENIZER_MANT_INT = 2,
    /*! end marker for integer numbers: any state from here on is beyond a valid integer */
    UTF8STRINGVIEWTOKENIZER_END_INT = 3,
    /*! decimal point of mantissa passed */
    UTF8STRINGVIEWTOKENIZER_MANT_POINT = 4,
    /*! some fraction of mantissa passed */
    UTF8STRINGVIEWTOKENIZER_MANT_FRACT = 5,
    /*! base e identifier passed */
    UTF8STRINGVIEWTOKENIZER_BASE = 6,
    /*! sign of exponent passed */
    UTF8STRINGVIEWTOKENIZER_EXP_SIGN = 7,
    /*! some integer portion of exponent passed */
    UTF8STRINGVIEWTOKENIZER_EXP_INT = 8,
    /*! name of Infinity being processed */
    UTF8STRINGVIEWTOKENIZER_INFINITY = 9,
    /*! name of NaN being processed */
    UTF8STRINGVIEWTOKENIZER_NAN = 10,
    /*! end marker for float numbers: scanning is finished */
    UTF8STRINGVIEWTOKENIZER_END_FLOAT = 11,
};
130 :
131 415 : static inline size_t utf8stringviewtokenizer_private_get_number_len( utf8stringviewtokenizer_t *this_ )
132 : {
133 415 : const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
134 415 : const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
135 415 : enum utf8stringviewtokenizer_private_number_passed_enum state = UTF8STRINGVIEWTOKENIZER_INIT;
136 415 : const bool float_mode = (( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT )||( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ));
137 415 : const enum utf8stringviewtokenizer_private_number_passed_enum end_state
138 415 : = float_mode ? UTF8STRINGVIEWTOKENIZER_END_FLOAT : UTF8STRINGVIEWTOKENIZER_END_INT;
139 415 : size_t valid_len = 0;
140 :
141 1205 : for ( size_t probe_idx = 0; ( probe_idx < len )&&( state < end_state ); probe_idx ++ )
142 : {
143 790 : char probe = start[probe_idx];
144 : /* printf("state:%i\n",state); */
145 790 : switch ( state )
146 : {
147 415 : case UTF8STRINGVIEWTOKENIZER_INIT:
148 : {
149 415 : if (( probe == '+' )||( probe == '-' ))
150 : {
151 50 : state = UTF8STRINGVIEWTOKENIZER_MANT_SIGN;
152 : }
153 365 : else if (( probe >= '0' )&&( probe <= '9' ))
154 : {
155 169 : state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
156 169 : valid_len = probe_idx+1;
157 : }
158 196 : else if (( probe == 'i' )||( probe == 'I' ))
159 : {
160 5 : state = UTF8STRINGVIEWTOKENIZER_INFINITY;
161 : }
162 191 : else if (( probe == 'n' )||( probe == 'N' ))
163 : {
164 2 : state = UTF8STRINGVIEWTOKENIZER_NAN;
165 : }
166 : else
167 : {
168 189 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
169 : }
170 : }
171 415 : break;
172 :
173 50 : case UTF8STRINGVIEWTOKENIZER_MANT_SIGN:
174 : {
175 50 : if (( probe >= '0' )&&( probe <= '9' ))
176 : {
177 41 : state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
178 41 : valid_len = probe_idx+1;
179 : }
180 9 : else if (( probe == 'i' )||( probe == 'I' ))
181 : {
182 3 : state = UTF8STRINGVIEWTOKENIZER_INFINITY;
183 : }
184 : else
185 : {
186 : /* a mantissa has to have at least i digit in the integer portion */
187 6 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
188 : }
189 : }
190 50 : break;
191 :
192 274 : case UTF8STRINGVIEWTOKENIZER_MANT_INT:
193 : {
194 274 : if (( probe >= '0' )&&( probe <= '9' ))
195 : {
196 : /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_INT; */
197 64 : valid_len = probe_idx+1;
198 : }
199 210 : else if ( probe == '.' )
200 : {
201 12 : state = UTF8STRINGVIEWTOKENIZER_MANT_POINT;
202 12 : if ( float_mode ) /* do not consider the point as valid in integer mode */
203 : {
204 10 : valid_len = probe_idx+1;
205 : }
206 : }
207 198 : else if (( probe == 'e' )||( probe == 'E' ))
208 : {
209 8 : state = UTF8STRINGVIEWTOKENIZER_BASE;
210 : }
211 : else
212 : {
213 190 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
214 : }
215 : }
216 274 : break;
217 :
218 10 : case UTF8STRINGVIEWTOKENIZER_MANT_POINT:
219 : {
220 10 : if (( probe >= '0' )&&( probe <= '9' ))
221 : {
222 7 : state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT;
223 7 : valid_len = probe_idx+1;
224 : }
225 3 : else if (( probe == 'e' )||( probe == 'E' ))
226 : {
227 2 : state = UTF8STRINGVIEWTOKENIZER_BASE;
228 : }
229 : else
230 : {
231 1 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
232 : }
233 : }
234 10 : break;
235 :
236 12 : case UTF8STRINGVIEWTOKENIZER_MANT_FRACT:
237 : {
238 12 : if (( probe >= '0' )&&( probe <= '9' ))
239 : {
240 : /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT; */
241 5 : valid_len = probe_idx+1;
242 : }
243 7 : else if (( probe == 'e' )||( probe == 'E' ))
244 : {
245 2 : state = UTF8STRINGVIEWTOKENIZER_BASE;
246 : }
247 : else
248 : {
249 5 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
250 : }
251 : }
252 12 : break;
253 :
254 10 : case UTF8STRINGVIEWTOKENIZER_BASE:
255 : {
256 10 : if (( probe == '+' )||( probe == '-' ))
257 : {
258 4 : state = UTF8STRINGVIEWTOKENIZER_EXP_SIGN;
259 : }
260 6 : else if (( probe >= '0' )&&( probe <= '9' ))
261 : {
262 4 : state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
263 4 : valid_len = probe_idx+1;
264 : }
265 : else
266 : {
267 2 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
268 : }
269 : }
270 10 : break;
271 :
272 4 : case UTF8STRINGVIEWTOKENIZER_EXP_SIGN:
273 : {
274 4 : if (( probe >= '0' )&&( probe <= '9' ))
275 : {
276 3 : state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
277 3 : valid_len = probe_idx+1;
278 : }
279 : else
280 : {
281 1 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
282 : }
283 : }
284 4 : break;
285 :
286 9 : case UTF8STRINGVIEWTOKENIZER_EXP_INT:
287 : {
288 9 : if (( probe >= '0' )&&( probe <= '9' ))
289 : {
290 : /* stay in state = UTF8STRINGVIEWTOKENIZER_EXP_INT; */
291 3 : valid_len = probe_idx+1;
292 : }
293 : else
294 : {
295 6 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
296 : }
297 : }
298 9 : break;
299 :
300 4 : case UTF8STRINGVIEWTOKENIZER_INFINITY:
301 : {
302 : /* the first character has already been processed */
303 4 : const size_t infinity_len = strlen("nfinity");
304 4 : if ( (probe_idx + infinity_len) <= len )
305 : {
306 4 : const bool is_infinity
307 4 : = ( 0 == memcmp( start+probe_idx, "nfinity", infinity_len ) )
308 4 : ||( 0 == memcmp( start+probe_idx, "NFINITY", infinity_len ) );
309 4 : if (is_infinity)
310 : {
311 3 : valid_len = probe_idx+infinity_len;
312 : }
313 : }
314 4 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
315 : }
316 :
317 6 : case UTF8STRINGVIEWTOKENIZER_NAN:
318 : {
319 : /* the first character has already been processed */
320 6 : const size_t nan_len = strlen("aN");
321 6 : if ( (probe_idx + nan_len) <= len )
322 : {
323 6 : const bool is_nan
324 6 : = ( 0 == memcmp( start+probe_idx, "an", nan_len ) )
325 5 : ||( 0 == memcmp( start+probe_idx, "aN", nan_len ) )
326 11 : ||( 0 == memcmp( start+probe_idx, "AN", nan_len ) );
327 6 : if (is_nan)
328 : {
329 2 : valid_len = probe_idx+nan_len;
330 : }
331 : }
332 6 : state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
333 : }
334 :
335 6 : case UTF8STRINGVIEWTOKENIZER_END_INT: /* finished, no further processign... */
336 : case UTF8STRINGVIEWTOKENIZER_END_FLOAT:
337 : {
338 : /* finished, the for loop is ended */
339 : }
340 6 : break;
341 : }
342 : }
343 415 : return valid_len;
344 : }
345 :
346 651 : static inline void utf8stringviewtokenizer_private_skip_space ( utf8stringviewtokenizer_t *this_ )
347 : {
348 651 : const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
349 651 : size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
350 873 : while ( ( len > 0 ) && ( utf8stringviewtokenizer_private_is_space( this_, *start ) ) )
351 : {
352 222 : if ( *start == '\n' )
353 : {
354 32 : (*this_).current_line ++;
355 : }
356 222 : len --;
357 222 : start ++;
358 : }
359 651 : (*this_).remaining_input_text = UTF8STRINGVIEW(start,len);
360 651 : }
361 :
362 : #ifdef __cplusplus
363 : }
364 : #endif
365 :
366 :
367 : /*
368 : Copyright 2023-2024 Andreas Warnke
369 :
370 : Licensed under the Apache License, Version 2.0 (the "License");
371 : you may not use this file except in compliance with the License.
372 : You may obtain a copy of the License at
373 :
374 : http://www.apache.org/licenses/LICENSE-2.0
375 :
376 : Unless required by applicable law or agreed to in writing, software
377 : distributed under the License is distributed on an "AS IS" BASIS,
378 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
379 : See the License for the specific language governing permissions and
380 : limitations under the License.
381 : */
|