LCOV - code coverage report
Current view: top level - u8stream/include/utf8stringbuf - utf8stringviewtokenizer.inl (source / functions) Hit Total Coverage
Test: crystal-facet-uml_v1.61.0_covts Lines: 164 164 100.0 %
Date: 2024-10-26 21:44:38 Functions: 11 11 100.0 %

          Line data    Source code
       1             : /* File: utf8stringviewtokenizer.inl; Copyright and License: see below */
       2             : 
       3             : #ifdef __cplusplus
       4             : extern "C" {
       5             : #endif
       6             : 
       7          43 : static inline void utf8stringviewtokenizer_init ( utf8stringviewtokenizer_t *this_,
       8             :                                                   const utf8stringview_t *input_text,
       9             :                                                   utf8stringviewtokenmode_t mode )
      10             : {
      11          43 :     assert( input_text != NULL );
      12          43 :     (*this_).remaining_input_text = *input_text;
      13          43 :     (*this_).mode = mode;
      14          43 :     (*this_).last_token_line = 0;
      15          43 :     (*this_).current_line = 1;
      16          43 :     utf8stringviewtokenizer_private_skip_space( this_ );
      17          43 : }
      18             : 
      19          43 : static inline void utf8stringviewtokenizer_destroy ( utf8stringviewtokenizer_t *this_ )
      20             : {
      21          43 : }
      22             : 
      23         598 : static inline bool utf8stringviewtokenizer_has_next ( const utf8stringviewtokenizer_t *this_ )
      24             : {
      25         598 :     return ( utf8stringview_get_length( &((*this_).remaining_input_text) ) != 0 );
      26             : }
      27             : 
      28          28 : static inline utf8stringviewtokenmode_t utf8stringviewtokenizer_get_mode ( utf8stringviewtokenizer_t *this_ )
      29             : {
      30          28 :     return (*this_).mode;
      31             : }
      32             : 
      33          58 : static inline void utf8stringviewtokenizer_set_mode ( utf8stringviewtokenizer_t *this_, utf8stringviewtokenmode_t mode )
      34             : {
      35          58 :     (*this_).mode = mode;
      36          58 : }
      37             : 
      38         621 : static inline utf8stringview_t utf8stringviewtokenizer_next ( utf8stringviewtokenizer_t *this_ )
      39             : {
      40             :     utf8stringview_t result;
      41         621 :     (*this_).last_token_line = (*this_).current_line;
      42         621 :     const char *const tok_start = utf8stringview_get_start( &((*this_).remaining_input_text) );
      43         621 :     const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
      44         621 :     if ( len > 0 )
      45             :     {
      46         613 :         size_t tok_len = 0;
      47         613 :         bool end_found = false;
      48             :         /* check for numbers */
      49         613 :         if ( (*this_).mode != UTF8STRINGVIEWTOKENMODE_TEXT )
      50             :         {
      51         415 :             const size_t num_len = utf8stringviewtokenizer_private_get_number_len( this_ );
      52         415 :             if ( num_len != 0 )
      53             :             {
      54         215 :                 tok_len = num_len;
      55         215 :                 end_found = true;
      56             :             }
      57             :         }
      58             :         /* check for special characters / standalone-tokens */
      59         613 :         if ( ! end_found )
      60             :         {
      61         398 :             const bool is_stanalone
      62         398 :                 = ( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ) || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[0] );
      63         398 :             if ( is_stanalone )
      64             :             {
      65         319 :                 tok_len = 1;
      66         319 :                 end_found = true;
      67             :             }
      68             :         }
      69             :         /* check for alphanumerical tokens, end when other token found */
      70         823 :         for ( size_t probe_idx = 1; ( probe_idx < len )&&( ! end_found ); probe_idx ++ )
      71             :         {
      72             :             end_found
      73         210 :                 = utf8stringviewtokenizer_private_is_space( this_, tok_start[probe_idx] )
      74         210 :                 || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[probe_idx] );
      75         210 :             tok_len = probe_idx;
      76             :         }
      77             :         /* determine result */
      78         613 :         if ( end_found )
      79             :         {
      80         608 :             result = UTF8STRINGVIEW(tok_start,tok_len);
      81         608 :             (*this_).remaining_input_text = UTF8STRINGVIEW(tok_start+tok_len,len-tok_len);
      82         608 :             utf8stringviewtokenizer_private_skip_space( this_ );
      83             :         }
      84             :         else
      85             :         {
      86           5 :             result = (*this_).remaining_input_text;
      87           5 :             (*this_).remaining_input_text = UTF8STRINGVIEW_EMPTY;
      88             :         }
      89             :     }
      90             :     else
      91             :     {
      92           8 :         result = UTF8STRINGVIEW_EMPTY;
      93             :     }
      94         621 :     return result;
      95             : }
      96             : 
      97          11 : static inline uint32_t utf8stringviewtokenizer_get_line ( const utf8stringviewtokenizer_t *this_ )
      98             : {
      99          11 :     return (*this_).last_token_line;
     100             : }
     101             : 
     102        1052 : static inline bool utf8stringviewtokenizer_private_is_space( utf8stringviewtokenizer_t *this_, char ascii )
     103             : {
     104        1052 :     const unsigned char u_asc = (unsigned char) ascii;
     105             :     /* 0x0 - 0x19 are control chars like line break and tab, 0x20 is space, 0x7f is a control character */
     106        1052 :     return ( u_asc <= 0x20 )||( u_asc == 0x7f );
     107             : }
     108             : 
     109         416 : static inline bool utf8stringviewtokenizer_private_is_standalone( utf8stringviewtokenizer_t *this_, char ascii )
     110             : {
     111         416 :     const unsigned char u_asc = (unsigned char) ascii;
     112         416 :     return (( u_asc >= 0x21 )&&( u_asc <= 0x2f )) || (( u_asc >= 0x3a )&&( u_asc <= 0x40 )) || (( u_asc >= 0x5b )&&( u_asc <= 0x5e ))
     113         832 :         || (( u_asc == 0x60 )) || (( u_asc >= 0x7b )&&( u_asc <= 0x7e ));
     114             : }
     115             : 
     116             : enum utf8stringviewtokenizer_private_number_passed_enum {
     117             :     UTF8STRINGVIEWTOKENIZER_INIT = 0,  /*!< nothing passed yet */
     118             :     UTF8STRINGVIEWTOKENIZER_MANT_SIGN = 1,  /*!< sign of mantissa passed */
     119             :     UTF8STRINGVIEWTOKENIZER_MANT_INT = 2,  /*!< some integer portion of mantissa passed */
     120             :     UTF8STRINGVIEWTOKENIZER_END_INT = 3,  /*!< whatever valid integer number might have been processed, we are beyond now */
     121             :     UTF8STRINGVIEWTOKENIZER_MANT_POINT = 4,  /*!< decimal point of mantissa passed */
     122             :     UTF8STRINGVIEWTOKENIZER_MANT_FRACT = 5,  /*!< some fraction of mantissa passed */
     123             :     UTF8STRINGVIEWTOKENIZER_BASE = 6,  /*!< base e identifier passed */
     124             :     UTF8STRINGVIEWTOKENIZER_EXP_SIGN = 7,  /*!< sign of exponent passed */
     125             :     UTF8STRINGVIEWTOKENIZER_EXP_INT = 8,  /*!< some integer portion of exponent passed */
     126             :     UTF8STRINGVIEWTOKENIZER_INFINITY = 9,  /*!< name of Infinity being processed */
     127             :     UTF8STRINGVIEWTOKENIZER_NAN = 10,  /*!< name of NaN being processed */
     128             :     UTF8STRINGVIEWTOKENIZER_END_FLOAT = 11,  /*!< whatever valid float number might have been processed, we are beyond now */
     129             : };
     130             : 
     131         415 : static inline size_t utf8stringviewtokenizer_private_get_number_len( utf8stringviewtokenizer_t *this_ )
     132             : {
     133         415 :     const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
     134         415 :     const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
     135         415 :     enum utf8stringviewtokenizer_private_number_passed_enum state = UTF8STRINGVIEWTOKENIZER_INIT;
     136         415 :     const bool float_mode = (( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT )||( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ));
     137         415 :     const enum utf8stringviewtokenizer_private_number_passed_enum end_state
     138         415 :         = float_mode ? UTF8STRINGVIEWTOKENIZER_END_FLOAT : UTF8STRINGVIEWTOKENIZER_END_INT;
     139         415 :     size_t valid_len = 0;
     140             : 
     141        1205 :     for ( size_t probe_idx = 0; ( probe_idx < len )&&( state < end_state ); probe_idx ++ )
     142             :     {
     143         790 :         char probe = start[probe_idx];
     144             :         /* printf("state:%i\n",state); */
     145         790 :         switch ( state )
     146             :         {
     147         415 :             case UTF8STRINGVIEWTOKENIZER_INIT:
     148             :             {
     149         415 :                 if (( probe == '+' )||( probe == '-' ))
     150             :                 {
     151          50 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_SIGN;
     152             :                 }
     153         365 :                 else if (( probe >= '0' )&&( probe <= '9' ))
     154             :                 {
     155         169 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
     156         169 :                     valid_len = probe_idx+1;
     157             :                 }
     158         196 :                 else if (( probe == 'i' )||( probe == 'I' ))
     159             :                 {
     160           5 :                     state = UTF8STRINGVIEWTOKENIZER_INFINITY;
     161             :                 }
     162         191 :                 else if (( probe == 'n' )||( probe == 'N' ))
     163             :                 {
     164           2 :                     state = UTF8STRINGVIEWTOKENIZER_NAN;
     165             :                 }
     166             :                 else
     167             :                 {
     168         189 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     169             :                 }
     170             :             }
     171         415 :             break;
     172             : 
     173          50 :             case UTF8STRINGVIEWTOKENIZER_MANT_SIGN:
     174             :             {
     175          50 :                 if (( probe >= '0' )&&( probe <= '9' ))
     176             :                 {
     177          41 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
     178          41 :                     valid_len = probe_idx+1;
     179             :                 }
     180           9 :                 else if (( probe == 'i' )||( probe == 'I' ))
     181             :                 {
     182           3 :                     state = UTF8STRINGVIEWTOKENIZER_INFINITY;
     183             :                 }
     184             :                 else
     185             :                 {
     186             :                     /* a mantissa has to have at least i digit in the integer portion */
     187           6 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     188             :                 }
     189             :             }
     190          50 :             break;
     191             : 
     192         274 :             case UTF8STRINGVIEWTOKENIZER_MANT_INT:
     193             :             {
     194         274 :                 if (( probe >= '0' )&&( probe <= '9' ))
     195             :                 {
     196             :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_INT; */
     197          64 :                     valid_len = probe_idx+1;
     198             :                 }
     199         210 :                 else if ( probe == '.' )
     200             :                 {
     201          12 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_POINT;
     202          12 :                     if ( float_mode )  /* do not consider the point as valid in integer mode */
     203             :                     {
     204          10 :                         valid_len = probe_idx+1;
     205             :                     }
     206             :                 }
     207         198 :                 else if (( probe == 'e' )||( probe == 'E' ))
     208             :                 {
     209           8 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     210             :                 }
     211             :                 else
     212             :                 {
     213         190 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     214             :                 }
     215             :             }
     216         274 :             break;
     217             : 
     218          10 :             case UTF8STRINGVIEWTOKENIZER_MANT_POINT:
     219             :             {
     220          10 :                 if (( probe >= '0' )&&( probe <= '9' ))
     221             :                 {
     222           7 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT;
     223           7 :                     valid_len = probe_idx+1;
     224             :                 }
     225           3 :                 else if (( probe == 'e' )||( probe == 'E' ))
     226             :                 {
     227           2 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     228             :                 }
     229             :                 else
     230             :                 {
     231           1 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     232             :                 }
     233             :             }
     234          10 :             break;
     235             : 
     236          12 :             case UTF8STRINGVIEWTOKENIZER_MANT_FRACT:
     237             :             {
     238          12 :                 if (( probe >= '0' )&&( probe <= '9' ))
     239             :                 {
     240             :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT; */
     241           5 :                     valid_len = probe_idx+1;
     242             :                 }
     243           7 :                 else if (( probe == 'e' )||( probe == 'E' ))
     244             :                 {
     245           2 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     246             :                 }
     247             :                 else
     248             :                 {
     249           5 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     250             :                 }
     251             :             }
     252          12 :             break;
     253             : 
     254          10 :             case UTF8STRINGVIEWTOKENIZER_BASE:
     255             :             {
     256          10 :                 if (( probe == '+' )||( probe == '-' ))
     257             :                 {
     258           4 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_SIGN;
     259             :                 }
     260           6 :                 else if (( probe >= '0' )&&( probe <= '9' ))
     261             :                 {
     262           4 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
     263           4 :                     valid_len = probe_idx+1;
     264             :                 }
     265             :                 else
     266             :                 {
     267           2 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     268             :                 }
     269             :             }
     270          10 :             break;
     271             : 
     272           4 :             case UTF8STRINGVIEWTOKENIZER_EXP_SIGN:
     273             :             {
     274           4 :                 if (( probe >= '0' )&&( probe <= '9' ))
     275             :                 {
     276           3 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
     277           3 :                     valid_len = probe_idx+1;
     278             :                 }
     279             :                 else
     280             :                 {
     281           1 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     282             :                 }
     283             :             }
     284           4 :             break;
     285             : 
     286           9 :             case UTF8STRINGVIEWTOKENIZER_EXP_INT:
     287             :             {
     288           9 :                 if (( probe >= '0' )&&( probe <= '9' ))
     289             :                 {
     290             :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_EXP_INT; */
     291           3 :                     valid_len = probe_idx+1;
     292             :                 }
     293             :                 else
     294             :                 {
     295           6 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     296             :                 }
     297             :             }
     298           9 :             break;
     299             : 
     300           4 :             case UTF8STRINGVIEWTOKENIZER_INFINITY:
     301             :             {
     302             :                 /* the first character has already been processed */
     303           4 :                 const size_t infinity_len = strlen("nfinity");
     304           4 :                 if ( (probe_idx + infinity_len) <= len )
     305             :                 {
     306           4 :                     const bool is_infinity
     307           4 :                         = ( 0 == memcmp( start+probe_idx, "nfinity", infinity_len ) )
     308           4 :                         ||( 0 == memcmp( start+probe_idx, "NFINITY", infinity_len ) );
     309           4 :                     if (is_infinity)
     310             :                     {
     311           3 :                         valid_len = probe_idx+infinity_len;
     312             :                     }
     313             :                 }
     314           4 :                 state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     315             :             }
     316             : 
     317           6 :             case UTF8STRINGVIEWTOKENIZER_NAN:
     318             :             {
     319             :                 /* the first character has already been processed */
     320           6 :                 const size_t nan_len = strlen("aN");
     321           6 :                 if ( (probe_idx + nan_len) <= len )
     322             :                 {
     323           6 :                     const bool is_nan
     324           6 :                         = ( 0 == memcmp( start+probe_idx, "an", nan_len ) )
     325           5 :                         ||( 0 == memcmp( start+probe_idx, "aN", nan_len ) )
     326          11 :                         ||( 0 == memcmp( start+probe_idx, "AN", nan_len ) );
     327           6 :                     if (is_nan)
     328             :                     {
     329           2 :                         valid_len = probe_idx+nan_len;
     330             :                     }
     331             :                 }
     332           6 :                 state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     333             :             }
     334             : 
     335           6 :             case UTF8STRINGVIEWTOKENIZER_END_INT:  /* finished, no further processign... */
     336             :             case UTF8STRINGVIEWTOKENIZER_END_FLOAT:
     337             :             {
     338             :                 /* finished, the for loop is ended */
     339             :             }
     340           6 :             break;
     341             :         }
     342             :     }
     343         415 :     return valid_len;
     344             : }
     345             : 
     346         651 : static inline void utf8stringviewtokenizer_private_skip_space ( utf8stringviewtokenizer_t *this_ )
     347             : {
     348         651 :     const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
     349         651 :     size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
     350         873 :     while ( ( len > 0 ) && ( utf8stringviewtokenizer_private_is_space( this_, *start ) ) )
     351             :     {
     352         222 :         if ( *start == '\n' )
     353             :         {
     354          32 :             (*this_).current_line ++;
     355             :         }
     356         222 :         len --;
     357         222 :         start ++;
     358             :     }
     359         651 :     (*this_).remaining_input_text = UTF8STRINGVIEW(start,len);
     360         651 : }
     361             : 
     362             : #ifdef __cplusplus
     363             : }
     364             : #endif
     365             : 
     366             : 
     367             : /*
     368             : Copyright 2023-2024 Andreas Warnke
     369             : 
     370             : Licensed under the Apache License, Version 2.0 (the "License");
     371             : you may not use this file except in compliance with the License.
     372             : You may obtain a copy of the License at
     373             : 
     374             :     http://www.apache.org/licenses/LICENSE-2.0
     375             : 
     376             : Unless required by applicable law or agreed to in writing, software
     377             : distributed under the License is distributed on an "AS IS" BASIS,
     378             : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     379             : See the License for the specific language governing permissions and
     380             : limitations under the License.
     381             : */

Generated by: LCOV version 1.16