LCOV - code coverage report
Current view: top level - u8stream/include/utf8stringbuf - utf8stringviewtokenizer.inl (source / functions) Coverage Total Hit
Test: crystal-facet-uml_v1.63.2_covts Lines: 100.0 % 164 164
Test Date: 2025-05-01 10:10:14 Functions: 100.0 % 11 11

            Line data    Source code
       1              : /* File: utf8stringviewtokenizer.inl; Copyright and License: see below */
       2              : 
       3              : #ifdef __cplusplus
       4              : extern "C" {
       5              : #endif
       6              : 
       7           43 : static inline void utf8stringviewtokenizer_init ( utf8stringviewtokenizer_t *this_,
       8              :                                                   const utf8stringview_t *input_text,
       9              :                                                   utf8stringviewtokenmode_t mode )
      10              : {
      11           43 :     assert( input_text != NULL );
      12           43 :     (*this_).remaining_input_text = *input_text;
      13           43 :     (*this_).mode = mode;
      14           43 :     (*this_).last_token_line = 0;
      15           43 :     (*this_).current_line = 1;
      16           43 :     utf8stringviewtokenizer_private_skip_space( this_ );
      17           43 : }
      18              : 
      19           43 : static inline void utf8stringviewtokenizer_destroy ( utf8stringviewtokenizer_t *this_ )
      20              : {
      21           43 : }
      22              : 
      23          598 : static inline bool utf8stringviewtokenizer_has_next ( const utf8stringviewtokenizer_t *this_ )
      24              : {
      25          598 :     return ( utf8stringview_get_length( &((*this_).remaining_input_text) ) != 0 );
      26              : }
      27              : 
      28           28 : static inline utf8stringviewtokenmode_t utf8stringviewtokenizer_get_mode ( utf8stringviewtokenizer_t *this_ )
      29              : {
      30           28 :     return (*this_).mode;
      31              : }
      32              : 
      33           58 : static inline void utf8stringviewtokenizer_set_mode ( utf8stringviewtokenizer_t *this_, utf8stringviewtokenmode_t mode )
      34              : {
      35           58 :     (*this_).mode = mode;
      36           58 : }
      37              : 
      38          621 : static inline utf8stringview_t utf8stringviewtokenizer_next ( utf8stringviewtokenizer_t *this_ )
      39              : {
      40              :     utf8stringview_t result;
      41          621 :     (*this_).last_token_line = (*this_).current_line;
      42          621 :     const char *const tok_start = utf8stringview_get_start( &((*this_).remaining_input_text) );
      43          621 :     const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
      44          621 :     if ( len > 0 )
      45              :     {
      46          613 :         size_t tok_len = 0;
      47          613 :         bool end_found = false;
      48              :         /* check for numbers */
      49          613 :         if ( (*this_).mode != UTF8STRINGVIEWTOKENMODE_TEXT )
      50              :         {
      51          415 :             const size_t num_len = utf8stringviewtokenizer_private_get_number_len( this_ );
      52          415 :             if ( num_len != 0 )
      53              :             {
      54          215 :                 tok_len = num_len;
      55          215 :                 end_found = true;
      56              :             }
      57              :         }
      58              :         /* check for special characters / standalone-tokens */
      59          613 :         if ( ! end_found )
      60              :         {
      61          398 :             const bool is_stanalone
      62          398 :                 = ( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ) || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[0] );
      63          398 :             if ( is_stanalone )
      64              :             {
      65          319 :                 tok_len = 1;
      66          319 :                 end_found = true;
      67              :             }
      68              :         }
      69              :         /* check for alphanumerical tokens, end when other token found */
      70          823 :         for ( size_t probe_idx = 1; ( probe_idx < len )&&( ! end_found ); probe_idx ++ )
      71              :         {
      72              :             end_found
      73          210 :                 = utf8stringviewtokenizer_private_is_space( this_, tok_start[probe_idx] )
      74          210 :                 || utf8stringviewtokenizer_private_is_standalone( this_, tok_start[probe_idx] );
      75          210 :             tok_len = probe_idx;
      76              :         }
      77              :         /* determine result */
      78          613 :         if ( end_found )
      79              :         {
      80          608 :             result = UTF8STRINGVIEW(tok_start,tok_len);
      81          608 :             (*this_).remaining_input_text = UTF8STRINGVIEW(tok_start+tok_len,len-tok_len);
      82          608 :             utf8stringviewtokenizer_private_skip_space( this_ );
      83              :         }
      84              :         else
      85              :         {
      86            5 :             result = (*this_).remaining_input_text;
      87            5 :             (*this_).remaining_input_text = UTF8STRINGVIEW_EMPTY;
      88              :         }
      89              :     }
      90              :     else
      91              :     {
      92            8 :         result = UTF8STRINGVIEW_EMPTY;
      93              :     }
      94          621 :     return result;
      95              : }
      96              : 
      97           11 : static inline uint32_t utf8stringviewtokenizer_get_line ( const utf8stringviewtokenizer_t *this_ )
      98              : {
      99           11 :     return (*this_).last_token_line;
     100              : }
     101              : 
     102         1052 : static inline bool utf8stringviewtokenizer_private_is_space( utf8stringviewtokenizer_t *this_, char ascii )
     103              : {
     104         1052 :     const unsigned char u_asc = (unsigned char) ascii;
     105              :     /* 0x0 - 0x19 are control chars like line break and tab, 0x20 is space, 0x7f is a control character */
     106         1052 :     return ( u_asc <= 0x20 )||( u_asc == 0x7f );
     107              : }
     108              : 
     109          416 : static inline bool utf8stringviewtokenizer_private_is_standalone( utf8stringviewtokenizer_t *this_, char ascii )
     110              : {
     111          416 :     const unsigned char u_asc = (unsigned char) ascii;
     112          416 :     return (( u_asc >= 0x21 )&&( u_asc <= 0x2f )) || (( u_asc >= 0x3a )&&( u_asc <= 0x40 )) || (( u_asc >= 0x5b )&&( u_asc <= 0x5e ))
     113          832 :         || (( u_asc == 0x60 )) || (( u_asc >= 0x7b )&&( u_asc <= 0x7e ));
     114              : }
     115              : 
     116              : enum utf8stringviewtokenizer_private_number_passed_enum {
     117              :     UTF8STRINGVIEWTOKENIZER_INIT = 0,  /*!< nothing passed yet */
     118              :     UTF8STRINGVIEWTOKENIZER_MANT_SIGN = 1,  /*!< sign of mantissa passed */
     119              :     UTF8STRINGVIEWTOKENIZER_MANT_INT = 2,  /*!< some integer portion of mantissa passed */
     120              :     UTF8STRINGVIEWTOKENIZER_END_INT = 3,  /*!< whatever valid integer number might have been processed, we are beyond now */
     121              :     UTF8STRINGVIEWTOKENIZER_MANT_POINT = 4,  /*!< decimal point of mantissa passed */
     122              :     UTF8STRINGVIEWTOKENIZER_MANT_FRACT = 5,  /*!< some fraction of mantissa passed */
     123              :     UTF8STRINGVIEWTOKENIZER_BASE = 6,  /*!< base e identifier passed */
     124              :     UTF8STRINGVIEWTOKENIZER_EXP_SIGN = 7,  /*!< sign of exponent passed */
     125              :     UTF8STRINGVIEWTOKENIZER_EXP_INT = 8,  /*!< some integer portion of exponent passed */
     126              :     UTF8STRINGVIEWTOKENIZER_INFINITY = 9,  /*!< name of Infinity being processed */
     127              :     UTF8STRINGVIEWTOKENIZER_NAN = 10,  /*!< name of NaN being processed */
     128              :     UTF8STRINGVIEWTOKENIZER_END_FLOAT = 11,  /*!< whatever valid float number might have been processed, we are beyond now */
     129              : };
     130              : 
     131          415 : static inline size_t utf8stringviewtokenizer_private_get_number_len( utf8stringviewtokenizer_t *this_ )
     132              : {
     133          415 :     const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
     134          415 :     const size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
     135          415 :     enum utf8stringviewtokenizer_private_number_passed_enum state = UTF8STRINGVIEWTOKENIZER_INIT;
     136          415 :     const bool float_mode = (( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT )||( (*this_).mode == UTF8STRINGVIEWTOKENMODE_FLOAT_ONLY ));
     137          415 :     const enum utf8stringviewtokenizer_private_number_passed_enum end_state
     138          415 :         = float_mode ? UTF8STRINGVIEWTOKENIZER_END_FLOAT : UTF8STRINGVIEWTOKENIZER_END_INT;
     139          415 :     size_t valid_len = 0;
     140              : 
     141         1205 :     for ( size_t probe_idx = 0; ( probe_idx < len )&&( state < end_state ); probe_idx ++ )
     142              :     {
     143          790 :         char probe = start[probe_idx];
     144              :         /* printf("state:%i\n",state); */
     145          790 :         switch ( state )
     146              :         {
     147          415 :             case UTF8STRINGVIEWTOKENIZER_INIT:
     148              :             {
     149          415 :                 if (( probe == '+' )||( probe == '-' ))
     150              :                 {
     151           50 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_SIGN;
     152              :                 }
     153          365 :                 else if (( probe >= '0' )&&( probe <= '9' ))
     154              :                 {
     155          169 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
     156          169 :                     valid_len = probe_idx+1;
     157              :                 }
     158          196 :                 else if (( probe == 'i' )||( probe == 'I' ))
     159              :                 {
     160            5 :                     state = UTF8STRINGVIEWTOKENIZER_INFINITY;
     161              :                 }
     162          191 :                 else if (( probe == 'n' )||( probe == 'N' ))
     163              :                 {
     164            2 :                     state = UTF8STRINGVIEWTOKENIZER_NAN;
     165              :                 }
     166              :                 else
     167              :                 {
     168          189 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     169              :                 }
     170              :             }
     171          415 :             break;
     172              : 
     173           50 :             case UTF8STRINGVIEWTOKENIZER_MANT_SIGN:
     174              :             {
     175           50 :                 if (( probe >= '0' )&&( probe <= '9' ))
     176              :                 {
     177           41 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_INT;
     178           41 :                     valid_len = probe_idx+1;
     179              :                 }
     180            9 :                 else if (( probe == 'i' )||( probe == 'I' ))
     181              :                 {
     182            3 :                     state = UTF8STRINGVIEWTOKENIZER_INFINITY;
     183              :                 }
     184              :                 else
     185              :                 {
     186              :                     /* a mantissa has to have at least i digit in the integer portion */
     187            6 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     188              :                 }
     189              :             }
     190           50 :             break;
     191              : 
     192          274 :             case UTF8STRINGVIEWTOKENIZER_MANT_INT:
     193              :             {
     194          274 :                 if (( probe >= '0' )&&( probe <= '9' ))
     195              :                 {
     196              :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_INT; */
     197           64 :                     valid_len = probe_idx+1;
     198              :                 }
     199          210 :                 else if ( probe == '.' )
     200              :                 {
     201           12 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_POINT;
     202           12 :                     if ( float_mode )  /* do not consider the point as valid in integer mode */
     203              :                     {
     204           10 :                         valid_len = probe_idx+1;
     205              :                     }
     206              :                 }
     207          198 :                 else if (( probe == 'e' )||( probe == 'E' ))
     208              :                 {
     209            8 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     210              :                 }
     211              :                 else
     212              :                 {
     213          190 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     214              :                 }
     215              :             }
     216          274 :             break;
     217              : 
     218           10 :             case UTF8STRINGVIEWTOKENIZER_MANT_POINT:
     219              :             {
     220           10 :                 if (( probe >= '0' )&&( probe <= '9' ))
     221              :                 {
     222            7 :                     state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT;
     223            7 :                     valid_len = probe_idx+1;
     224              :                 }
     225            3 :                 else if (( probe == 'e' )||( probe == 'E' ))
     226              :                 {
     227            2 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     228              :                 }
     229              :                 else
     230              :                 {
     231            1 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     232              :                 }
     233              :             }
     234           10 :             break;
     235              : 
     236           12 :             case UTF8STRINGVIEWTOKENIZER_MANT_FRACT:
     237              :             {
     238           12 :                 if (( probe >= '0' )&&( probe <= '9' ))
     239              :                 {
     240              :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_MANT_FRACT; */
     241            5 :                     valid_len = probe_idx+1;
     242              :                 }
     243            7 :                 else if (( probe == 'e' )||( probe == 'E' ))
     244              :                 {
     245            2 :                     state = UTF8STRINGVIEWTOKENIZER_BASE;
     246              :                 }
     247              :                 else
     248              :                 {
     249            5 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     250              :                 }
     251              :             }
     252           12 :             break;
     253              : 
     254           10 :             case UTF8STRINGVIEWTOKENIZER_BASE:
     255              :             {
     256           10 :                 if (( probe == '+' )||( probe == '-' ))
     257              :                 {
     258            4 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_SIGN;
     259              :                 }
     260            6 :                 else if (( probe >= '0' )&&( probe <= '9' ))
     261              :                 {
     262            4 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
     263            4 :                     valid_len = probe_idx+1;
     264              :                 }
     265              :                 else
     266              :                 {
     267            2 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     268              :                 }
     269              :             }
     270           10 :             break;
     271              : 
     272            4 :             case UTF8STRINGVIEWTOKENIZER_EXP_SIGN:
     273              :             {
     274            4 :                 if (( probe >= '0' )&&( probe <= '9' ))
     275              :                 {
     276            3 :                     state = UTF8STRINGVIEWTOKENIZER_EXP_INT;
     277            3 :                     valid_len = probe_idx+1;
     278              :                 }
     279              :                 else
     280              :                 {
     281            1 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     282              :                 }
     283              :             }
     284            4 :             break;
     285              : 
     286            9 :             case UTF8STRINGVIEWTOKENIZER_EXP_INT:
     287              :             {
     288            9 :                 if (( probe >= '0' )&&( probe <= '9' ))
     289              :                 {
     290              :                     /* stay in state = UTF8STRINGVIEWTOKENIZER_EXP_INT; */
     291            3 :                     valid_len = probe_idx+1;
     292              :                 }
     293              :                 else
     294              :                 {
     295            6 :                     state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     296              :                 }
     297              :             }
     298            9 :             break;
     299              : 
     300            4 :             case UTF8STRINGVIEWTOKENIZER_INFINITY:
     301              :             {
     302              :                 /* the first character has already been processed */
     303            4 :                 const size_t infinity_len = strlen("nfinity");
     304            4 :                 if ( (probe_idx + infinity_len) <= len )
     305              :                 {
     306            4 :                     const bool is_infinity
     307            4 :                         = ( 0 == memcmp( start+probe_idx, "nfinity", infinity_len ) )
     308            4 :                         ||( 0 == memcmp( start+probe_idx, "NFINITY", infinity_len ) );
     309            4 :                     if (is_infinity)
     310              :                     {
     311            3 :                         valid_len = probe_idx+infinity_len;
     312              :                     }
     313              :                 }
     314            4 :                 state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     315              :             }
     316              : 
     317            6 :             case UTF8STRINGVIEWTOKENIZER_NAN:
     318              :             {
     319              :                 /* the first character has already been processed */
     320            6 :                 const size_t nan_len = strlen("aN");
     321            6 :                 if ( (probe_idx + nan_len) <= len )
     322              :                 {
     323            6 :                     const bool is_nan
     324            6 :                         = ( 0 == memcmp( start+probe_idx, "an", nan_len ) )
     325            5 :                         ||( 0 == memcmp( start+probe_idx, "aN", nan_len ) )
     326           11 :                         ||( 0 == memcmp( start+probe_idx, "AN", nan_len ) );
     327            6 :                     if (is_nan)
     328              :                     {
     329            2 :                         valid_len = probe_idx+nan_len;
     330              :                     }
     331              :                 }
     332            6 :                 state = UTF8STRINGVIEWTOKENIZER_END_FLOAT;
     333              :             }
     334              : 
     335            6 :             case UTF8STRINGVIEWTOKENIZER_END_INT:  /* finished, no further processign... */
     336              :             case UTF8STRINGVIEWTOKENIZER_END_FLOAT:
     337              :             {
     338              :                 /* finished, the for loop is ended */
     339              :             }
     340            6 :             break;
     341              :         }
     342              :     }
     343          415 :     return valid_len;
     344              : }
     345              : 
     346          651 : static inline void utf8stringviewtokenizer_private_skip_space ( utf8stringviewtokenizer_t *this_ )
     347              : {
     348          651 :     const char *start = utf8stringview_get_start( &((*this_).remaining_input_text) );
     349          651 :     size_t len = utf8stringview_get_length( &((*this_).remaining_input_text) );
     350          873 :     while ( ( len > 0 ) && ( utf8stringviewtokenizer_private_is_space( this_, *start ) ) )
     351              :     {
     352          222 :         if ( *start == '\n' )
     353              :         {
     354           32 :             (*this_).current_line ++;
     355              :         }
     356          222 :         len --;
     357          222 :         start ++;
     358              :     }
     359          651 :     (*this_).remaining_input_text = UTF8STRINGVIEW(start,len);
     360          651 : }
     361              : 
     362              : #ifdef __cplusplus
     363              : }
     364              : #endif
     365              : 
     366              : 
     367              : /*
     368              : Copyright 2023-2025 Andreas Warnke
     369              : 
     370              : Licensed under the Apache License, Version 2.0 (the "License");
     371              : you may not use this file except in compliance with the License.
     372              : You may obtain a copy of the License at
     373              : 
     374              :     http://www.apache.org/licenses/LICENSE-2.0
     375              : 
     376              : Unless required by applicable law or agreed to in writing, software
     377              : distributed under the License is distributed on an "AS IS" BASIS,
     378              : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     379              : See the License for the specific language governing permissions and
     380              : limitations under the License.
     381              : */
        

Generated by: LCOV version 2.0-1