LCOV - code coverage report
Current view: top level - u8stream/include/utf8stringbuf - utf8stringlines.inl (source / functions) Coverage Total Hit
Test: crystal-facet-uml_v1.64.2_covts Lines: 100.0 % 67 67
Test Date: 2025-06-21 07:03:58 Functions: 100.0 % 7 7

            Line data    Source code
       1              : /* File: utf8stringlines.inl; Copyright and License: see below */
       2              : 
       3              : #ifdef __cplusplus
       4              : extern "C" {
       5              : #endif
       6              : 
       7           11 : static inline void utf8stringlines_init ( utf8stringlines_t *this_, const utf8stringview_t *lines_list, uint32_t line_length )
       8              : {
                            /* Sets up the line iterator: stores a copy of the view *lines_list as the text still */
                            /* to be split and line_length as the code point limit that is checked per line. */
                            /* Only lines_list is checked by assert; this_ is dereferenced unchecked. */
       9           11 :     assert( lines_list != NULL );
      10           11 :     (*this_).remaining = *lines_list;
      11           11 :     (*this_).line_length = line_length;
      12           11 :     (*this_).next_is_end = false;
      13           11 :     (*this_).has_next = true;
                            /* pre-compute the first line; this also assigns (*this_).next */
      14           11 :     utf8stringlines_private_step_to_next( this_ );
      15           11 : }
      16              : 
      17           11 : static inline void utf8stringlines_destroy ( utf8stringlines_t *this_ )
      18              : {
                            /* Counterpart to utf8stringlines_init; the body is intentionally empty, */
                            /* this function neither reads nor modifies *this_. */
      19           11 : }
      20              : 
      21           46 : static inline bool utf8stringlines_has_next ( const utf8stringlines_t *this_ )
      22              : {
                            /* Returns the has_next flag: true after init, false once */
                            /* utf8stringlines_private_step_to_next has run past the last line. */
      23           46 :     return (*this_).has_next;
      24              : }
      25              : 
      26           29 : static inline utf8stringview_t utf8stringlines_next ( utf8stringlines_t *this_ )
      27              : {
                            /* Returns the pre-computed line and advances the iterator to the following one. */
                            /* has_next is not checked here; once the end was reached, the stored view is */
                            /* UTF8STRINGVIEW_EMPTY and that is what gets returned. */
      28           29 :     utf8stringview_t result = (*this_).next;
      29           29 :     utf8stringlines_private_step_to_next( this_ );
      30           29 :     return result;
      31              : }
      32              : 
      33           40 : static inline void utf8stringlines_private_step_to_next ( utf8stringlines_t *this_ )
      34              : {
                            /* Computes the view that the following utf8stringlines_next call returns: */
                            /* cuts one line off the front of (*this_).remaining, stores it in (*this_).next */
                            /* and updates the next_is_end and has_next flags. */
      35           40 :     if ( (*this_).next_is_end )
      36              :     {
                                /* the last line was already handed out by the previous step */
      37           16 :         (*this_).has_next = false;
      38           16 :         (*this_).next = UTF8STRINGVIEW_EMPTY;
      39              :     }
      40              :     else
      41              :     {
      42              :         /* search for a good line end */
                                /* line_end_pos: byte index where the line is cut; 0 means no cut position decided yet */
      43           24 :         uint_fast32_t line_end_pos = 0;
                                /* a_good_pos: byte index directly behind the latest break opportunity; 0 means none seen yet */
      44           24 :         uint_fast32_t a_good_pos = 0;
                                /* codepoints: number of bytes seen so far that are not utf8 continuation bytes */
      45           24 :         uint_fast32_t codepoints = 0;
      46           24 :         bool force_next_line = false;  /* a \n line break enforces a next line even if that is empty */
      47           24 :         const char *start = utf8stringview_get_start( &((*this_).remaining) );
      48           24 :         const size_t len = utf8stringview_get_length( &((*this_).remaining) );
                                /* scan byte by byte; the loop stops as soon as a cut position is decided */
      49          261 :         for ( uint_fast32_t probe_idx = 0; ( probe_idx < len )&&( line_end_pos == 0 ); probe_idx ++ )
      50              :         {
      51              :             /* note: the signedness of char is platform dependent, e.g. the result of some_char==0xff differs between platforms */
      52          237 :             unsigned char probe = start[probe_idx] & 0xff;  /* set higher bytes to 0 */
      53              : 
      54              :             /* analyze the current character */
      55          237 :             if (( 0xc0 & probe ) == 0x80 )
      56              :             {
      57              :                 /* This is not a first byte of an utf8-character byte sequence; check for asian line break possibility */
      58           61 :                 if ( probe_idx >= 2 )
      59              :                 {
      60           57 :                     unsigned char prelast = start[probe_idx-2] & 0xff;  /* set higher bytes to 0 */
      61           57 :                     unsigned char last = start[probe_idx-1] & 0xff;  /* set higher bytes to 0 */
      62           57 :                     if ( utf8stringlines_private_is_ideographic_comma( this_, prelast, last, probe ) )
      63              :                     {
                                                /* break opportunity directly behind the 3-byte sequence */
      64            3 :                         a_good_pos = probe_idx + 1;
      65              :                     }
      66              :                 }
      67              :             }
      68              :             else
      69              :             {
      70          176 :                 codepoints ++;
      71          176 :                 if ( utf8stringlines_private_is_space( this_, probe ) )
      72              :                 {
      73              :                     /* this can only happen for 1-byte code points */
                                            /* break opportunity directly behind the space, the space stays on the current line */
      74           24 :                     a_good_pos = probe_idx + 1;
      75              :                 }
      76              :             }
      77              : 
      78              :             /* evaluate if this is a good cutting point */
      79          237 :             if ( probe == '\n' )
      80              :             {
                                        /* a line break always cuts, the \n itself stays part of the current line */
      81            5 :                 line_end_pos = probe_idx + 1;
      82            5 :                 force_next_line = true;
      83              :             }
      84          232 :             else if ( codepoints >= (*this_).line_length )
      85              :             {
      86              :                 /* we have reached or exceeded the limit */
      87              :                 /* take the best we have till now; */
                                        /* if a_good_pos is still 0, the scan continues and the line grows beyond line_length */
                                        /* until a break opportunity, a \n or the end of the text is found */
      88           32 :                 line_end_pos = a_good_pos;
      89              :             }
      90              :         }
      91              : 
      92              :         /* cut stringview at good line end position */
      93           24 :         if ( line_end_pos != 0 )
      94              :         {
      95           14 :             utf8stringview_t before = UTF8STRINGVIEW_EMPTY;
      96           14 :             utf8stringview_t after = UTF8STRINGVIEW_EMPTY;
      97              : 
      98           14 :             utf8error_t err1 = utf8stringview_init_region( &before, start, 0 /*start_idx*/, line_end_pos /*length*/ );
      99           14 :             assert( err1 == UTF8ERROR_SUCCESS );
     100              :             (void) err1;  /* ok to ignore an error - should not happen */
     101           14 :             utf8error_t err2 = utf8stringview_init_region( &after, start, line_end_pos /*start_idx*/, len-line_end_pos /*length*/ );
     102           14 :             assert( err2 == UTF8ERROR_SUCCESS );
     103              :             (void) err2;  /* ok to ignore an error - should not happen */
     104              : 
                                    /* this is the last line only if nothing remains and no \n requested a following line; */
                                    /* so a text ending with \n is followed by one more, empty line */
     105           14 :             (*this_).next_is_end = ( 0 == utf8stringview_get_length( &after ))&&( ! force_next_line );
     106           14 :             (*this_).next = before;
     107           14 :             (*this_).remaining = after;
     108              :         }
     109              :         else
     110              :         {
     111              :             /* no suitable line end found */
                                    /* the whole remaining text (possibly empty) becomes the last line */
     112           10 :             (*this_).next_is_end = true;
     113           10 :             (*this_).next = (*this_).remaining;
     114           10 :             (*this_).remaining = UTF8STRINGVIEW_EMPTY;
     115              :         }
     116              :     }
     117           40 : }
     118              : 
     119          176 : static inline bool utf8stringlines_private_is_space( utf8stringlines_t *this_, unsigned char ascii )
     120              : {
                            /* Returns true if the byte is a control character or a space; this_ is not used here. */
     121              :     /* 0x00 - 0x1f are control chars like line break and tab, 0x20 is space, 0x7f is a control character */
     122          176 :     return ( ascii <= 0x20 )||( ascii == 0x7f );
     123              : }
     124              : 
     125           57 : static inline bool utf8stringlines_private_is_ideographic_comma( utf8stringlines_t *this_,
     126              :                                                                  unsigned char utf8_first,
     127              :                                                                  unsigned char utf8_second,
     128              :                                                                  unsigned char utf8_third )
     129              : {
                            /* Returns true if the three bytes equal the utf8 encoding of one of the code points listed below */
                            /* (despite the function name, this includes a space and full stops); this_ is not used here. */
     130              :     /* note: a full coverage of unicode is more complicated, */
     131              :     /* see https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations */
     132              :     /* this function only covers a small set of use cases: */
     133              :     /* U+3000 IDEOGRAPHIC SPACE (maybe not needed?) */
     134           57 :     const bool is_ideo_space = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x80 );
     135              :     /* U+3001 IDEOGRAPHIC COMMA and U+3002 IDEOGRAPHIC FULL STOP (both seem frequently used) */
     136           57 :     const bool is_ideo_comma = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x81 );
     137           57 :     const bool is_ideo_fullstop = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x82 );
     138              :     /* U+FF0C FULLWIDTH COMMA and U+FF0E FULLWIDTH FULL STOP (both seem frequently used) */
     139           57 :     const bool is_full_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8c );
     140           57 :     const bool is_full_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8e );
     141              :     /* U+FF64 HALFWIDTH IDEOGRAPHIC COMMA and U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP (maybe not needed?) */
     142           57 :     const bool is_half_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa4 );
     143           57 :     const bool is_half_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa1 );
     144           57 :     return is_ideo_space || is_ideo_comma || is_ideo_fullstop || is_full_comma || is_full_fullstop || is_half_comma || is_half_fullstop;
     145              : }
     146              : 
     147              : #ifdef __cplusplus
     148              : }
     149              : #endif
     150              : 
     151              : 
     152              : /*
     153              : Copyright 2025-2025 Andreas Warnke
     154              : 
     155              : Licensed under the Apache License, Version 2.0 (the "License");
     156              : you may not use this file except in compliance with the License.
     157              : You may obtain a copy of the License at
     158              : 
     159              :     http://www.apache.org/licenses/LICENSE-2.0
     160              : 
     161              : Unless required by applicable law or agreed to in writing, software
     162              : distributed under the License is distributed on an "AS IS" BASIS,
     163              : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     164              : See the License for the specific language governing permissions and
     165              : limitations under the License.
     166              : */
        

Generated by: LCOV version 2.0-1