LCOV - code coverage report
Current view: top level - u8stream/include/utf8stringbuf - utf8stringlines.inl (source / functions) Hit Total Coverage
Test: crystal-facet-uml_v1.65.6_covts Lines: 67 67 100.0 %
Date: 2025-09-25 21:07:53 Functions: 7 7 100.0 %

          Line data    Source code
       1             : /* File: utf8stringlines.inl; Copyright and License: see below */
       2             : 
       3             : #ifdef __cplusplus
       4             : extern "C" {
       5             : #endif
       6             : 
       7          11 : static inline void utf8stringlines_init ( utf8stringlines_t *this_, const utf8stringview_t *lines_list, uint32_t line_length )
       8             : {
                           /* Initializes the line iterator this_. */
                           /* lines_list: the view to split; it is copied by value into the remaining field, must not be NULL (asserted). */
                           /* line_length: stored as is; the step function compares its count of code points against this value. */
                           /* this_ itself is dereferenced without a NULL check. */
       9          11 :     assert( lines_list != NULL );
      10          11 :     (*this_).remaining = *lines_list;
      11          11 :     (*this_).line_length = line_length;
      12          11 :     (*this_).next_is_end = false;
      13          11 :     (*this_).has_next = true;
                           /* pre-compute the first line; because has_next starts as true and next_is_end as false, */
                           /* even a zero-length input yields exactly one (empty) line before has_next is cleared */
      14          11 :     utf8stringlines_private_step_to_next( this_ );
      15          11 : }
      16             : 
      17          11 : static inline void utf8stringlines_destroy ( utf8stringlines_t *this_ )
      18             : {
                           /* Counterpart to utf8stringlines_init; the body is empty, so this_ is left untouched. */
      19          11 : }
      20             : 
      21          46 : static inline bool utf8stringlines_has_next ( const utf8stringlines_t *this_ )
      22             : {
                           /* Returns the has_next flag: true after init, false once the step function has run past the last line. */
      23          46 :     return (*this_).has_next;
      24             : }
      25             : 
      26          29 : static inline utf8stringview_t utf8stringlines_next ( utf8stringlines_t *this_ )
      27             : {
                           /* Returns the line that was pre-computed by the previous step and advances the iterator by one line. */
                           /* has_next is not checked here; once the iterator is exhausted, the stored next view is */
                           /* UTF8STRINGVIEW_EMPTY and that is what gets returned. */
      28          29 :     utf8stringview_t result = (*this_).next;
      29          29 :     utf8stringlines_private_step_to_next( this_ );
      30          29 :     return result;
      31             : }
      32             : 
      33          40 : static inline void utf8stringlines_private_step_to_next ( utf8stringlines_t *this_ )
      34             : {
                           /* Computes the next line: cuts a prefix off the remaining view and stores it in the next field. */
                           /* A line ends directly after a '\n', or - once line_length code points have been seen - */
                           /* at the last break opportunity found so far (after a space/control byte or after one of the */
                           /* 3-byte punctuation characters accepted by utf8stringlines_private_is_ideographic_comma). */
                           /* If no cut position is found, the whole remaining view becomes the last line. */
      35          40 :     if ( (*this_).next_is_end )
      36             :     {
                               /* the previous step already handed out the last line: mark the iterator as exhausted */
      37          16 :         (*this_).has_next = false;
      38          16 :         (*this_).next = UTF8STRINGVIEW_EMPTY;
      39             :     }
      40             :     else
      41             :     {
      42             :         /* search good line end */
                               /* line_end_pos: byte offset at which to cut; 0 doubles as "not decided yet" and ends the loop when set */
                               /* a_good_pos: byte offset just behind the latest break opportunity; 0 means none seen so far */
                               /* codepoints: number of bytes seen that are not utf8 continuation bytes (10xxxxxx) */
      43          24 :         uint_fast32_t line_end_pos = 0;
      44          24 :         uint_fast32_t a_good_pos = 0;
      45          24 :         uint_fast32_t codepoints = 0;
      46          24 :         bool force_next_line = false;  /* a \n line break enforces a next line even if that is empty */
      47          24 :         const char *start = utf8stringview_get_start( &((*this_).remaining) );
      48          24 :         const size_t len = utf8stringview_get_length( &((*this_).remaining) );
                               /* NOTE(review): the indices are uint_fast32_t while len is size_t; confirm that views never exceed */
                               /* the range of uint_fast32_t on platforms where that type is narrower than size_t */
      49         261 :         for ( uint_fast32_t probe_idx = 0; ( probe_idx < len )&&( line_end_pos == 0 ); probe_idx ++ )
      50             :         {
      51             :             /* note: the signedness of plain char is platform dependent, e.g. some_char==0xff is not a portable comparison */
      52         237 :             unsigned char probe = start[probe_idx] & 0xff;  /* set higher bytes to 0 */
      53             : 
      54             :             /* analyze the current character */
      55         237 :             if (( 0xc0 & probe ) == 0x80 )
      56             :             {
      57             :                 /* This is not a first byte of a utf8-character byte sequence; check for asian line break possibility */
      58          61 :                 if ( probe_idx >= 2 )
      59             :                 {
      60          57 :                     unsigned char prelast = start[probe_idx-2] & 0xff;  /* set higher bytes to 0 */
      61          57 :                     unsigned char last = start[probe_idx-1] & 0xff;  /* set higher bytes to 0 */
      62          57 :                     if ( utf8stringlines_private_is_ideographic_comma( this_, prelast, last, probe ) )
      63             :                     {
                                               /* probe is the third byte of the punctuation character: allow a cut directly behind it */
      64           3 :                         a_good_pos = probe_idx + 1;
      65             :                     }
      66             :                 }
      67             :             }
      68             :             else
      69             :             {
      70         176 :                 codepoints ++;
      71         176 :                 if ( utf8stringlines_private_is_space( this_, probe ) )
      72             :                 {
      73             :                     /* this can only happen for 1-byte code points */
                                           /* the cut goes behind the space, so the space stays at the end of the current line */
      74          24 :                     a_good_pos = probe_idx + 1;
      75             :                 }
      76             :             }
      77             : 
      78             :             /* evaluate if this is a good cutting point */
      79         237 :             if ( probe == '\n' )
      80             :             {
                                       /* a line break always ends the line, the '\n' itself is part of the line that ends here */
      81           5 :                 line_end_pos = probe_idx + 1;
      82           5 :                 force_next_line = true;
      83             :             }
      84         232 :             else if ( codepoints >= (*this_).line_length )
      85             :             {
      86             :                 /* the limit is reached or exceeded */
      87             :                 /* take the best we have till now; if this is still 0, the search simply continues and the line gets longer */
      88          32 :                 line_end_pos = a_good_pos;
      89             :             }
      90             :         }
      91             : 
      92             :         /* cut stringview at good line end position */
      93          24 :         if ( line_end_pos != 0 )
      94             :         {
      95          14 :             utf8stringview_t before = UTF8STRINGVIEW_EMPTY;
      96          14 :             utf8stringview_t after = UTF8STRINGVIEW_EMPTY;
      97             : 
                                   /* before = bytes [0, line_end_pos), after = bytes [line_end_pos, len) */
      98          14 :             utf8error_t err1 = utf8stringview_init_region( &before, start, 0 /*start_idx*/, line_end_pos /*length*/ );
      99          14 :             assert( err1 == UTF8ERROR_SUCCESS );
     100             :             (void) err1;  /* ok to ignore an error - should not happen */
     101          14 :             utf8error_t err2 = utf8stringview_init_region( &after, start, line_end_pos /*start_idx*/, len-line_end_pos /*length*/ );
     102          14 :             assert( err2 == UTF8ERROR_SUCCESS );
     103             :             (void) err2;  /* ok to ignore an error - should not happen */
     104             : 
                                   /* if nothing is left and the cut was not caused by '\n', the line cut off here is the last one; */
                                   /* after a trailing '\n' one more step follows which delivers an empty last line */
     105          14 :             (*this_).next_is_end = ( 0 == utf8stringview_get_length( &after ))&&( ! force_next_line );
     106          14 :             (*this_).next = before;
     107          14 :             (*this_).remaining = after;
     108             :         }
     109             :         else
     110             :         {
     111             :             /* no suitable line end found */
                                   /* hand out everything that remains as the last line (this is also the path for a zero-length remainder) */
     112          10 :             (*this_).next_is_end = true;
     113          10 :             (*this_).next = (*this_).remaining;
     114          10 :             (*this_).remaining = UTF8STRINGVIEW_EMPTY;
     115             :         }
     116             :     }
     117          40 : }
     118             : 
     119         176 : static inline bool utf8stringlines_private_is_space( utf8stringlines_t *this_, unsigned char ascii )
     120             : {
                           /* Returns true if the byte ascii is a space or an ascii control character; this_ is not used. */
     121             :     /* 0x00 - 0x1f are control chars like line break and tab, 0x20 is space, 0x7f is a control character (DEL) */
     122         176 :     return ( ascii <= 0x20 )||( ascii == 0x7f );
     123             : }
     124             : 
     125          57 : static inline bool utf8stringlines_private_is_ideographic_comma( utf8stringlines_t *this_,
     126             :                                                                  unsigned char utf8_first,
     127             :                                                                  unsigned char utf8_second,
     128             :                                                                  unsigned char utf8_third )
     129             : {
                           /* Returns true if the three bytes utf8_first, utf8_second, utf8_third form one of the */
                           /* seven 3-byte utf8 sequences listed below; this_ is not used. */
     130             :     /* note: a full coverage of unicode is more complicated, */
     131             :     /* see https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations */
     132             :     /* this function only covers a small set of use cases: */
     133             :     /* U+3000 IDEOGRAPHIC SPACE = e3 80 80 (maybe not needed?) */
     134          57 :     const bool is_ideo_space = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x80 );
     135             :     /* U+3001 IDEOGRAPHIC COMMA = e3 80 81 and U+3002 IDEOGRAPHIC FULL STOP = e3 80 82 (both seem frequently used) */
     136          57 :     const bool is_ideo_comma = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x81 );
     137          57 :     const bool is_ideo_fullstop = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x82 );
     138             :     /* U+FF0C FULLWIDTH COMMA = ef bc 8c and U+FF0E FULLWIDTH FULL STOP = ef bc 8e (both seem frequently used) */
     139          57 :     const bool is_full_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8c );
     140          57 :     const bool is_full_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8e );
     141             :     /* U+FF64 HALFWIDTH IDEOGRAPHIC COMMA = ef bd a4 and U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP = ef bd a1 (maybe not needed?) */
     142          57 :     const bool is_half_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa4 );
     143          57 :     const bool is_half_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa1 );
     144          57 :     return is_ideo_space || is_ideo_comma || is_ideo_fullstop || is_full_comma || is_full_fullstop || is_half_comma || is_half_fullstop;
     145             : }
     146             : 
     147             : #ifdef __cplusplus
     148             : }
     149             : #endif
     150             : 
     151             : 
     152             : /*
     153             : Copyright 2025-2025 Andreas Warnke
     154             : 
     155             : Licensed under the Apache License, Version 2.0 (the "License");
     156             : you may not use this file except in compliance with the License.
     157             : You may obtain a copy of the License at
     158             : 
     159             :     http://www.apache.org/licenses/LICENSE-2.0
     160             : 
     161             : Unless required by applicable law or agreed to in writing, software
     162             : distributed under the License is distributed on an "AS IS" BASIS,
     163             : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     164             : See the License for the specific language governing permissions and
     165             : limitations under the License.
     166             : */

Generated by: LCOV version 1.16