Line data Source code
1 : /* File: utf8stringlines.inl; Copyright and License: see below */ 2 : 3 : #ifdef __cplusplus 4 : extern "C" { 5 : #endif 6 : 7 11 : static inline void utf8stringlines_init ( utf8stringlines_t *this_, const utf8stringview_t *lines_list, uint32_t line_length ) 8 : { 9 11 : assert( lines_list != NULL ); 10 11 : (*this_).remaining = *lines_list; 11 11 : (*this_).line_length = line_length; 12 11 : (*this_).next_is_end = false; 13 11 : (*this_).has_next = true; 14 11 : utf8stringlines_private_step_to_next( this_ ); 15 11 : } 16 : 17 11 : static inline void utf8stringlines_destroy ( utf8stringlines_t *this_ ) 18 : { 19 11 : } 20 : 21 46 : static inline bool utf8stringlines_has_next ( const utf8stringlines_t *this_ ) 22 : { 23 46 : return (*this_).has_next; 24 : } 25 : 26 29 : static inline utf8stringview_t utf8stringlines_next ( utf8stringlines_t *this_ ) 27 : { 28 29 : utf8stringview_t result = (*this_).next; 29 29 : utf8stringlines_private_step_to_next( this_ ); 30 29 : return result; 31 : } 32 : 33 40 : static inline void utf8stringlines_private_step_to_next ( utf8stringlines_t *this_ ) 34 : { 35 40 : if ( (*this_).next_is_end ) 36 : { 37 16 : (*this_).has_next = false; 38 16 : (*this_).next = UTF8STRINGVIEW_EMPTY; 39 : } 40 : else 41 : { 42 : /* search good line end */ 43 24 : uint_fast32_t line_end_pos = 0; 44 24 : uint_fast32_t a_good_pos = 0; 45 24 : uint_fast32_t codepoints = 0; 46 24 : bool force_next_line = false; /* a \n line break enforces a next line even if that is empty */ 47 24 : const char *start = utf8stringview_get_start( &((*this_).remaining) ); 48 24 : const size_t len = utf8stringview_get_length( &((*this_).remaining) ); 49 261 : for ( uint_fast32_t probe_idx = 0; ( probe_idx < len )&&( line_end_pos == 0 ); probe_idx ++ ) 50 : { 51 : /* note: char has platform dependencies, e.g.some_char==0xff has undefined results */ 52 237 : unsigned char probe = start[probe_idx] & 0xff; /* set higher bytes to 0 */ 53 : 54 : /* analyze the current character */ 55 237 : if (( 0xc0 & probe ) == 0x80 ) 56 : { 57 : /* This is not a first byte of an utf8-character byte sequence; check for asian line break possibility */ 58 61 : if ( probe_idx >= 2 ) 59 : { 60 57 : unsigned char prelast = start[probe_idx-2] & 0xff; /* set higher bytes to 0 */ 61 57 : unsigned char last = start[probe_idx-1] & 0xff; /* set higher bytes to 0 */ 62 57 : if ( utf8stringlines_private_is_ideographic_comma( this_, prelast, last, probe ) ) 63 : { 64 3 : a_good_pos = probe_idx + 1; 65 : } 66 : } 67 : } 68 : else 69 : { 70 176 : codepoints ++; 71 176 : if ( utf8stringlines_private_is_space( this_, probe ) ) 72 : { 73 : /* this can only happen for 1-byte code points */ 74 24 : a_good_pos = probe_idx + 1; 75 : } 76 : } 77 : 78 : /* evaluate if this is a good cutting point */ 79 237 : if ( probe == '\n' ) 80 : { 81 5 : line_end_pos = probe_idx + 1; 82 5 : force_next_line = true; 83 : } 84 232 : else if ( codepoints >= (*this_).line_length ) 85 : { 86 : /* we are beyond the limit */ 87 : /* take the best we have till now */ 88 32 : line_end_pos = a_good_pos; 89 : } 90 : } 91 : 92 : /* cut stringview at good line end position */ 93 24 : if ( line_end_pos != 0 ) 94 : { 95 14 : utf8stringview_t before = UTF8STRINGVIEW_EMPTY; 96 14 : utf8stringview_t after = UTF8STRINGVIEW_EMPTY; 97 : 98 14 : utf8error_t err1 = utf8stringview_init_region( &before, start, 0 /*start_idx*/, line_end_pos /*length*/ ); 99 14 : assert( err1 == UTF8ERROR_SUCCESS ); 100 : (void) err1; /* ok to ignore an error - should not happen */ 101 14 : utf8error_t err2 = utf8stringview_init_region( &after, start, line_end_pos /*start_idx*/, len-line_end_pos /*length*/ ); 102 14 : assert( err2 == UTF8ERROR_SUCCESS ); 103 : (void) err2; /* ok to ignore an error - should not happen */ 104 : 105 14 : (*this_).next_is_end = ( 0 == utf8stringview_get_length( &after ))&&( ! force_next_line ); 106 14 : (*this_).next = before; 107 14 : (*this_).remaining = after; 108 : } 109 : else 110 : { 111 : /* no suitable line end found */ 112 10 : (*this_).next_is_end = true; 113 10 : (*this_).next = (*this_).remaining; 114 10 : (*this_).remaining = UTF8STRINGVIEW_EMPTY; 115 : } 116 : } 117 40 : } 118 : 119 176 : static inline bool utf8stringlines_private_is_space( utf8stringlines_t *this_, unsigned char ascii ) 120 : { 121 : /* 0x0 - 0x19 are control chars like line break and tab, 0x20 is space, 0x7f is a control character */ 122 176 : return ( ascii <= 0x20 )||( ascii == 0x7f ); 123 : } 124 : 125 57 : static inline bool utf8stringlines_private_is_ideographic_comma( utf8stringlines_t *this_, 126 : unsigned char utf8_first, 127 : unsigned char utf8_second, 128 : unsigned char utf8_third ) 129 : { 130 : /* note: a full coverage of unicode is more complicated, */ 131 : /* see https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations */ 132 : /* this function only covers a small set of use cases: */ 133 : /* U+03000 IDEOGRAPHIC SPACE (maybe not needed?) */ 134 57 : const bool is_ideo_space = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x80 ); 135 : /* U+03002 IDEOGRAPHIC FULL STOP/COMMA (both seem frequently used) */ 136 57 : const bool is_ideo_comma = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x81 ); 137 57 : const bool is_ideo_fullstop = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x82 ); 138 : /* U+0FF0E FULLWIDTH FULL STOP/COMMA (both seem frequently used) */ 139 57 : const bool is_full_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8c ); 140 57 : const bool is_full_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8e ); 141 : /* U+0FF61 HALFWIDTH IDEOGRAPHIC FULL STOP/COMMA (maybe not needed?)*/ 142 57 : const bool is_half_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa4 ); 143 57 : const bool is_half_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa1 ); 144 57 : return is_ideo_space || is_ideo_comma || is_ideo_fullstop || is_full_comma || is_full_fullstop || is_half_comma || is_half_fullstop; 145 : } 146 : 147 : #ifdef __cplusplus 148 : } 149 : #endif 150 : 151 : 152 : /* 153 : Copyright 2025-2025 Andreas Warnke 154 : 155 : Licensed under the Apache License, Version 2.0 (the "License"); 156 : you may not use this file except in compliance with the License. 157 : You may obtain a copy of the License at 158 : 159 : http://www.apache.org/licenses/LICENSE-2.0 160 : 161 : Unless required by applicable law or agreed to in writing, software 162 : distributed under the License is distributed on an "AS IS" BASIS, 163 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 164 : See the License for the specific language governing permissions and 165 : limitations under the License. 166 : */