Line data Source code
1 : /* File: utf8stringlines.inl; Copyright and License: see below */
2 :
3 : #ifdef __cplusplus
4 : extern "C" {
5 : #endif
6 :
7 11 : static inline void utf8stringlines_init ( utf8stringlines_t *this_, const utf8stringview_t *lines_list, uint32_t line_length )
8 : {
9 11 : assert( lines_list != NULL );
10 11 : (*this_).remaining = *lines_list;
11 11 : (*this_).line_length = line_length;
12 11 : (*this_).next_is_end = false;
13 11 : (*this_).has_next = true;
14 11 : utf8stringlines_private_step_to_next( this_ );
15 11 : }
16 :
17 11 : static inline void utf8stringlines_destroy ( utf8stringlines_t *this_ )
18 : {
19 11 : }
20 :
21 46 : static inline bool utf8stringlines_has_next ( const utf8stringlines_t *this_ )
22 : {
23 46 : return (*this_).has_next;
24 : }
25 :
26 29 : static inline utf8stringview_t utf8stringlines_next ( utf8stringlines_t *this_ )
27 : {
28 29 : utf8stringview_t result = (*this_).next;
29 29 : utf8stringlines_private_step_to_next( this_ );
30 29 : return result;
31 : }
32 :
33 40 : static inline void utf8stringlines_private_step_to_next ( utf8stringlines_t *this_ )
34 : {
35 40 : if ( (*this_).next_is_end )
36 : {
37 16 : (*this_).has_next = false;
38 16 : (*this_).next = UTF8STRINGVIEW_EMPTY;
39 : }
40 : else
41 : {
42 : /* search good line end */
43 24 : uint_fast32_t line_end_pos = 0;
44 24 : uint_fast32_t a_good_pos = 0;
45 24 : uint_fast32_t codepoints = 0;
46 24 : bool force_next_line = false; /* a \n line break enforces a next line even if that is empty */
47 24 : const char *start = utf8stringview_get_start( &((*this_).remaining) );
48 24 : const size_t len = utf8stringview_get_length( &((*this_).remaining) );
49 261 : for ( uint_fast32_t probe_idx = 0; ( probe_idx < len )&&( line_end_pos == 0 ); probe_idx ++ )
50 : {
51 : /* note: char has platform dependencies, e.g.some_char==0xff has undefined results */
52 237 : unsigned char probe = start[probe_idx] & 0xff; /* set higher bytes to 0 */
53 :
54 : /* analyze the current character */
55 237 : if (( 0xc0 & probe ) == 0x80 )
56 : {
57 : /* This is not a first byte of an utf8-character byte sequence; check for asian line break possibility */
58 61 : if ( probe_idx >= 2 )
59 : {
60 57 : unsigned char prelast = start[probe_idx-2] & 0xff; /* set higher bytes to 0 */
61 57 : unsigned char last = start[probe_idx-1] & 0xff; /* set higher bytes to 0 */
62 57 : if ( utf8stringlines_private_is_ideographic_comma( this_, prelast, last, probe ) )
63 : {
64 3 : a_good_pos = probe_idx + 1;
65 : }
66 : }
67 : }
68 : else
69 : {
70 176 : codepoints ++;
71 176 : if ( utf8stringlines_private_is_space( this_, probe ) )
72 : {
73 : /* this can only happen for 1-byte code points */
74 24 : a_good_pos = probe_idx + 1;
75 : }
76 : }
77 :
78 : /* evaluate if this is a good cutting point */
79 237 : if ( probe == '\n' )
80 : {
81 5 : line_end_pos = probe_idx + 1;
82 5 : force_next_line = true;
83 : }
84 232 : else if ( codepoints >= (*this_).line_length )
85 : {
86 : /* we are beyond the limit */
87 : /* take the best we have till now */
88 32 : line_end_pos = a_good_pos;
89 : }
90 : }
91 :
92 : /* cut stringview at good line end position */
93 24 : if ( line_end_pos != 0 )
94 : {
95 14 : utf8stringview_t before = UTF8STRINGVIEW_EMPTY;
96 14 : utf8stringview_t after = UTF8STRINGVIEW_EMPTY;
97 :
98 14 : utf8error_t err1 = utf8stringview_init_region( &before, start, 0 /*start_idx*/, line_end_pos /*length*/ );
99 14 : assert( err1 == UTF8ERROR_SUCCESS );
100 : (void) err1; /* ok to ignore an error - should not happen */
101 14 : utf8error_t err2 = utf8stringview_init_region( &after, start, line_end_pos /*start_idx*/, len-line_end_pos /*length*/ );
102 14 : assert( err2 == UTF8ERROR_SUCCESS );
103 : (void) err2; /* ok to ignore an error - should not happen */
104 :
105 14 : (*this_).next_is_end = ( 0 == utf8stringview_get_length( &after ))&&( ! force_next_line );
106 14 : (*this_).next = before;
107 14 : (*this_).remaining = after;
108 : }
109 : else
110 : {
111 : /* no suitable line end found */
112 10 : (*this_).next_is_end = true;
113 10 : (*this_).next = (*this_).remaining;
114 10 : (*this_).remaining = UTF8STRINGVIEW_EMPTY;
115 : }
116 : }
117 40 : }
118 :
119 176 : static inline bool utf8stringlines_private_is_space( utf8stringlines_t *this_, unsigned char ascii )
120 : {
121 : /* 0x0 - 0x19 are control chars like line break and tab, 0x20 is space, 0x7f is a control character */
122 176 : return ( ascii <= 0x20 )||( ascii == 0x7f );
123 : }
124 :
125 57 : static inline bool utf8stringlines_private_is_ideographic_comma( utf8stringlines_t *this_,
126 : unsigned char utf8_first,
127 : unsigned char utf8_second,
128 : unsigned char utf8_third )
129 : {
130 : /* note: a full coverage of unicode is more complicated, */
131 : /* see https://stackoverflow.com/questions/9506869/are-there-character-collections-for-all-international-full-stop-punctuations */
132 : /* this function only covers a small set of use cases: */
133 : /* U+03000 IDEOGRAPHIC SPACE (maybe not needed?) */
134 57 : const bool is_ideo_space = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x80 );
135 : /* U+03002 IDEOGRAPHIC FULL STOP/COMMA (both seem frequently used) */
136 57 : const bool is_ideo_comma = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x81 );
137 57 : const bool is_ideo_fullstop = ( utf8_first == 0xe3 )&&( utf8_second == 0x80 )&&( utf8_third == 0x82 );
138 : /* U+0FF0E FULLWIDTH FULL STOP/COMMA (both seem frequently used) */
139 57 : const bool is_full_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8c );
140 57 : const bool is_full_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbc )&&( utf8_third == 0x8e );
141 : /* U+0FF61 HALFWIDTH IDEOGRAPHIC FULL STOP/COMMA (maybe not needed?)*/
142 57 : const bool is_half_comma = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa4 );
143 57 : const bool is_half_fullstop = ( utf8_first == 0xef )&&( utf8_second == 0xbd )&&( utf8_third == 0xa1 );
144 57 : return is_ideo_space || is_ideo_comma || is_ideo_fullstop || is_full_comma || is_full_fullstop || is_half_comma || is_half_fullstop;
145 : }
146 :
147 : #ifdef __cplusplus
148 : }
149 : #endif
150 :
151 :
152 : /*
153 : Copyright 2025-2025 Andreas Warnke
154 :
155 : Licensed under the Apache License, Version 2.0 (the "License");
156 : you may not use this file except in compliance with the License.
157 : You may obtain a copy of the License at
158 :
159 : http://www.apache.org/licenses/LICENSE-2.0
160 :
161 : Unless required by applicable law or agreed to in writing, software
162 : distributed under the License is distributed on an "AS IS" BASIS,
163 : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
164 : See the License for the specific language governing permissions and
165 : limitations under the License.
166 : */
|