Line data Source code
1 : /* File: utf8codepoint.inl; Copyright and License: see below */
2 :
3 : #include <stdint.h>
4 : #include <inttypes.h>
5 : #include <string.h>
6 : #include <assert.h>
7 :
8 : #ifdef __cplusplus
9 : extern "C" {
10 : #endif
11 :
/*!
 *  \enum utf8codepoint_enum
 *  \private
 *
 *  Constants used internally by the utf8codepoint functions.
 */
enum utf8codepoint_enum {
    /* byte length that marks a code point as invalid */
    UTF8CODEPOINT_INVALID_LEN = 0,
};
18 :
19 4133 : static inline utf8codepoint_t utf8codepoint( uint32_t code_point ) {
20 : utf8codepoint_t result;
21 : /* for balanced performance between standard and worst cases, */
22 : /* this_ is implemented as asymmetric decision tree: */
23 : /* in the best case, we have 2 comparisons, in the worst case 3 */
24 4133 : result.byte_length =
25 : ( code_point <= 0x7ff )
26 : ? (
27 : ( code_point <= 0x7f ) ? 1 : 2
28 : )
29 4133 : : (
30 : ( code_point <= 0x10ffff )
31 : ? (
32 : ( code_point <= 0xffff ) ? 3 : 4
33 : )
34 : : UTF8CODEPOINT_INVALID_LEN
35 : );
36 4133 : result.code_point = code_point;
37 4133 : return result;
38 : }
39 :
40 3879 : static inline utf8codepoint_t utf8codepoint_new( const char *that, unsigned int max_size ) {
41 3879 : utf8codepoint_t result = { UTF8CODEPOINT_INVALID_LEN, 0x0, };
42 3879 : if (( that != NULL )&&( max_size > 0 ))
43 : {
44 3877 : const unsigned char firstByte = (const unsigned char) (that[0]);
45 3877 : if (( 0x80 & firstByte ) == 0x00 )
46 : {
47 : /* 7-bit ASCII character */
48 3856 : result.byte_length = 1;
49 3856 : result.code_point = firstByte;
50 : }
51 21 : else if ( firstByte < 0xe0 )
52 : {
53 8 : if ( max_size >= 2 )
54 : {
55 7 : const unsigned char secondByte = (const unsigned char) (that[1]);
56 7 : if (( ( 0xe0 & firstByte ) == 0xc0 ) && ( ( 0xc0 & secondByte ) == 0x80 ))
57 : {
58 : /* first and second byte are valid */
59 3 : result.byte_length = 2;
60 3 : result.code_point = (((uint32_t)(firstByte & 0x1f))<<6)
61 3 : |(secondByte & 0x3f);
62 : }
63 : }
64 : }
65 13 : else if ( firstByte < 0xf0 )
66 : {
67 7 : if ( max_size >= 3 )
68 : {
69 7 : const unsigned char secondByte = (const unsigned char) (that[1]);
70 7 : const unsigned char thirdByte = (const unsigned char) (that[2]);
71 7 : if (( ( 0xc0 & secondByte ) == 0x80 ) && ( ( 0xc0 & thirdByte ) == 0x80 ))
72 : {
73 : /* second and third bytes are valid */
74 7 : result.byte_length = 3;
75 7 : result.code_point = (((uint32_t)(firstByte & 0x0f))<<12)
76 7 : |(((uint32_t)(secondByte & 0x3f))<<6)
77 7 : |(thirdByte & 0x3f);
78 : }
79 : }
80 : }
81 6 : else if ( firstByte < 0xf8 )
82 : {
83 6 : if ( max_size >= 4 )
84 : {
85 4 : const unsigned char secondByte = (const unsigned char) (that[1]);
86 4 : const unsigned char thirdByte = (const unsigned char) (that[2]);
87 4 : const unsigned char fourthByte = (const unsigned char) (that[3]);
88 4 : if (( ( 0xc0 & secondByte ) == 0x80 )
89 4 : && ( ( 0xc0 & thirdByte ) == 0x80 )
90 4 : && ( ( 0xc0 & fourthByte ) == 0x80 ))
91 : {
92 : /* second, third and fourth bytes are valid */
93 3 : result.byte_length = 4;
94 3 : result.code_point = (((uint32_t)(firstByte & 0x07))<<18)
95 3 : |(((uint32_t)(secondByte & 0x3f))<<12)
96 3 : |(((uint32_t)(thirdByte & 0x3f))<<6)
97 3 : |(fourthByte & 0x3f);
98 3 : if ( result.code_point > 0x10ffff )
99 : {
100 : /* invalid */
101 1 : result.byte_length = UTF8CODEPOINT_INVALID_LEN;
102 : }
103 : }
104 : }
105 : }
106 : }
107 3879 : return result;
108 : }
109 :
110 7720 : static inline uint32_t utf8codepoint_get_char( const utf8codepoint_t *this_ ) {
111 7720 : return (*this_).code_point;
112 : }
113 :
114 7737 : static inline unsigned int utf8codepoint_get_length( const utf8codepoint_t *this_ ) {
115 7737 : return (*this_).byte_length;
116 : }
117 :
118 3847 : static inline utf8codepointseq_t utf8codepoint_get_utf8( const utf8codepoint_t *this_ ) {
119 : utf8codepointseq_t result;
120 3847 : const uint32_t code_point = (*this_).code_point;
121 :
122 3847 : if ( code_point <= 0x7ff )
123 : {
124 3843 : if ( code_point <= 0x7f )
125 : {
126 3842 : result.seq[0] = code_point;
127 3842 : result.seq[1] = '\0';
128 3842 : result.seq[2] = '\0';
129 3842 : result.seq[3] = '\0';
130 3842 : assert( (*this_).byte_length == 1 );
131 : }
132 : else
133 : {
134 1 : result.seq[0] = (0xc0 | (code_point>>6));
135 1 : result.seq[1] = (0x80 | (code_point&0x3f));
136 1 : result.seq[2] = '\0';
137 1 : result.seq[3] = '\0';
138 1 : assert( (*this_).byte_length == 2 );
139 : }
140 : }
141 : else
142 : {
143 4 : if ( code_point <= 0x10ffff )
144 : {
145 3 : if ( code_point <= 0xffff )
146 : {
147 2 : result.seq[0] = (0xe0 | (code_point>>12));
148 2 : result.seq[1] = (0x80 | ((code_point>>6)&0x3f));
149 2 : result.seq[2] = (0x80 | (code_point&0x3f));
150 2 : result.seq[3] = '\0';
151 2 : assert( (*this_).byte_length == 3 );
152 : }
153 : else
154 : {
155 1 : result.seq[0] = (0xf0 | (code_point>>18));
156 1 : result.seq[1] = (0x80 | ((code_point>>12)&0x3f));
157 1 : result.seq[2] = (0x80 | ((code_point>>6)&0x3f));
158 1 : result.seq[3] = (0x80 | (code_point&0x3f));
159 1 : assert( (*this_).byte_length == 4 );
160 : }
161 : }
162 : else
163 : {
164 : /* UTF8CODEPOINT_INVALID_LEN */
165 1 : result.seq[0] = '\0';
166 1 : result.seq[1] = '\0';
167 1 : result.seq[2] = '\0';
168 1 : result.seq[3] = '\0';
169 1 : assert( (*this_).byte_length == 0 );
170 : }
171 : }
172 :
173 3847 : return result;
174 : }
175 :
176 8009 : static inline int utf8codepoint_is_valid( const utf8codepoint_t *this_ ) {
177 8009 : return ( UTF8CODEPOINT_INVALID_LEN != (*this_).byte_length ) ? 1 : 0;
178 : }
179 :
180 22 : static inline int utf8codepoint_is_unicode( const utf8codepoint_t *this_ ) {
181 22 : int result = 0;
182 22 : if ( (*this_).byte_length != UTF8CODEPOINT_INVALID_LEN ) {
183 20 : if ( (*this_).code_point < 0xd800 ) {
184 2 : result = 1;
185 : }
186 18 : else if (( (*this_).code_point > 0xdfff ) && ( (*this_).code_point < 0xfdd0 )) {
187 2 : result = 1;
188 : }
189 16 : else if (( (*this_).code_point > 0xfdef ) && ( (*this_).code_point < 0x110000 )) {
190 12 : if (( (*this_).code_point & 0x00fffe ) != 0x00fffe ) {
191 6 : result = 1;
192 : }
193 : }
194 : }
195 22 : return result;
196 : }
197 :
198 : #ifdef __cplusplus
199 : }
200 : #endif
201 :
202 :
203 : /*
204 : * Copyright 2012-2025 Andreas Warnke
205 : *
206 : * Licensed under the Apache License, Version 2.0 (the "License");
207 : * you may not use this file except in compliance with the License.
208 : * You may obtain a copy of the License at
209 : *
210 : * http://www.apache.org/licenses/LICENSE-2.0
211 : *
212 : * Unless required by applicable law or agreed to in writing, software
213 : * distributed under the License is distributed on an "AS IS" BASIS,
214 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
215 : * See the License for the specific language governing permissions and
216 : * limitations under the License.
217 : */
|