Line data Source code
1 : /* File: utf8codepoint.inl; Copyright and License: see below */ 2 : 3 : #include <stdint.h> 4 : #include <inttypes.h> 5 : #include <string.h> 6 : #include <assert.h> 7 : 8 : #ifdef __cplusplus 9 : extern "C" { 10 : #endif 11 : 12 : /*! 13 : * \enum utf8codepoint_enum 14 : * \private 15 : */ 16 : /* enumeration for invalid code points */ 17 : enum utf8codepoint_enum {UTF8CODEPOINT_INVALID_LEN=0,}; 18 : 19 4133 : static inline utf8codepoint_t utf8codepoint( uint32_t code_point ) { 20 : utf8codepoint_t result; 21 : /* for balanced performance between standard and worst cases, */ 22 : /* this_ is implemented as asymmetric decision tree: */ 23 : /* in the best case, we have 2 comparisons, in the worst case 3 */ 24 4133 : result.byte_length = 25 : ( code_point <= 0x7ff ) 26 : ? ( 27 : ( code_point <= 0x7f ) ? 1 : 2 28 : ) 29 4133 : : ( 30 : ( code_point <= 0x10ffff ) 31 : ? ( 32 : ( code_point <= 0xffff ) ? 3 : 4 33 : ) 34 : : UTF8CODEPOINT_INVALID_LEN 35 : ); 36 4133 : result.code_point = code_point; 37 4133 : return result; 38 : } 39 : 40 3879 : static inline utf8codepoint_t utf8codepoint_init( const char *that, unsigned int max_size ) { 41 3879 : utf8codepoint_t result = { UTF8CODEPOINT_INVALID_LEN, 0x0, }; 42 3879 : if (( that != NULL )&&( max_size > 0 )) 43 : { 44 3877 : const unsigned char firstByte = (const unsigned char) (that[0]); 45 3877 : if (( 0x80 & firstByte ) == 0x00 ) 46 : { 47 : /* 7-bit ASCII character */ 48 3856 : result.byte_length = 1; 49 3856 : result.code_point = firstByte; 50 : } 51 21 : else if ( firstByte < 0xe0 ) 52 : { 53 8 : if ( max_size >= 2 ) 54 : { 55 7 : const unsigned char secondByte = (const unsigned char) (that[1]); 56 7 : if (( ( 0xe0 & firstByte ) == 0xc0 ) && ( ( 0xc0 & secondByte ) == 0x80 )) 57 : { 58 : /* first and second byte are valid */ 59 3 : result.byte_length = 2; 60 3 : result.code_point = (((uint32_t)(firstByte & 0x1f))<<6) 61 3 : |(secondByte & 0x3f); 62 : } 63 : } 64 : } 65 13 : else if ( firstByte < 0xf0 ) 66 : { 67 7 : if ( max_size >= 3 ) 68 : { 69 7 : const unsigned char secondByte = (const unsigned char) (that[1]); 70 7 : const unsigned char thirdByte = (const unsigned char) (that[2]); 71 7 : if (( ( 0xc0 & secondByte ) == 0x80 ) && ( ( 0xc0 & thirdByte ) == 0x80 )) 72 : { 73 : /* second and third bytes are valid */ 74 7 : result.byte_length = 3; 75 7 : result.code_point = (((uint32_t)(firstByte & 0x0f))<<12) 76 7 : |(((uint32_t)(secondByte & 0x3f))<<6) 77 7 : |(thirdByte & 0x3f); 78 : } 79 : } 80 : } 81 6 : else if ( firstByte < 0xf8 ) 82 : { 83 6 : if ( max_size >= 4 ) 84 : { 85 4 : const unsigned char secondByte = (const unsigned char) (that[1]); 86 4 : const unsigned char thirdByte = (const unsigned char) (that[2]); 87 4 : const unsigned char fourthByte = (const unsigned char) (that[3]); 88 4 : if (( ( 0xc0 & secondByte ) == 0x80 ) 89 4 : && ( ( 0xc0 & thirdByte ) == 0x80 ) 90 4 : && ( ( 0xc0 & fourthByte ) == 0x80 )) 91 : { 92 : /* second, third and fourth bytes are valid */ 93 3 : result.byte_length = 4; 94 3 : result.code_point = (((uint32_t)(firstByte & 0x07))<<18) 95 3 : |(((uint32_t)(secondByte & 0x3f))<<12) 96 3 : |(((uint32_t)(thirdByte & 0x3f))<<6) 97 3 : |(fourthByte & 0x3f); 98 3 : if ( result.code_point > 0x10ffff ) 99 : { 100 : /* invalid */ 101 1 : result.byte_length = UTF8CODEPOINT_INVALID_LEN; 102 : } 103 : } 104 : } 105 : } 106 : } 107 3879 : return result; 108 : } 109 : 110 7720 : static inline uint32_t utf8codepoint_get_char( const utf8codepoint_t this_ ) { 111 7720 : return this_.code_point; 112 : } 113 : 114 7737 : static inline unsigned int utf8codepoint_get_length( const utf8codepoint_t this_ ) { 115 7737 : return this_.byte_length; 116 : } 117 : 118 3847 : static inline utf8codepointseq_t utf8codepoint_get_utf8( const utf8codepoint_t this_ ) { 119 : utf8codepointseq_t result; 120 3847 : const uint32_t code_point = this_.code_point; 121 : 122 3847 : if ( code_point <= 0x7ff ) 123 : { 124 3843 : if ( code_point <= 0x7f ) 125 : { 126 3842 : result.seq[0] = code_point; 127 3842 : result.seq[1] = '\0'; 128 3842 : result.seq[2] = '\0'; 129 3842 : result.seq[3] = '\0'; 130 3842 : assert( this_.byte_length == 1 ); 131 : } 132 : else 133 : { 134 1 : result.seq[0] = (0xc0 | (code_point>>6)); 135 1 : result.seq[1] = (0x80 | (code_point&0x3f)); 136 1 : result.seq[2] = '\0'; 137 1 : result.seq[3] = '\0'; 138 1 : assert( this_.byte_length == 2 ); 139 : } 140 : } 141 : else 142 : { 143 4 : if ( code_point <= 0x10ffff ) 144 : { 145 3 : if ( code_point <= 0xffff ) 146 : { 147 2 : result.seq[0] = (0xe0 | (code_point>>12)); 148 2 : result.seq[1] = (0x80 | ((code_point>>6)&0x3f)); 149 2 : result.seq[2] = (0x80 | (code_point&0x3f)); 150 2 : result.seq[3] = '\0'; 151 2 : assert( this_.byte_length == 3 ); 152 : } 153 : else 154 : { 155 1 : result.seq[0] = (0xf0 | (code_point>>18)); 156 1 : result.seq[1] = (0x80 | ((code_point>>12)&0x3f)); 157 1 : result.seq[2] = (0x80 | ((code_point>>6)&0x3f)); 158 1 : result.seq[3] = (0x80 | (code_point&0x3f)); 159 1 : assert( this_.byte_length == 4 ); 160 : } 161 : } 162 : else 163 : { 164 : /* UTF8CODEPOINT_INVALID_LEN */ 165 1 : result.seq[0] = '\0'; 166 1 : result.seq[1] = '\0'; 167 1 : result.seq[2] = '\0'; 168 1 : result.seq[3] = '\0'; 169 1 : assert( this_.byte_length == 0 ); 170 : } 171 : } 172 : 173 3847 : return result; 174 : } 175 : 176 8009 : static inline int utf8codepoint_is_valid( const utf8codepoint_t this_ ) { 177 8009 : return ( UTF8CODEPOINT_INVALID_LEN != this_.byte_length ) ? 1 : 0; 178 : } 179 : 180 22 : static inline int utf8codepoint_is_unicode( const utf8codepoint_t this_ ) { 181 22 : int result = 0; 182 22 : if ( this_.byte_length != UTF8CODEPOINT_INVALID_LEN ) { 183 20 : if ( this_.code_point < 0xd800 ) { 184 2 : result = 1; 185 : } 186 18 : else if (( this_.code_point > 0xdfff ) && ( this_.code_point < 0xfdd0 )) { 187 2 : result = 1; 188 : } 189 16 : else if (( this_.code_point > 0xfdef ) && ( this_.code_point < 0x110000 )) { 190 12 : if (( this_.code_point & 0x00fffe ) != 0x00fffe ) { 191 6 : result = 1; 192 : } 193 : } 194 : } 195 22 : return result; 196 : } 197 : 198 : #ifdef __cplusplus 199 : } 200 : #endif 201 : 202 : 203 : /* 204 : * Copyright 2012-2024 Andreas Warnke 205 : * 206 : * Licensed under the Apache License, Version 2.0 (the "License"); 207 : * you may not use this file except in compliance with the License. 208 : * You may obtain a copy of the License at 209 : * 210 : * http://www.apache.org/licenses/LICENSE-2.0 211 : * 212 : * Unless required by applicable law or agreed to in writing, software 213 : * distributed under the License is distributed on an "AS IS" BASIS, 214 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 215 : * See the License for the specific language governing permissions and 216 : * limitations under the License. 217 : */