LCOV - code coverage report
Current view: top level - u8stream/include/utf8stringbuf - utf8codepoint.inl (source / functions) Hit Total Coverage
Test: crystal-facet-uml_v1.61.0_covts Lines: 93 93 100.0 %
Date: 2024-10-26 21:44:38 Functions: 7 7 100.0 %

          Line data    Source code
       1             : /* File: utf8codepoint.inl; Copyright and License: see below */
       2             : 
       3             : #include <stdint.h>
       4             : #include <inttypes.h>
       5             : #include <string.h>
       6             : #include <assert.h>
       7             : 
       8             : #ifdef __cplusplus
       9             : extern "C" {
      10             : #endif
      11             : 
      12             : /*!
      13             :  *  \enum utf8codepoint_enum
      14             :  *  \private
      15             :  */
      16             : /*  enumeration holding the byte_length sentinel (0) that marks an invalid code point */
      17             : enum utf8codepoint_enum {UTF8CODEPOINT_INVALID_LEN=0,};
      18             : 
      19        4133 : static inline utf8codepoint_t utf8codepoint( uint32_t code_point ) {
                           /* Builds a utf8codepoint_t from a raw code point value. */
                           /* byte_length becomes 1, 2, 3 or 4 for values up to 0x7f, 0x7ff, 0xffff and 0x10ffff; */
                           /* values above 0x10ffff get UTF8CODEPOINT_INVALID_LEN. */
                           /* code_point is stored unchanged in every case, also for out-of-range values. */
                           /* No check for the range 0xd800..0xdfff is done here; see utf8codepoint_is_unicode. */
      20             :     utf8codepoint_t result;
      21             :     /* for balanced performance between standard and worst cases, */
      22             :     /* this is implemented as an asymmetric decision tree: */
      23             :     /* in the best case, we have 2 comparisons, in the worst case 3 */
      24        4133 :     result.byte_length =
      25             :             ( code_point <= 0x7ff )
      26             :             ? (
      27             :                     ( code_point <= 0x7f ) ? 1 : 2
      28             :             )
      29        4133 :             : (
      30             :                     ( code_point <= 0x10ffff )
      31             :                     ? (
      32             :                             ( code_point <= 0xffff ) ? 3 : 4
      33             :                     )
      34             :                     : UTF8CODEPOINT_INVALID_LEN
      35             :             );
      36        4133 :     result.code_point = code_point;
      37        4133 :     return result;
      38             : }
      39             : 
      40        3879 : static inline utf8codepoint_t utf8codepoint_init( const char *that, unsigned int max_size ) {
                           /* Decodes the first UTF-8 sequence found at that, reading at most max_size bytes. */
                           /* that:     start of the byte sequence; NULL is tolerated and yields the initial (invalid) result. */
                           /* max_size: number of bytes that may be read from that; 0 yields the initial (invalid) result. */
                           /* On success, byte_length is 1..4 and code_point holds the decoded value. */
                           /* The result keeps its initial value when the lead byte is 0x80..0xbf or >= 0xf8, */
                           /* when max_size is smaller than the length announced by the lead byte, */
                           /* or when a following byte does not match the bit pattern 10xxxxxx. */
                           /* A zero byte at that[0] is decoded like any other 7-bit value: byte_length 1, code_point 0. */
                           /* NOTE(review): overlong forms are not rejected. E.g. the bytes 0xc0 0x80 produce byte_length 2 */
                           /* with code_point 0, which contradicts the assert( this_.byte_length == 1 ) in */
                           /* utf8codepoint_get_utf8. Values in 0xd800..0xdfff are accepted in the 3-byte branch as well. */
                           /* NOTE(review): the positional initializer below presumably sets byte_length first and */
                           /* code_point second; the struct definition is not visible here - confirm. */
      41        3879 :     utf8codepoint_t result = { UTF8CODEPOINT_INVALID_LEN, 0x0, };
      42        3879 :     if (( that != NULL )&&( max_size > 0 ))
      43             :     {
                               /* cast to unsigned so that the comparisons below do not depend on the signedness of char */
      44        3877 :         const unsigned char firstByte = (const unsigned char) (that[0]);
      45        3877 :         if (( 0x80 & firstByte ) == 0x00 )
      46             :         {
      47             :             /* 7-bit ASCII character */
      48        3856 :             result.byte_length = 1;
      49        3856 :             result.code_point = firstByte;
      50             :         }
      51          21 :         else if ( firstByte < 0xe0 )
      52             :         {
                                   /* lead byte is in 0x80..0xdf; only 0xc0..0xdf pass the mask test below, */
                                   /* so a stray continuation byte (0x80..0xbf) leaves the result invalid */
      53           8 :             if ( max_size >= 2 )
      54             :             {
      55           7 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      56           7 :                 if (( ( 0xe0 & firstByte ) == 0xc0 ) && ( ( 0xc0 & secondByte ) == 0x80 ))
      57             :                 {
      58             :                     /* first and second byte are valid */
      59           3 :                     result.byte_length = 2;
      60           3 :                     result.code_point = (((uint32_t)(firstByte & 0x1f))<<6)
      61           3 :                                         |(secondByte & 0x3f);
      62             :                 }
      63             :             }
      64             :         }
      65          13 :         else if ( firstByte < 0xf0 )
      66             :         {
                                   /* lead byte is in 0xe0..0xef: 4 payload bits, followed by two bytes of 6 payload bits each */
      67           7 :             if ( max_size >= 3 )
      68             :             {
      69           7 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      70           7 :                 const unsigned char thirdByte = (const unsigned char) (that[2]);
      71           7 :                 if (( ( 0xc0 & secondByte ) == 0x80 ) && ( ( 0xc0 & thirdByte ) == 0x80 ))
      72             :                 {
      73             :                     /* second and third bytes are valid */
      74           7 :                     result.byte_length = 3;
      75           7 :                     result.code_point = (((uint32_t)(firstByte & 0x0f))<<12)
      76           7 :                                         |(((uint32_t)(secondByte & 0x3f))<<6)
      77           7 :                                         |(thirdByte & 0x3f);
      78             :                 }
      79             :             }
      80             :         }
      81           6 :         else if ( firstByte < 0xf8 )
      82             :         {
                                   /* lead byte is in 0xf0..0xf7: 3 payload bits, followed by three bytes of 6 payload bits each */
      83           6 :             if ( max_size >= 4 )
      84             :             {
      85           4 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      86           4 :                 const unsigned char thirdByte = (const unsigned char) (that[2]);
      87           4 :                 const unsigned char fourthByte = (const unsigned char) (that[3]);
      88           4 :                 if (( ( 0xc0 & secondByte ) == 0x80 )
      89           4 :                    && ( ( 0xc0 & thirdByte ) == 0x80 )
      90           4 :                    && ( ( 0xc0 & fourthByte ) == 0x80 ))
      91             :                 {
      92             :                     /* second, third and fourth bytes are valid */
      93           3 :                     result.byte_length = 4;
      94           3 :                     result.code_point = (((uint32_t)(firstByte & 0x07))<<18)
      95           3 :                                         |(((uint32_t)(secondByte & 0x3f))<<12)
      96           3 :                                         |(((uint32_t)(thirdByte & 0x3f))<<6)
      97           3 :                                         |(fourthByte & 0x3f);
      98           3 :                     if ( result.code_point > 0x10ffff )
      99             :                     {
     100             :                         /* invalid */
                                               /* only byte_length is reset; code_point keeps the decoded out-of-range value */
     101           1 :                         result.byte_length = UTF8CODEPOINT_INVALID_LEN;
     102             :                     }
     103             :                 }
     104             :             }
     105             :         }
                               /* lead bytes 0xf8..0xff match no branch: the result stays at its initial value */
     106             :     }
     107        3879 :     return result;
     108             : }
     109             : 
     110        7720 : static inline uint32_t utf8codepoint_get_char( const utf8codepoint_t this_ ) {
                           /* Returns the stored code_point field as is; byte_length is not consulted, */
                           /* so callers that need to know whether the value is valid have to check that separately. */
     111        7720 :     return this_.code_point;
     112             : }
     113             : 
     114        7737 : static inline unsigned int utf8codepoint_get_length( const utf8codepoint_t this_ ) {
                           /* Returns the stored byte_length field; for an invalid code point */
                           /* this is UTF8CODEPOINT_INVALID_LEN, which is 0. */
     115        7737 :     return this_.byte_length;
     116             : }
     117             : 
     118        3847 : static inline utf8codepointseq_t utf8codepoint_get_utf8( const utf8codepoint_t this_ ) {
                           /* Encodes this_.code_point as UTF-8 into seq[0..3] of the returned utf8codepointseq_t. */
                           /* The sequence length is chosen from the value of code_point alone; */
                           /* this_.byte_length is only cross-checked by the asserts (no effect when NDEBUG is defined). */
                           /* Unused trailing entries of seq[0..3] are set to '\0'. A 4-byte sequence fills all four entries, */
                           /* so there is no terminating zero within seq[0..3] in that case. */
                           /* A code_point above 0x10ffff yields four zero bytes. */
     119             :     utf8codepointseq_t result;
     120        3847 :     const uint32_t code_point = this_.code_point;
     121             : 
     122        3847 :     if ( code_point <= 0x7ff )
     123             :     {
     124        3843 :         if ( code_point <= 0x7f )
     125             :         {
                                   /* 1 byte: the code point value itself */
     126        3842 :             result.seq[0] = code_point;
     127        3842 :             result.seq[1] = '\0';
     128        3842 :             result.seq[2] = '\0';
     129        3842 :             result.seq[3] = '\0';
     130        3842 :             assert( this_.byte_length == 1 );
     131             :         }
     132             :         else
     133             :         {
                                   /* 2 bytes: 110xxxxx 10xxxxxx */
     134           1 :             result.seq[0] = (0xc0 | (code_point>>6));
     135           1 :             result.seq[1] = (0x80 | (code_point&0x3f));
     136           1 :             result.seq[2] = '\0';
     137           1 :             result.seq[3] = '\0';
     138           1 :             assert( this_.byte_length == 2 );
     139             :         }
     140             :     }
     141             :     else
     142             :     {
     143           4 :         if ( code_point <= 0x10ffff )
     144             :         {
     145           3 :             if ( code_point <= 0xffff )
     146             :             {
                                       /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
     147           2 :                 result.seq[0] = (0xe0 | (code_point>>12));
     148           2 :                 result.seq[1] = (0x80 | ((code_point>>6)&0x3f));
     149           2 :                 result.seq[2] = (0x80 | (code_point&0x3f));
     150           2 :                 result.seq[3] = '\0';
     151           2 :                 assert( this_.byte_length == 3 );
     152             :             }
     153             :             else
     154             :             {
                                       /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
     155           1 :                 result.seq[0] = (0xf0 | (code_point>>18));
     156           1 :                 result.seq[1] = (0x80 | ((code_point>>12)&0x3f));
     157           1 :                 result.seq[2] = (0x80 | ((code_point>>6)&0x3f));
     158           1 :                 result.seq[3] = (0x80 | (code_point&0x3f));
     159           1 :                 assert( this_.byte_length == 4 );
     160             :             }
     161             :         }
     162             :         else
     163             :         {
     164             :             /* UTF8CODEPOINT_INVALID_LEN */
     165           1 :             result.seq[0] = '\0';
     166           1 :             result.seq[1] = '\0';
     167           1 :             result.seq[2] = '\0';
     168           1 :             result.seq[3] = '\0';
     169           1 :             assert( this_.byte_length == 0 );
     170             :         }
     171             :     }
     172             : 
     173        3847 :     return result;
     174             : }
     175             : 
     176        8009 : static inline int utf8codepoint_is_valid( const utf8codepoint_t this_ ) {
                           /* Returns 1 if byte_length differs from UTF8CODEPOINT_INVALID_LEN, 0 otherwise. */
                           /* Only byte_length is examined; the code_point value itself is not range-checked here. */
     177        8009 :     return ( UTF8CODEPOINT_INVALID_LEN != this_.byte_length ) ? 1 : 0;
     178             : }
     179             : 
     180          22 : static inline int utf8codepoint_is_unicode( const utf8codepoint_t this_ ) {
                           /* Returns 1 if this_ has a valid byte_length and its code_point is outside all excluded ranges, else 0. */
                           /* Excluded are: 0xd800..0xdfff (the UTF-16 surrogate range), 0xfdd0..0xfdef, */
                           /* every value whose lower 16 bits are 0xfffe or 0xffff, and everything from 0x110000 upwards. */
     181          22 :     int result = 0;
     182          22 :     if ( this_.byte_length != UTF8CODEPOINT_INVALID_LEN ) {
     183          20 :         if ( this_.code_point < 0xd800 ) {
     184           2 :             result = 1;
     185             :         }
     186          18 :         else if (( this_.code_point > 0xdfff ) && ( this_.code_point < 0xfdd0 )) {
     187           2 :             result = 1;
     188             :         }
     189          16 :         else if (( this_.code_point > 0xfdef ) && ( this_.code_point < 0x110000 )) {
                                   /* the mask ignores bit 0, so a single comparison catches both ...fffe and ...ffff in every plane */
     190          12 :             if (( this_.code_point & 0x00fffe ) != 0x00fffe ) {
     191           6 :                 result = 1;
     192             :             }
     193             :         }
     194             :     }
     195          22 :     return result;
     196             : }
     197             : 
     198             : #ifdef __cplusplus
     199             : }
     200             : #endif
     201             : 
     202             : 
     203             : /*
     204             :  * Copyright 2012-2024 Andreas Warnke
     205             :  *
     206             :  * Licensed under the Apache License, Version 2.0 (the "License");
     207             :  * you may not use this file except in compliance with the License.
     208             :  * You may obtain a copy of the License at
     209             :  *
     210             :  *    http://www.apache.org/licenses/LICENSE-2.0
     211             :  *
     212             :  * Unless required by applicable law or agreed to in writing, software
     213             :  * distributed under the License is distributed on an "AS IS" BASIS,
     214             :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     215             :  * See the License for the specific language governing permissions and
     216             :  * limitations under the License.
     217             :  */

Generated by: LCOV version 1.16