LCOV - crystal-facet-uml_v1.63.2_covts - u8stream/include/utf8stringbuf/utf8codepoint.inl

LCOV - code coverage report

Current view:	top level - u8stream/include/utf8stringbuf - utf8codepoint.inl (source / functions)		Coverage	Total	Hit
Test:	crystal-facet-uml_v1.63.2_covts	Lines:	100.0 %	93	93
Test Date:	2025-05-01 10:10:14	Functions:	100.0 %	7	7

            Line data    Source code

       1              : /* File: utf8codepoint.inl; Copyright and License: see below */
       2              : 
       3              : #include <stdint.h>
       4              : #include <inttypes.h>
       5              : #include <string.h>
       6              : #include <assert.h>
       7              : 
       8              : #ifdef __cplusplus
       9              : extern "C" {
      10              : #endif
      11              : 
      12              : /*!
      13              :  *  \enum utf8codepoint_enum
      14              :  *  \private
      15              :  */
      16              : /*  enumeration for invalid code points */
      17              : enum utf8codepoint_enum {UTF8CODEPOINT_INVALID_LEN=0,};
      18              : 
      19         4133 : static inline utf8codepoint_t utf8codepoint( uint32_t code_point ) {
      20              :     utf8codepoint_t result;
      21              :     /* for balanced performance between standard and worst cases, */
      22              :     /* this_ is implemented as asymmetric decision tree: */
      23              :     /* in the best case, we have 2 comparisons, in the worst case 3 */
      24         4133 :     result.byte_length =
      25              :             ( code_point <= 0x7ff )
      26              :             ? (
      27              :                     ( code_point <= 0x7f ) ? 1 : 2
      28              :             )
      29         4133 :             : (
      30              :                     ( code_point <= 0x10ffff )
      31              :                     ? (
      32              :                             ( code_point <= 0xffff ) ? 3 : 4
      33              :                     )
      34              :                     : UTF8CODEPOINT_INVALID_LEN
      35              :             );
      36         4133 :     result.code_point = code_point;
      37         4133 :     return result;
      38              : }
      39              : 
      40         3879 : static inline utf8codepoint_t utf8codepoint_new( const char *that, unsigned int max_size ) {
      41         3879 :     utf8codepoint_t result = { UTF8CODEPOINT_INVALID_LEN, 0x0, };
      42         3879 :     if (( that != NULL )&&( max_size > 0 ))
      43              :     {
      44         3877 :         const unsigned char firstByte = (const unsigned char) (that[0]);
      45         3877 :         if (( 0x80 & firstByte ) == 0x00 )
      46              :         {
      47              :             /* 7-bit ASCII character */
      48         3856 :             result.byte_length = 1;
      49         3856 :             result.code_point = firstByte;
      50              :         }
      51           21 :         else if ( firstByte < 0xe0 )
      52              :         {
      53            8 :             if ( max_size >= 2 )
      54              :             {
      55            7 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      56            7 :                 if (( ( 0xe0 & firstByte ) == 0xc0 ) && ( ( 0xc0 & secondByte ) == 0x80 ))
      57              :                 {
      58              :                     /* first and second byte are valid */
      59            3 :                     result.byte_length = 2;
      60            3 :                     result.code_point = (((uint32_t)(firstByte & 0x1f))<<6)
      61            3 :                                         |(secondByte & 0x3f);
      62              :                 }
      63              :             }
      64              :         }
      65           13 :         else if ( firstByte < 0xf0 )
      66              :         {
      67            7 :             if ( max_size >= 3 )
      68              :             {
      69            7 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      70            7 :                 const unsigned char thirdByte = (const unsigned char) (that[2]);
      71            7 :                 if (( ( 0xc0 & secondByte ) == 0x80 ) && ( ( 0xc0 & thirdByte ) == 0x80 ))
      72              :                 {
      73              :                     /* second and third bytes are valid */
      74            7 :                     result.byte_length = 3;
      75            7 :                     result.code_point = (((uint32_t)(firstByte & 0x0f))<<12)
      76            7 :                                         |(((uint32_t)(secondByte & 0x3f))<<6)
      77            7 :                                         |(thirdByte & 0x3f);
      78              :                 }
      79              :             }
      80              :         }
      81            6 :         else if ( firstByte < 0xf8 )
      82              :         {
      83            6 :             if ( max_size >= 4 )
      84              :             {
      85            4 :                 const unsigned char secondByte = (const unsigned char) (that[1]);
      86            4 :                 const unsigned char thirdByte = (const unsigned char) (that[2]);
      87            4 :                 const unsigned char fourthByte = (const unsigned char) (that[3]);
      88            4 :                 if (( ( 0xc0 & secondByte ) == 0x80 )
      89            4 :                    && ( ( 0xc0 & thirdByte ) == 0x80 )
      90            4 :                    && ( ( 0xc0 & fourthByte ) == 0x80 ))
      91              :                 {
      92              :                     /* second, third and fourth bytes are valid */
      93            3 :                     result.byte_length = 4;
      94            3 :                     result.code_point = (((uint32_t)(firstByte & 0x07))<<18)
      95            3 :                                         |(((uint32_t)(secondByte & 0x3f))<<12)
      96            3 :                                         |(((uint32_t)(thirdByte & 0x3f))<<6)
      97            3 :                                         |(fourthByte & 0x3f);
      98            3 :                     if ( result.code_point > 0x10ffff )
      99              :                     {
     100              :                         /* invalid */
     101            1 :                         result.byte_length = UTF8CODEPOINT_INVALID_LEN;
     102              :                     }
     103              :                 }
     104              :             }
     105              :         }
     106              :     }
     107         3879 :     return result;
     108              : }
     109              : 
     110         7720 : static inline uint32_t utf8codepoint_get_char( const utf8codepoint_t *this_ ) {
     111         7720 :     return (*this_).code_point;
     112              : }
     113              : 
     114         7737 : static inline unsigned int utf8codepoint_get_length( const utf8codepoint_t *this_ ) {
     115         7737 :     return (*this_).byte_length;
     116              : }
     117              : 
     118         3847 : static inline utf8codepointseq_t utf8codepoint_get_utf8( const utf8codepoint_t *this_ ) {
     119              :     utf8codepointseq_t result;
     120         3847 :     const uint32_t code_point = (*this_).code_point;
     121              : 
     122         3847 :     if ( code_point <= 0x7ff )
     123              :     {
     124         3843 :         if ( code_point <= 0x7f )
     125              :         {
     126         3842 :             result.seq[0] = code_point;
     127         3842 :             result.seq[1] = '\0';
     128         3842 :             result.seq[2] = '\0';
     129         3842 :             result.seq[3] = '\0';
     130         3842 :             assert( (*this_).byte_length == 1 );
     131              :         }
     132              :         else
     133              :         {
     134            1 :             result.seq[0] = (0xc0 | (code_point>>6));
     135            1 :             result.seq[1] = (0x80 | (code_point&0x3f));
     136            1 :             result.seq[2] = '\0';
     137            1 :             result.seq[3] = '\0';
     138            1 :             assert( (*this_).byte_length == 2 );
     139              :         }
     140              :     }
     141              :     else
     142              :     {
     143            4 :         if ( code_point <= 0x10ffff )
     144              :         {
     145            3 :             if ( code_point <= 0xffff )
     146              :             {
     147            2 :                 result.seq[0] = (0xe0 | (code_point>>12));
     148            2 :                 result.seq[1] = (0x80 | ((code_point>>6)&0x3f));
     149            2 :                 result.seq[2] = (0x80 | (code_point&0x3f));
     150            2 :                 result.seq[3] = '\0';
     151            2 :                 assert( (*this_).byte_length == 3 );
     152              :             }
     153              :             else
     154              :             {
     155            1 :                 result.seq[0] = (0xf0 | (code_point>>18));
     156            1 :                 result.seq[1] = (0x80 | ((code_point>>12)&0x3f));
     157            1 :                 result.seq[2] = (0x80 | ((code_point>>6)&0x3f));
     158            1 :                 result.seq[3] = (0x80 | (code_point&0x3f));
     159            1 :                 assert( (*this_).byte_length == 4 );
     160              :             }
     161              :         }
     162              :         else
     163              :         {
     164              :             /* UTF8CODEPOINT_INVALID_LEN */
     165            1 :             result.seq[0] = '\0';
     166            1 :             result.seq[1] = '\0';
     167            1 :             result.seq[2] = '\0';
     168            1 :             result.seq[3] = '\0';
     169            1 :             assert( (*this_).byte_length == 0 );
     170              :         }
     171              :     }
     172              : 
     173         3847 :     return result;
     174              : }
     175              : 
     176         8009 : static inline int utf8codepoint_is_valid( const utf8codepoint_t *this_ ) {
     177         8009 :     return ( UTF8CODEPOINT_INVALID_LEN != (*this_).byte_length ) ? 1 : 0;
     178              : }
     179              : 
     180           22 : static inline int utf8codepoint_is_unicode( const utf8codepoint_t *this_ ) {
     181           22 :     int result = 0;
     182           22 :     if ( (*this_).byte_length != UTF8CODEPOINT_INVALID_LEN ) {
     183           20 :         if ( (*this_).code_point < 0xd800 ) {
     184            2 :             result = 1;
     185              :         }
     186           18 :         else if (( (*this_).code_point > 0xdfff ) && ( (*this_).code_point < 0xfdd0 )) {
     187            2 :             result = 1;
     188              :         }
     189           16 :         else if (( (*this_).code_point > 0xfdef ) && ( (*this_).code_point < 0x110000 )) {
     190           12 :             if (( (*this_).code_point & 0x00fffe ) != 0x00fffe ) {
     191            6 :                 result = 1;
     192              :             }
     193              :         }
     194              :     }
     195           22 :     return result;
     196              : }
     197              : 
     198              : #ifdef __cplusplus
     199              : }
     200              : #endif
     201              : 
     202              : 
     203              : /*
     204              :  * Copyright 2012-2025 Andreas Warnke
     205              :  *
     206              :  * Licensed under the Apache License, Version 2.0 (the "License");
     207              :  * you may not use this file except in compliance with the License.
     208              :  * You may obtain a copy of the License at
     209              :  *
     210              :  *    http://www.apache.org/licenses/LICENSE-2.0
     211              :  *
     212              :  * Unless required by applicable law or agreed to in writing, software
     213              :  * distributed under the License is distributed on an "AS IS" BASIS,
     214              :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     215              :  * See the License for the specific language governing permissions and
     216              :  * limitations under the License.
     217              :  */

Generated by: LCOV version 2.0-1