LCOV - code coverage report
Current view: top level - u8stream/source/utf8stringbuf - utf8stringbuf.c (source / functions) Hit Total Coverage
Test: crystal-facet-uml_v1.61.0_covts Lines: 119 119 100.0 %
Date: 2024-10-26 21:44:38 Functions: 6 6 100.0 %

          Line data    Source code
       1             : /* File: utf8stringbuf.c; Copyright and License: see below */
       2             : 
       3             : #include <inttypes.h>
       4             : #include "utf8stringbuf/utf8stringbuf.h"
       5             : 
       6             : /* utf8stringbuf_private_empty_buf is constantly 0, but may be overwritten by a 0 - therefore stored in a read-writeable memory page */
       7             : char utf8stringbuf_private_empty_buf[1] = "";
       8             : 
       9             : const char *utf8stringbuf_private_format_signed_64_bit_int = "%" PRIi64;
      10             : 
      11             : const char *utf8stringbuf_private_format_64_bit_hex = "%" PRIx64;
      12             : 
      13             : /*!
      14             :  *  \fn utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source )
      15             :  *  \private
      16             :  */
      17             : /* function to write a code point as utf8, returns the number of bytes written and an error code */
      18             : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len );
      19             : 
      20             : /* utf8 sequences longer or equal 2 bytes start with a byte with 2 highest bits set: 0xc0 */
      21             : /* utf8 sequences longer or equal 3 bytes start with a byte with 3 highest bits set: 0xe0 */
      22             : /* utf8 sequences           equal 4 bytes start with a byte with 4 highest bits set: 0xf0 */
      23             : static const unsigned char utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[5] = { 0, 0, 0xc0, 0xe0, 0xf0 };
      24             : 
      25             : /* Note: There is some magic in the design of utf8 which makes the implementation of this function quite short */
      26          95 : unsigned int utf8_string_buf_private_make_null_termination( utf8stringbuf_t this_ ) {
      27             :     unsigned int truncatedLength;
      28          95 :     int clearAtEnd = 1;
      29             : 
      30         334 :     for ( int searchBackwards = 2; searchBackwards <= 4; searchBackwards ++ ) {
      31         258 :         if ( searchBackwards > this_.size ) {
      32           4 :             break;
      33             :         }
      34         254 :         const char pattern = utf8stringbuf_private_pattern_to_detect_half_utf8_sequences[searchBackwards];
      35         254 :         if ( ( this_.buf[this_.size-searchBackwards] & pattern ) == pattern ) {
      36          15 :             clearAtEnd = searchBackwards;
      37          15 :             break;
      38             :         }
      39             :     }
      40             : 
      41          95 :     truncatedLength = this_.size - clearAtEnd;
      42             :     /* this_.buf[truncatedLength] = '\0'; */ /* Note: some functions like splitIn2 require complete zeroed trailings */
      43          95 :     memset( &(this_.buf[truncatedLength]), '\0', clearAtEnd );
      44          95 :     return truncatedLength;
      45             : }
      46             : 
      47             : #ifdef UTF8STRINGBUF_UNCHECKED_RANGE
      48             : utf8error_t utf8stringbuf_copy_region_from_buf( utf8stringbuf_t this_, const utf8stringbuf_t that, int start, int length ) {
      49             :     utf8error_t result;
      50             :     if (( start < 0 ) || ( length < 0 )) {
      51             :         this_.buf[0] = '\0';
      52             :         result = UTF8ERROR_OUT_OF_RANGE;
      53             :     }
      54             :     else {
      55             :         const int thatLen = utf8stringbuf_get_length( that );
      56             :         if ( start > thatLen ) {
      57             :             this_.buf[0] = '\0';
      58             :             result = UTF8ERROR_OUT_OF_RANGE;
      59             :         }
      60             :         else if ( start+length > thatLen ) {
      61             :             strncpy( this_.buf, &(that.buf[start]), this_.size );
      62             :             utf8_string_buf_private_make_null_termination( this_ );
      63             :             result = UTF8ERROR_OUT_OF_RANGE;
      64             :         }
      65             :         else {
      66             :             if ( length >= this_.size ) {
      67             :                 memcpy( this_.buf, &(that.buf[start]), this_.size-1);
      68             :                 utf8_string_buf_private_make_null_termination( this_ );
      69             :                 result = UTF8ERROR_TRUNCATED;
      70             :             }
      71             :             else {
      72             :                 memcpy( this_.buf, &(that.buf[start]), length );
      73             :                 this_.buf[length] = '\0';
      74             :                 result = UTF8ERROR_SUCCESS;
      75             :             }
      76             :         }
      77             :     }
      78             :     return result;
      79             : }
      80             : #endif  /* UTF8STRINGBUF_UNCHECKED_RANGE */
      81             : 
      82             : #ifdef UTF8STRINGBUF_UNCHECKED_RANGE
      83             : utf8error_t utf8stringbuf_copy_region_from_str( utf8stringbuf_t this_, const char *that, int start, int length ) {
      84             :     utf8error_t result;
      85             :     if (( start < 0 ) || ( length < 0 )) {
      86             :         this_.buf[0] = '\0';
      87             :         result = UTF8ERROR_OUT_OF_RANGE;
      88             :     }
      89             :     else if ( that == NULL ) {
      90             :         this_.buf[0] = '\0';
      91             :         result = UTF8ERROR_NULL_PARAM;
      92             :     }
      93             :     else {
      94             :         const int thatLen = strlen( that );
      95             :         if ( start > thatLen ) {
      96             :             this_.buf[0] = '\0';
      97             :             result = UTF8ERROR_OUT_OF_RANGE;
      98             :         }
      99             :         else if ( start+length > thatLen ) {
     100             :             strncpy( this_.buf, &(that[start]), this_.size );
     101             :             utf8_string_buf_private_make_null_termination( this_ );
     102             :             result = UTF8ERROR_OUT_OF_RANGE;
     103             :         }
     104             :         else {
     105             :             if ( length >= this_.size ) {
     106             :                 memcpy( this_.buf, &(that[start]), this_.size-1);
     107             :                 utf8_string_buf_private_make_null_termination( this_ );
     108             :                 result = UTF8ERROR_TRUNCATED;
     109             :             }
     110             :             else {
     111             :                 memcpy( this_.buf, &(that[start]), length );
     112             :                 this_.buf[length] = '\0';
     113             :                 result = UTF8ERROR_SUCCESS;
     114             :             }
     115             :         }
     116             :     }
     117             :     return result;
     118             : }
     119             : #endif  /* UTF8STRINGBUF_UNCHECKED_RANGE */
     120             : 
     121          14 : utf8error_t utf8stringbuf_append_char( utf8stringbuf_t this_, const uint32_t appendix ) {
     122             :     utf8error_t result;
     123          14 :     const unsigned int start = utf8stringbuf_get_length( this_ );
     124             :     int appendLen;
     125          14 :     result = utf8stringbuf_private_write_char( &(this_.buf[start]), this_.size - start - 1, appendix, &appendLen );
     126          14 :     if ( result == UTF8ERROR_SUCCESS ) {
     127          11 :         this_.buf[start+appendLen] = '\0';
     128             :     }
     129          14 :     return result;
     130             : }
     131             : 
     132           5 : utf8error_t utf8stringbuf_append_wstr( utf8stringbuf_t this_, const wchar_t *appendix ) {
     133           5 :     utf8error_t result = UTF8ERROR_NULL_PARAM;
     134           5 :     if ( appendix != NULL ) {
     135           4 :         unsigned int start = utf8stringbuf_get_length( this_ );
     136           4 :         result = UTF8ERROR_SUCCESS;
     137           9 :         for( ; appendix[0]!=L'\0'; appendix = &(appendix[1]) ) {
     138             :             int appendLen;
     139           7 :             result |= utf8stringbuf_private_write_char( &(this_.buf[start]), this_.size - start - 1, appendix[0], &appendLen );
     140           7 :             if ( result != UTF8ERROR_SUCCESS ) {
     141           2 :                 if ( ( result & UTF8ERROR_TRUNCATED ) != 0 ) {
     142           2 :                     break;
     143             :                 }
     144             :             }
     145           5 :             start = start + appendLen;
     146             :         }
     147           4 :         this_.buf[start] = '\0';
     148             :     }
     149           5 :     return result;
     150             : }
     151             : 
     152          21 : static inline utf8error_t utf8stringbuf_private_write_char( char *destination, unsigned int max_size, const uint32_t source, int *out_len ) {
     153          21 :     *out_len = 0;
     154          21 :     utf8error_t result = UTF8ERROR_TRUNCATED;
     155          21 :     if ( source <= 0x7ff ) {
     156          12 :         if ( source <= 0x7f ) {
     157             :             /* 1 byte character */
     158             :             /* check if there is enough space for the character */
     159          10 :             if ( max_size >= 1 ) {
     160           8 :                 destination[0] = source;
     161           8 :                 *out_len = 1;
     162           8 :                 result = UTF8ERROR_SUCCESS;
     163             :             }
     164             :         }
     165             :         else {
     166             :             /* 2 byte character */
     167           2 :             if ( max_size >= 2 ) {
     168           2 :                 destination[0] = 0xc0 | ( source >> 6 );
     169           2 :                 destination[1] = 0x80 | ( source & 0x3f );
     170           2 :                 *out_len = 2;
     171           2 :                 result = UTF8ERROR_SUCCESS;
     172             :             }
     173             :         }
     174             :     }
     175             :     else {
     176           9 :         if ( source <= 0x10ffff ) {
     177           8 :             if ( source <= 0xffff ) {
     178             :                 /* 3 byte character */
     179           6 :                 if ( max_size >= 3 ) {
     180           4 :                     destination[0] = 0xe0 | ( source >> 12 );
     181           4 :                     destination[1] = 0x80 | (( source >> 6 ) & 0x3f );
     182           4 :                     destination[2] = 0x80 | ( source & 0x3f );
     183           4 :                     *out_len = 3;
     184           4 :                     result = UTF8ERROR_SUCCESS;
     185             :                 }
     186             :             }
     187             :             else {
     188             :                 /* 4 byte character */
     189           2 :                 if ( max_size >= 4 ) {
     190           2 :                     destination[0] = 0xf0 | ( source >> 18 );
     191           2 :                     destination[1] = 0x80 | (( source >> 12 ) & 0x3f );
     192           2 :                     destination[2] = 0x80 | (( source >> 6 ) & 0x3f );
     193           2 :                     destination[3] = 0x80 | ( source & 0x3f );
     194           2 :                     *out_len = 4;
     195           2 :                     result = UTF8ERROR_SUCCESS;
     196             :                 }
     197             :             }
     198             :         }
     199             :         else {
     200             :             /* note: utf8 can not encode more than 21 bits per character, and even there only 0-0x10ffff is allowed. */
     201           1 :             result = UTF8ERROR_NOT_A_CODEPOINT;
     202             :         }
     203             :     }
     204          21 :     return result;
     205             : }
     206             : 
     207     2117172 : utf8error_t utf8_string_buf_private_replace_region_by_str( utf8stringbuf_t this_, unsigned int this_Length, int start, int length, const char *replacement ) {
     208     2117172 :     utf8error_t result = UTF8ERROR_OUT_OF_RANGE;
     209     2117172 :     if (( start >= 0 ) && ( start <= this_Length ) && ( length >= 0 ) && (( start + length ) <= this_Length )) {
     210     2117172 :         result = UTF8ERROR_SUCCESS;
     211     2117172 :         unsigned int replaceLen = ( replacement == NULL ) ? (0) : ( strlen(replacement) );
     212     2117172 :         int tailLen = this_Length - start - length;
     213     2117172 :         if ( length > replaceLen ) {
     214       10006 :             memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen+1 );
     215             :         }
     216     2107166 :         else if ( length < replaceLen ) {
     217       10014 :             if ( ( start + replaceLen + tailLen ) < this_.size ) {
     218       10004 :                 memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen+1 );
     219             :             }
     220          10 :             else if ( ( start + replaceLen ) < this_.size ) {
     221           8 :                 tailLen = this_.size - start - replaceLen - 1;
     222           8 :                 memmove( &(this_.buf[start+replaceLen]), &(this_.buf[start+length]), tailLen );
     223           8 :                 result = UTF8ERROR_TRUNCATED;
     224             :             }
     225             :             else {
     226           2 :                 replaceLen = this_.size - start - 1;
     227           2 :                 result = UTF8ERROR_TRUNCATED;
     228             :             }
     229             :         }
     230             :         else {
     231             :             /* original and replacement strings have equal lengths */
     232             :         }
     233             :         /* replace string */
     234     2117172 :         if ( replaceLen > 0 ) {
     235     2117164 :             const size_t replace_len = (size_t)(replaceLen);
     236     2117164 :             memcpy ( &(this_.buf[start]), replacement, replace_len );
     237             :         }
     238             :         /* terminate string */
     239     2117172 :         if ( result != UTF8ERROR_SUCCESS ) {
     240          10 :             utf8_string_buf_private_make_null_termination( this_ );
     241             :         }
     242             :     }
     243     2117172 :     return result;
     244             : }
     245             : 
     246          20 : utf8error_t utf8stringbuf_replace_all( const utf8stringbuf_t this_, const char *const ((*patterns_and_replacements)[][2]) ) {
     247          20 :     utf8error_t result = UTF8ERROR_NULL_PARAM;
     248             : 
     249             :     /* count input patterns */
     250          20 :     int maxPatternIdx = 0;
     251          20 :     if ( patterns_and_replacements != NULL ) {
     252          19 :         result = UTF8ERROR_SUCCESS;
     253         278 :         for ( maxPatternIdx = 0; (*patterns_and_replacements)[maxPatternIdx][0] != NULL; maxPatternIdx ++ ) {
     254             :         };
     255             :     }
     256             : 
     257             :     /* search patterns */
     258          20 :     unsigned int thisLen = utf8stringbuf_get_length( this_ );
     259     3165827 :     for ( int index = 0; index < thisLen; index ++ ) {
     260     3165807 :         int matchingPatternIdx = -1;
     261     3165807 :         unsigned int remainingLength = thisLen-index;
     262    35692462 :         for ( int patternIdx = 0; ( patternIdx < maxPatternIdx )&&( matchingPatternIdx == -1 ); patternIdx ++ ) {
     263    32526655 :             int finished = 0;
     264    67180507 :             for ( int compareIdx = 0; ( compareIdx <= remainingLength )&&( finished == 0 ); compareIdx ++ ) {
     265    34653852 :                 char cmpChar = (*patterns_and_replacements)[patternIdx][0][compareIdx];
     266    34653852 :                 if ( cmpChar == '\0' ) {
     267     2117252 :                     if ( compareIdx != 0 ) {
     268             :                            /* all characters were equal (and there was at least one) */
     269     2117172 :                            matchingPatternIdx = patternIdx;
     270             :                     }
     271     2117252 :                     finished = 1;
     272             :                 }
     273    34653852 :                 if ( this_.buf[index+compareIdx] != cmpChar ) {
     274             :                     /* difference found */
     275    32526647 :                     finished = 1;
     276             :                 }
     277             :             }
     278             :         }
     279             :         /* replace pattern */
     280     3165807 :         if ( matchingPatternIdx != -1 ) {
     281     2117172 :             const char * pattern = (*patterns_and_replacements)[matchingPatternIdx][0];
     282     2117172 :             int patternLen = strlen( pattern );
     283     2117172 :             const char * replacement = (*patterns_and_replacements)[matchingPatternIdx][1];
     284     2117172 :             int replaceLen = 0;
     285     2117172 :             if ( replacement != NULL ) {
     286     2117164 :                 replaceLen = strlen(replacement);
     287             :             }
     288             :             utf8error_t replaceErr;
     289     2117172 :             replaceErr = utf8_string_buf_private_replace_region_by_str( this_, thisLen, index, patternLen, replacement );
     290     2117172 :             if ( replaceErr != UTF8ERROR_SUCCESS ) {
     291          10 :                 result = UTF8ERROR_TRUNCATED;
     292          10 :                 thisLen = utf8stringbuf_get_length( this_ );
     293             :             }
     294             :             else {
     295     2117162 :                 thisLen = thisLen - patternLen + replaceLen;
     296             :             }
     297     2117172 :             index = index + replaceLen - 1;
     298             :         }
     299             :     }
     300             : 
     301          20 :     return result;
     302             : }
     303             : 
     304             : 
     305             : /*
     306             :  * Copyright 2012-2024 Andreas Warnke
     307             :  *
     308             :  * Licensed under the Apache License, Version 2.0 (the "License");
     309             :  * you may not use this file except in compliance with the License.
     310             :  * You may obtain a copy of the License at
     311             :  *
     312             :  *    http://www.apache.org/licenses/LICENSE-2.0
     313             :  *
     314             :  * Unless required by applicable law or agreed to in writing, software
     315             :  * distributed under the License is distributed on an "AS IS" BASIS,
     316             :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     317             :  * See the License for the specific language governing permissions and
     318             :  * limitations under the License.
     319             :  */

Generated by: LCOV version 1.16