12c7b1c5 (kx 2023-03-24 02:53:04 +0300 1)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 2) #ifdef HAVE_CONFIG_H
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 3) #include <config.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 4) #endif
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 5)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 6) #include <stdlib.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 7) #include <stdio.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 8) #include <unistd.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 9) #include <string.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 10) #include <stdarg.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 11) #include <limits.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 12) #include <locale.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 13) #include <wchar.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 14) #include <wctype.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 15)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 16) #include <defs.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 17) #include <utf8ing.h>
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 18)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 19)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 20) static const ucs4_t replacement_char = 0xfffd;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 21) static const ucs4_t maximum_ucs4 = 0x7fffffff;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 22)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 23) static const int half_shift = 10;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 24) static const ucs4_t half_base = 0x0010000;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 25)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 26) static const ucs4_t surrogate_high_start = 0xd800;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 27) static const ucs4_t surrogate_high_end = 0xdbff;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 28) static const ucs4_t surrogate_low_start = 0xdc00;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 29) static const ucs4_t surrogate_low_end = 0xdfff;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 30)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 31) static utf8_t
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 32) first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 33)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 34)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 35) /***************************************************************
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 36) static copy_ucs4_to_utf8()
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 37)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 38) Переводит строку символов UCS4( src ) в UTF8( dest ).
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 39)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 40) Возвращаемое значение:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 41) Количество байт, реально записанное в DEST.
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 42)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 43) NOTE:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 44) Выход за пределы памяти, выделенной под указатель DEST
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 45) не контролируются.
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 46) Подразумевается, что строка SRC имеет null-терминатор.
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 47) ***************************************************************/
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 48) int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 49) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 50) utf8_t target[7];
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 51) utf8_t *ptr;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 52) int count = 0;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 53)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 54) while( *src )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 55) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 56) ucs4_t c;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 57) int bytes_to_write = 0;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 58) const ucs4_t byte_mask = 0xbf;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 59) const ucs4_t byte_mark = 0x80;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 60)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 61) c = *src++;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 62)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 63) if( c >= surrogate_high_start &&
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 64) c <= surrogate_high_end && *src )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 65) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 66) ucs4_t c2 = *src;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 67)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 68) if( c2 >= surrogate_low_start &&
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 69) c2 <= surrogate_low_end )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 70) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 71) c = ((c - surrogate_high_start) << half_shift) +
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 72) (c2 - surrogate_low_start) + half_base;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 73) ++src;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 74) }
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 75) }
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 76)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 77) if( c < 0x80 ) bytes_to_write = 1;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 78) else if( c < 0x800 ) bytes_to_write = 2;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 79) else if( c < 0x10000 ) bytes_to_write = 3;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 80) else if( c < 0x200000 ) bytes_to_write = 4;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 81) else if( c < 0x4000000 ) bytes_to_write = 5;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 82) else if( c <= maximum_ucs4 ) bytes_to_write = 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 83) else
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 84) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 85) bytes_to_write = 2; c = replacement_char;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 86) }
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 87)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 88) ptr = &target[0] + bytes_to_write;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 89)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 90) switch( bytes_to_write )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 91) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 92) case 6:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 93) *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 94) case 5:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 95) *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 96) case 4:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 97) *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 98) case 3:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 99) *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 100) case 2:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 101) *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 102) case 1:
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 103) *--ptr = c | first_byte_mark[bytes_to_write];
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 104) }
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 105)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 106) ptr = &target[0];
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 107)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 108) while( bytes_to_write > 0 )
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 109) {
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 110) *dest++ = *ptr++; /* write byte */
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 111) --bytes_to_write;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 112) ++count;
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 113) }
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 114)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 115) } /* End while( *src ) */
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 116)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 117) *dest = (utf8_t)0; /* null terminator */
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 118)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 119) return( count );
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 120)
12c7b1c5 (kx 2023-03-24 02:53:04 +0300 121) } /* End of static copy_ucs4_to_utf8() */