955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2) * Copyright (c) 2014 SGI.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3) * All rights reserved.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 4) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 5) * This program is free software; you can redistribute it and/or
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 6) * modify it under the terms of the GNU General Public License as
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 7) * published by the Free Software Foundation.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 8) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 9) * This program is distributed in the hope that it would be useful,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 10) * but WITHOUT ANY WARRANTY; without even the implied warranty of
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 11) * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 12) * GNU General Public License for more details.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 13) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 14) * You should have received a copy of the GNU General Public License
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 15) * along with this program; if not, write the Free Software Foundation,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 16) * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 17) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 18)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 19) /* Generator for a compact trie for unicode normalization */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 20)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 21) #include <sys/types.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 22) #include <stddef.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 23) #include <stdlib.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 24) #include <stdio.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 25) #include <assert.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 26) #include <string.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 27) #include <unistd.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 28) #include <errno.h>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 29)
/*
 * Default names of the in- and output files.
 *
 * Every default is spelled out once as a *_NAME macro and is then used
 * to initialise the global pointer of the matching lower-case name.
 * The pointers themselves are not const-qualified, only the characters
 * they point at are.
 */

#define AGE_NAME "DerivedAge.txt"
const char *age_name = AGE_NAME;

#define CCC_NAME "DerivedCombiningClass.txt"
const char *ccc_name = CCC_NAME;

#define PROP_NAME "DerivedCoreProperties.txt"
const char *prop_name = PROP_NAME;

#define DATA_NAME "UnicodeData.txt"
const char *data_name = DATA_NAME;

#define FOLD_NAME "CaseFolding.txt"
const char *fold_name = FOLD_NAME;

#define NORM_NAME "NormalizationCorrections.txt"
const char *norm_name = NORM_NAME;

#define TEST_NAME "NormalizationTest.txt"
const char *test_name = TEST_NAME;

#define UTF8_NAME "utf8data.h"
const char *utf8_name = UTF8_NAME;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 49)
/* Global "verbose" setting; starts out as 0. */
int verbose = 0;

/* An arbitrary line size limit on input lines. */
#define LINESIZE 1024

/* File-scope buffers, each one LINESIZE bytes long. */
char line[LINESIZE], buf0[LINESIZE], buf1[LINESIZE], buf2[LINESIZE],
     buf3[LINESIZE];

/* NOTE(review): presumably the program name taken from argv[0] - confirm. */
const char *argv0;

/*
 * Number of elements in the array x.  Only meaningful when x is a real
 * array; for a pointer it divides the pointer size by the element size.
 */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 64)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 65) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 66)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 67) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 68) * Unicode version numbers consist of three parts: major, minor, and a
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 69) * revision. These numbers are packed into an unsigned int to obtain
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 70) * a single version number.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 71) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 72) * To save space in the generated trie, the unicode version is not
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 73) * stored directly, instead we calculate a generation number from the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 74) * unicode versions seen in the DerivedAge file, and use that as an
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 75) * index into a table of unicode versions.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 76) */
/* Left shifts that place the major and minor parts in a packed version. */
#define UNICODE_MAJ_SHIFT	(16)
#define UNICODE_MIN_SHIFT	(8)

/*
 * Upper limits for the three parts, written as the all-ones value of
 * the integer type each limit is cast to.
 */
#define UNICODE_MAJ_MAX	((unsigned short)-1)
#define UNICODE_MIN_MAX	((unsigned char)-1)
#define UNICODE_REV_MAX	((unsigned char)-1)

/*
 * Pack a (major, minor, revision) triple into one unsigned int: the
 * revision sits in the low bits, the minor and major parts are shifted
 * up by UNICODE_MIN_SHIFT and UNICODE_MAJ_SHIFT respectively.
 */
#define UNICODE_AGE(MAJ,MIN,REV)				\
	(((unsigned int)(REV)) |				\
	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |		\
	 ((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT))

/* NOTE(review): presumably the table of unicode versions and its length - confirm. */
unsigned int *ages;
int ages_count;

/* NOTE(review): presumably the newest packed unicode version in use - confirm. */
unsigned int unicode_maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 93)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 94) static int age_valid(unsigned int major, unsigned int minor,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 95) unsigned int revision)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 96) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 97) if (major > UNICODE_MAJ_MAX)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 98) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 99) if (minor > UNICODE_MIN_MAX)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 100) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 101) if (revision > UNICODE_REV_MAX)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 102) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 103) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 104) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 105)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 106) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 107)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 108) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 109) * utf8trie_t
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 110) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 111) * A compact binary tree, used to decode UTF-8 characters.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 112) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 113) * Internal nodes are one byte for the node itself, and up to three
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 114) * bytes for an offset into the tree. The first byte contains the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 115) * following information:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 116) * NEXTBYTE - flag - advance to next byte if set
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 117) * BITNUM - 3 bit field - the bit number to be tested
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 118) * OFFLEN - 2 bit field - number of bytes in the offset
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 119) * if offlen == 0 (non-branching node)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 120) * RIGHTPATH - 1 bit field - set if the following node is for the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 121) * right-hand path (tested bit is set)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 122) * TRIENODE - 1 bit field - set if the following node is an internal
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 123) * node, otherwise it is a leaf node
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 124) * if offlen != 0 (branching node)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 125) * LEFTNODE - 1 bit field - set if the left-hand node is internal
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 126) * RIGHTNODE - 1 bit field - set if the right-hand node is internal
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 127) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 128) * Due to the way utf8 works, there cannot be branching nodes with
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 129) * NEXTBYTE set, and moreover those nodes always have a righthand
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 130) * descendant.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 131) */
/* Trie nodes are stored as plain bytes. */
typedef unsigned char utf8trie_t;

/* Masks for the fields in the first byte of an internal node. */
#define BITNUM		0x07	/* 3 bit field: number of the bit to test */
#define NEXTBYTE	0x08	/* flag: advance to the next byte if set */
#define OFFLEN_SHIFT	4	/* right shift that brings OFFLEN down to bit 0 */
#define OFFLEN		0x30	/* 2 bit field: number of bytes in the offset */

/* Meaning of the top two bits when offlen == 0 (non-branching node). */
#define RIGHTPATH	0x40	/* following node is for the right-hand path */
#define TRIENODE	0x80	/* following node is internal, else a leaf */

/* Meaning of the same two bits when offlen != 0 (branching node). */
#define RIGHTNODE	0x40	/* right-hand node is internal */
#define LEFTNODE	0x80	/* left-hand node is internal */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 141)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 142) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 143) * utf8leaf_t
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 144) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 145) * The leaves of the trie are embedded in the trie, and so use the same
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 146) * underlying datatype, unsigned char.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 147) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 148) * leaf[0]: The unicode version, stored as a generation number that is
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 149) * an index into utf8agetab[]. With this we can filter code
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 150) * points based on the unicode version in which they were
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 151) * defined. The CCC of a non-defined code point is 0.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 152) * leaf[1]: Canonical Combining Class. During normalization, we need
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 153) * to do a stable sort into ascending order of all characters
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 154) * with a non-zero CCC that occur between two characters with
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 155) * a CCC of 0, or at the beginning or end of a string.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 156) * The unicode standard guarantees that all CCC values are
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 157) * between 0 and 254 inclusive, which leaves 255 available as
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 158) * a special value.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 159) * Code points with CCC 0 are known as stoppers.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 160) * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 161) * start of a NUL-terminated string that is the decomposition
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 162) * of the character.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 163) * The CCC of a decomposable character is the same as the CCC
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 164) * of the first character of its decomposition.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 165) * Some characters decompose as the empty string: these are
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 166) * characters with the Default_Ignorable_Code_Point property.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 167) * These do affect normalization, as they all have CCC 0.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 168) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 169) * The decompositions in the trie have been fully expanded.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 170) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 171) * Casefolding, if applicable, is also done using decompositions.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 172) */
/* Leaves are byte sequences, just like the trie that embeds them. */
typedef unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])			/* leaf[0]: generation number */
#define LEAF_CCC(LEAF)	((LEAF)[1])			/* leaf[1]: combining class */
#define LEAF_STR(LEAF)	((const char*)((LEAF) + 2))	/* leaf[2]...: string */

/* Largest value that fits in the one-byte generation field. */
#define MAXGEN		(255)

/* Canonical Combining Class values lie between MINCCC and MAXCCC. */
#define MINCCC		(0)
#define MAXCCC		(254)

/* CCC values with a special meaning. */
#define STOPPER		(0)	/* code points with CCC 0 are stoppers */
#define DECOMPOSE	(255)	/* leaf[2] starts a decomposition string */

/* The value 255 converted to char. */
#define HANGUL		((char)(255))

#define UTF8HANGULLEAF	(12)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 188)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 189) struct tree;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 190) static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 191) const char *, size_t);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 192) static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 193)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 194) unsigned char *utf8data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 195) size_t utf8data_size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 196)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 197) utf8trie_t *nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 198) utf8trie_t *nfdicf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 199)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 200) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 201)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 202) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 203) * UTF8 valid ranges.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 204) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 205) * The UTF-8 encoding spreads the bits of a 32bit word over several
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 206) * bytes. This table gives the ranges that can be held and how they'd
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 207) * be represented.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 208) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 209) * 0x00000000 0x0000007F: 0xxxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 210) * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 211) * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 212) * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 213) * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 214) * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 215) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 216) * There is an additional requirement on UTF-8, in that only the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 217) * shortest representation of a 32bit value is to be used. A decoder
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 218) * must not decode sequences that do not satisfy this requirement.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 219) * Thus the allowed ranges have a lower bound.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 220) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 221) * 0x00000000 0x0000007F: 0xxxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 222) * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 223) * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 224) * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 225) * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 226) * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 227) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 228) * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 229) * 17 planes of 65536 values. This limits the sequences actually seen
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 230) * even more, to just the following.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 231) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 232) * 0 - 0x7f: 0 0x7f
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 233) * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 234) * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 235) * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 236) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 237) * Even within those ranges not all values are allowed: the surrogates
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 238) * 0xd800 - 0xdfff should never be seen.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 239) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 240) * Note that the longest sequence seen with valid usage is 4 bytes,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 241) * the same as a single UTF-32 character. This makes the UTF-8
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 242) * representation of Unicode strictly smaller than UTF-32.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 243) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 244) * The shortest sequence requirement was introduced by:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 245) * Corrigendum #1: UTF-8 Shortest Form
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 246) * It can be found here:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 247) * http://www.unicode.org/versions/corrigendum1.html
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 248) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 249) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 250)
/*
 * Lead byte of a 2, 3 or 4 byte sequence: masking the byte with
 * UTF8_x_MASK yields UTF8_x_BITS (110xxxxx, 1110xxxx, 11110xxx).
 */
#define UTF8_2_MASK	0xE0
#define UTF8_2_BITS	0xC0
#define UTF8_3_MASK	0xF0
#define UTF8_3_BITS	0xE0
#define UTF8_4_MASK	0xF8
#define UTF8_4_BITS	0xF0

/* Continuation byte: 10xxxxxx. */
#define UTF8_N_MASK	0xC0
#define UTF8_N_BITS	0x80

/* Each continuation byte carries UTF8_V_SHIFT value bits. */
#define UTF8_V_MASK	0x3F
#define UTF8_V_SHIFT	6
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 261)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 262) static int utf8encode(char *str, unsigned int val)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 263) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 264) int len;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 265)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 266) if (val < 0x80) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 267) str[0] = val;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 268) len = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 269) } else if (val < 0x800) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 270) str[1] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 271) str[1] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 272) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 273) str[0] = val;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 274) str[0] |= UTF8_2_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 275) len = 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 276) } else if (val < 0x10000) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 277) str[2] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 278) str[2] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 279) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 280) str[1] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 281) str[1] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 282) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 283) str[0] = val;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 284) str[0] |= UTF8_3_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 285) len = 3;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 286) } else if (val < 0x110000) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 287) str[3] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 288) str[3] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 289) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 290) str[2] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 291) str[2] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 292) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 293) str[1] = val & UTF8_V_MASK;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 294) str[1] |= UTF8_N_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 295) val >>= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 296) str[0] = val;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 297) str[0] |= UTF8_4_BITS;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 298) len = 4;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 299) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 300) printf("%#x: illegal val\n", val);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 301) len = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 302) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 303) return len;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 304) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 305)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 306) static unsigned int utf8decode(const char *str)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 307) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 308) const unsigned char *s = (const unsigned char*)str;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 309) unsigned int unichar = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 310)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 311) if (*s < 0x80) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 312) unichar = *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 313) } else if (*s < UTF8_3_BITS) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 314) unichar = *s++ & 0x1F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 315) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 316) unichar |= *s & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 317) } else if (*s < UTF8_4_BITS) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 318) unichar = *s++ & 0x0F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 319) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 320) unichar |= *s++ & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 321) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 322) unichar |= *s & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 323) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 324) unichar = *s++ & 0x0F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 325) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 326) unichar |= *s++ & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 327) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 328) unichar |= *s++ & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 329) unichar <<= UTF8_V_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 330) unichar |= *s & 0x3F;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 331) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 332) return unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 333) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 334)
/*
 * Return 1 when unichar lies in the range 0..0x10FFFF, 0 otherwise.
 * This is a pure range check; no value inside the range is rejected.
 */
static int utf32valid(unsigned int unichar)
{
	return unichar <= 0x10FFFF;
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 339)
/*
 * True when U lies in U+AC00..U+D7A3 inclusive.  U is evaluated up to
 * twice, so it must not have side effects.
 */
#define HANGUL_SYLLABLE(U)	(0xAC00 <= (U) && (U) <= 0xD7A3)

/*
 * Tags stored in tree->childnode, node->leftnode and node->rightnode:
 * they tell whether the matching pointer refers to another internal
 * node or directly to a leaf.
 */
#define NODE	1
#define LEAF	0
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 344)
/*
 * One lookup tree.
 *
 * root refers to a struct node when childnode is NODE and directly to a
 * leaf when childnode is LEAF; tree_walk() depends on that tag.  The
 * leaf_* callbacks let the tree code handle leaves as opaque pointers.
 */
struct tree {
	void *root;		/* top of the tree, interpreted per childnode */
	int childnode;		/* NODE or LEAF: what root points at */
	const char *type;	/* printed as "<type>_<maxage>" by tree_walk() */
	unsigned int maxage;	/* printed in hex after type; presumably an age cutoff - TODO confirm */
	struct tree *next;	/* another tree; how it is chained is not visible here */

	/* Leaf callbacks.  Only leaf_print is exercised in this part of the file. */
	int (*leaf_equal)(void *, void *);	/* presumably: are two leaves identical? */
	void (*leaf_print)(void *, int);	/* print a leaf at the given indent */
	int (*leaf_mark)(void *);		/* semantics defined by the leaf type */
	int (*leaf_size)(void *);		/* presumably the emitted size of a leaf */
	int *(*leaf_index)(struct tree *, void *);	/* presumably a slot in leafindex[] */
	unsigned char *(*leaf_emit)(void *, unsigned char *);	/* presumably writes a leaf to a buffer */

	int leafindex[0x110000];	/* 0x110000 slots; presumably one per code point - TODO confirm */
	int index;			/* meaning not visible in this part of the file */
};
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 360)
/*
 * Internal node of a tree.  Each node tests one bit of the key and
 * branches right when the bit is set, left when it is clear.
 */
struct node {
	/* Bookkeeping set up by alloc_node(); used outside this part of the file. */
	int index;		/* starts at 0 */
	int offset;		/* starts at -1 */
	int mark;		/* starts at 0 */
	int size;		/* starts at 4 */

	struct node *parent;	/* NULL for the root node */
	void *left;		/* child taken when the tested bit is 0 */
	void *right;		/* child taken when the tested bit is 1 */

	unsigned char bitnum;	/* bit tested; bitnum & 7 is the bit within the key byte */
	unsigned char nextbyte;	/* non-zero: step to the next key byte before testing */
	unsigned char leftnode;	/* NODE or LEAF: what left points at */
	unsigned char rightnode;	/* NODE or LEAF: what right points at */

	/* Printed as "bits"/"mask" by tree_walk(); maintained elsewhere. */
	unsigned int keybits;
	unsigned int keymask;
};
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 376)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 377) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 378) * Example lookup function for a tree.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 379) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 380) static void *lookup(struct tree *tree, const char *key)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 381) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 382) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 383) void *leaf = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 384)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 385) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 386) while (!leaf && node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 387) if (node->nextbyte)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 388) key++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 389) if (*key & (1 << (node->bitnum & 7))) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 390) /* Right leg */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 391) if (node->rightnode == NODE) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 392) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 393) } else if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 394) leaf = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 395) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 396) node = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 397) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 398) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 399) /* Left leg */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 400) if (node->leftnode == NODE) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 401) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 402) } else if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 403) leaf = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 404) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 405) node = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 406) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 407) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 408) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 409)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 410) return leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 411) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 412)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 413) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 414) * A simple non-recursive tree walker: keep track of visits to the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 415) * left and right branches in the leftmask and rightmask.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 416) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 417) static void tree_walk(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 418) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 419) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 420) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 421) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 422) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 423) int indent = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 424) int nodes, singletons, leaves;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 425)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 426) nodes = singletons = leaves = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 427)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 428) printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 429) if (tree->childnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 430) assert(tree->root);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 431) tree->leaf_print(tree->root, indent);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 432) leaves = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 433) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 434) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 435) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 436) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 437) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 438) printf("%*snode @ %p bitnum %d nextbyte %d"
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 439) " left %p right %p mask %x bits %x\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 440) indent, "", node,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 441) node->bitnum, node->nextbyte,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 442) node->left, node->right,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 443) node->keymask, node->keybits);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 444) nodes += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 445) if (!(node->left && node->right))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 446) singletons += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 447)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 448) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 449) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 450) if ((leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 451) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 452) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 453) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 454) tree->leaf_print(node->left,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 455) indent+1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 456) leaves += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 457) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 458) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 459) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 460) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 461) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 462) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 463) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 464) if ((rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 465) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 466) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 467) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 468) tree->leaf_print(node->right,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 469) indent+1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 470) leaves += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 471) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 472) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 473) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 474) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 475) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 476) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 477) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 478) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 479) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 480) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 481) indent -= 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 482) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 483) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 484) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 485) printf("nodes %d leaves %d singletons %d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 486) nodes, leaves, singletons);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 487) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 488)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 489) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 490) * Allocate an initialize a new internal node.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 491) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 492) static struct node *alloc_node(struct node *parent)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 493) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 494) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 495) int bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 496)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 497) node = malloc(sizeof(*node));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 498) node->left = node->right = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 499) node->parent = parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 500) node->leftnode = NODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 501) node->rightnode = NODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 502) node->keybits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 503) node->keymask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 504) node->mark = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 505) node->index = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 506) node->offset = -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 507) node->size = 4;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 508)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 509) if (node->parent) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 510) bitnum = parent->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 511) if ((bitnum & 7) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 512) node->bitnum = bitnum + 7 + 8;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 513) node->nextbyte = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 514) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 515) node->bitnum = bitnum - 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 516) node->nextbyte = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 517) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 518) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 519) node->bitnum = 7;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 520) node->nextbyte = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 521) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 522)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 523) return node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 524) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 525)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 526) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 527) * Insert a new leaf into the tree, and collapse any subtrees that are
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 528) * fully populated and end in identical leaves. A nextbyte tagged
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 529) * internal node will not be removed to preserve the tree's integrity.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 530) * Note that due to the structure of utf8, no nextbyte tagged node
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 531) * will be a candidate for removal.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 532) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 533) static int insert(struct tree *tree, char *key, int keylen, void *leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 534) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 535) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 536) struct node *parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 537) void **cursor;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 538) int keybits;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 539)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 540) assert(keylen >= 1 && keylen <= 4);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 541)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 542) node = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 543) cursor = &tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 544) keybits = 8 * keylen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 545)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 546) /* Insert, creating path along the way. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 547) while (keybits) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 548) if (!*cursor)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 549) *cursor = alloc_node(node);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 550) node = *cursor;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 551) if (node->nextbyte)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 552) key++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 553) if (*key & (1 << (node->bitnum & 7)))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 554) cursor = &node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 555) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 556) cursor = &node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 557) keybits--;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 558) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 559) *cursor = leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 560)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 561) /* Merge subtrees if possible. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 562) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 563) if (*key & (1 << (node->bitnum & 7)))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 564) node->rightnode = LEAF;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 565) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 566) node->leftnode = LEAF;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 567) if (node->nextbyte)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 568) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 569) if (node->leftnode == NODE || node->rightnode == NODE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 570) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 571) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 572) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 573) /* Compare */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 574) if (! tree->leaf_equal(node->left, node->right))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 575) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 576) /* Keep left, drop right leaf. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 577) leaf = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 578) /* Check in parent */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 579) parent = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 580) if (!parent) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 581) /* root of tree! */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 582) tree->root = leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 583) tree->childnode = LEAF;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 584) } else if (parent->left == node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 585) parent->left = leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 586) parent->leftnode = LEAF;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 587) if (parent->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 588) parent->keymask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 589) parent->keybits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 590) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 591) parent->keymask |= (1 << node->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 592) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 593) } else if (parent->right == node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 594) parent->right = leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 595) parent->rightnode = LEAF;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 596) if (parent->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 597) parent->keymask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 598) parent->keybits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 599) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 600) parent->keymask |= (1 << node->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 601) parent->keybits |= (1 << node->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 602) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 603) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 604) /* internal tree error */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 605) assert(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 606) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 607) free(node);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 608) node = parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 609) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 610)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 611) /* Propagate keymasks up along singleton chains. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 612) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 613) parent = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 614) if (!parent)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 615) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 616) /* Nix the mask for parents with two children. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 617) if (node->keymask == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 618) parent->keymask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 619) parent->keybits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 620) } else if (parent->left && parent->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 621) parent->keymask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 622) parent->keybits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 623) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 624) assert((parent->keymask & node->keymask) == 0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 625) parent->keymask |= node->keymask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 626) parent->keymask |= (1 << parent->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 627) parent->keybits |= node->keybits;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 628) if (parent->right)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 629) parent->keybits |= (1 << parent->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 630) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 631) node = parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 632) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 633)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 634) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 635) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 636)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 637) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 638) * Prune internal nodes.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 639) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 640) * Fully populated subtrees that end at the same leaf have already
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 641) * been collapsed. There are still internal nodes that have for both
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 642) * their left and right branches a sequence of singletons that make
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 643) * identical choices and end in identical leaves. The keymask and
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 644) * keybits collected in the nodes describe the choices made in these
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 645) * singleton chains. When they are identical for the left and right
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 646) * branch of a node, and the two leaves comare identical, the node in
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 647) * question can be removed.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 648) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 649) * Note that nodes with the nextbyte tag set will not be removed by
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 650) * this to ensure tree integrity. Note as well that the structure of
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 651) * utf8 ensures that these nodes would not have been candidates for
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 652) * removal in any case.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 653) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 654) static void prune(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 655) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 656) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 657) struct node *left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 658) struct node *right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 659) struct node *parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 660) void *leftleaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 661) void *rightleaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 662) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 663) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 664) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 665) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 666)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 667) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 668) printf("Pruning %s_%x\n", tree->type, tree->maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 669)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 670) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 671) if (tree->childnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 672) return;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 673) if (!tree->root)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 674) return;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 675)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 676) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 677) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 678) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 679) if (node->nextbyte)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 680) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 681) if (node->leftnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 682) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 683) if (node->rightnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 684) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 685) if (!node->left)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 686) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 687) if (!node->right)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 688) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 689) left = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 690) right = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 691) if (left->keymask == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 692) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 693) if (right->keymask == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 694) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 695) if (left->keymask != right->keymask)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 696) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 697) if (left->keybits != right->keybits)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 698) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 699) leftleaf = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 700) while (!leftleaf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 701) assert(left->left || left->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 702) if (left->leftnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 703) leftleaf = left->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 704) else if (left->rightnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 705) leftleaf = left->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 706) else if (left->left)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 707) left = left->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 708) else if (left->right)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 709) left = left->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 710) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 711) assert(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 712) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 713) rightleaf = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 714) while (!rightleaf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 715) assert(right->left || right->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 716) if (right->leftnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 717) rightleaf = right->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 718) else if (right->rightnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 719) rightleaf = right->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 720) else if (right->left)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 721) right = right->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 722) else if (right->right)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 723) right = right->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 724) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 725) assert(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 726) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 727) if (! tree->leaf_equal(leftleaf, rightleaf))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 728) goto advance;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 729) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 730) * This node has identical singleton-only subtrees.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 731) * Remove it.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 732) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 733) parent = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 734) left = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 735) right = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 736) if (parent->left == node)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 737) parent->left = left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 738) else if (parent->right == node)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 739) parent->right = left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 740) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 741) assert(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 742) left->parent = parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 743) left->keymask |= (1 << node->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 744) node->left = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 745) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 746) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 747) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 748) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 749) if (node->leftnode == NODE && node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 750) left = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 751) free(node);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 752) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 753) node = left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 754) } else if (node->rightnode == NODE && node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 755) right = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 756) free(node);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 757) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 758) node = right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 759) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 760) node = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 761) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 762) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 763) /* Propagate keymasks up along singleton chains. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 764) node = parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 765) /* Force re-check */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 766) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 767) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 768) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 769) for (;;) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 770) if (node->left && node->right)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 771) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 772) if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 773) left = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 774) node->keymask |= left->keymask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 775) node->keybits |= left->keybits;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 776) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 777) if (node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 778) right = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 779) node->keymask |= right->keymask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 780) node->keybits |= right->keybits;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 781) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 782) node->keymask |= (1 << node->bitnum);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 783) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 784) /* Force re-check */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 785) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 786) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 787) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 788) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 789) advance:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 790) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 791) if ((leftmask & bitmask) == 0 &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 792) node->leftnode == NODE &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 793) node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 794) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 795) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 796) } else if ((rightmask & bitmask) == 0 &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 797) node->rightnode == NODE &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 798) node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 799) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 800) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 801) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 802) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 803) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 804) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 805) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 806) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 807) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 808) printf("Pruned %d nodes\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 809) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 810)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 811) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 812) * Mark the nodes in the tree that lead to leaves that must be
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 813) * emitted.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 814) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 815) static void mark_nodes(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 816) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 817) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 818) struct node *n;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 819) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 820) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 821) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 822) int marked;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 823)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 824) marked = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 825) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 826) printf("Marking %s_%x\n", tree->type, tree->maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 827) if (tree->childnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 828) goto done;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 829)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 830) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 831) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 832) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 833) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 834) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 835) if ((leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 836) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 837) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 838) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 839) if (tree->leaf_mark(node->left)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 840) n = node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 841) while (n && !n->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 842) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 843) n->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 844) n = n->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 845) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 846) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 847) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 848) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 849) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 850) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 851) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 852) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 853) if ((rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 854) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 855) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 856) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 857) if (tree->leaf_mark(node->right)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 858) n = node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 859) while (n && !n->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 860) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 861) n->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 862) n = n->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 863) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 864) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 865) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 866) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 867) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 868) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 869) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 870) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 871) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 872) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 873) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 874) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 875)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 876) /* second pass: left siblings and singletons */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 877)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 878) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 879) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 880) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 881) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 882) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 883) if ((leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 884) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 885) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 886) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 887) if (tree->leaf_mark(node->left)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 888) n = node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 889) while (n && !n->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 890) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 891) n->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 892) n = n->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 893) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 894) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 895) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 896) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 897) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 898) if (!node->mark && node->parent->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 899) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 900) node->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 901) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 902) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 903) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 904) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 905) if ((rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 906) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 907) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 908) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 909) if (tree->leaf_mark(node->right)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 910) n = node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 911) while (n && !n->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 912) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 913) n->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 914) n = n->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 915) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 916) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 917) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 918) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 919) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 920) if (!node->mark && node->parent->mark &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 921) !node->parent->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 922) marked++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 923) node->mark = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 924) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 925) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 926) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 927) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 928) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 929) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 930) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 931) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 932) done:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 933) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 934) printf("Marked %d nodes\n", marked);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 935) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 936)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 937) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 938) * Compute the index of each node and leaf, which is the offset in the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 939) * emitted trie. These values must be pre-computed because relative
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 940) * offsets between nodes are used to navigate the tree.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 941) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 942) static int index_nodes(struct tree *tree, int index)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 943) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 944) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 945) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 946) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 947) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 948) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 949) int indent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 950)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 951) /* Align to a cache line (or half a cache line?). */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 952) while (index % 64)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 953) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 954) tree->index = index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 955) indent = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 956) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 957)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 958) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 959) printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 960) if (tree->childnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 961) index += tree->leaf_size(tree->root);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 962) goto done;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 963) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 964)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 965) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 966) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 967) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 968) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 969) if (!node->mark)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 970) goto skip;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 971) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 972) if (node->index != index)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 973) node->index = index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 974) index += node->size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 975) skip:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 976) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 977) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 978) if (node->mark && (leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 979) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 980) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 981) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 982) *tree->leaf_index(tree, node->left) =
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 983) index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 984) index += tree->leaf_size(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 985) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 986) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 987) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 988) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 989) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 990) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 991) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 992) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 993) if (node->mark && (rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 994) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 995) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 996) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 997) *tree->leaf_index(tree, node->right) = index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 998) index += tree->leaf_size(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 999) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1000) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1001) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1002) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1003) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1004) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1005) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1006) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1007) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1008) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1009) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1010) indent -= 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1011) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1012) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1013) done:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1014) /* Round up to a multiple of 16 */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1015) while (index % 16)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1016) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1017) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1018) printf("Final index %d\n", index);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1019) return index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1020) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1021)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1022) /*
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1023) * Mark the nodes in a subtree, helper for size_nodes().
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1024) */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1025) static int mark_subtree(struct node *node)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1026) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1027) int changed;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1028)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1029) if (!node || node->mark)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1030) return 0;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1031) node->mark = 1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1032) node->index = node->parent->index;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1033) changed = 1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1034) if (node->leftnode == NODE)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1035) changed += mark_subtree(node->left);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1036) if (node->rightnode == NODE)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1037) changed += mark_subtree(node->right);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1038) return changed;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1039) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1040)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1041) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1042) * Compute the size of nodes and leaves. We start by assuming that
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1043) * each node needs to store a three-byte offset. The indexes of the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1044) * nodes are calculated based on that, and then this function is
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1045) * called to see if the sizes of some nodes can be reduced. This is
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1046) * repeated until no more changes are seen.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1047) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1048) static int size_nodes(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1049) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1050) struct tree *next;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1051) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1052) struct node *right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1053) struct node *n;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1054) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1055) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1056) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1057) unsigned int pathbits;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1058) unsigned int pathmask;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1059) unsigned int nbit;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1060) int changed;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1061) int offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1062) int size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1063) int indent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1064)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1065) indent = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1066) changed = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1067) size = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1068)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1069) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1070) printf("Sizing %s_%x\n", tree->type, tree->maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1071) if (tree->childnode == LEAF)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1072) goto done;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1073)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1074) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1075) pathbits = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1076) pathmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1077) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1078) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1079) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1080) if (!node->mark)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1081) goto skip;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1082) offset = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1083) if (!node->left || !node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1084) size = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1085) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1086) if (node->rightnode == NODE) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1087) /*
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1088) * If the right node is not marked,
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1089) * look for a corresponding node in
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1090) * the next tree. Such a node need
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1091) * not exist.
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1092) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1093) right = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1094) next = tree->next;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1095) while (!right->mark) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1096) assert(next);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1097) n = next->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1098) while (n->bitnum != node->bitnum) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1099) nbit = 1 << n->bitnum;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1100) if (!(pathmask & nbit))
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1101) break;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1102) if (pathbits & nbit) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1103) if (n->rightnode == LEAF)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1104) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1105) n = n->right;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1106) } else {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1107) if (n->leftnode == LEAF)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1108) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1109) n = n->left;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1110) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1111) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1112) if (n->bitnum != node->bitnum)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1113) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1114) n = n->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1115) right = n;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1116) next = next->next;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1117) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1118) /* Make sure the right node is marked. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1119) if (!right->mark)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1120) changed += mark_subtree(right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1121) offset = right->index - node->index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1122) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1123) offset = *tree->leaf_index(tree, node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1124) offset -= node->index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1125) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1126) assert(offset >= 0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1127) assert(offset <= 0xffffff);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1128) if (offset <= 0xff) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1129) size = 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1130) } else if (offset <= 0xffff) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1131) size = 3;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1132) } else { /* offset <= 0xffffff */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1133) size = 4;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1134) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1135) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1136) if (node->size != size || node->offset != offset) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1137) node->size = size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1138) node->offset = offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1139) changed++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1140) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1141) skip:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1142) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1143) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1144) pathmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1145) if (node->mark && (leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1146) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1147) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1148) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1149) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1150) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1151) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1152) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1153) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1154) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1155) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1156) if (node->mark && (rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1157) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1158) pathbits |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1159) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1160) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1161) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1162) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1163) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1164) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1165) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1166) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1167) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1168) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1169) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1170) pathmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1171) pathbits &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1172) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1173) indent -= 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1174) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1175) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1176) done:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1177) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1178) printf("Found %d changes\n", changed);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1179) return changed;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1180) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1181)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1182) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1183) * Emit a trie for the given tree into the data array.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1184) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1185) static void emit(struct tree *tree, unsigned char *data)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1186) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1187) struct node *node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1188) unsigned int leftmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1189) unsigned int rightmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1190) unsigned int bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1191) int offlen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1192) int offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1193) int index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1194) int indent;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1195) int size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1196) int bytes;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1197) int leaves;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1198) int nodes[4];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1199) unsigned char byte;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1200)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1201) nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1202) leaves = 0;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1203) bytes = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1204) index = tree->index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1205) data += index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1206) indent = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1207) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1208) printf("Emitting %s_%x\n", tree->type, tree->maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1209) if (tree->childnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1210) assert(tree->root);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1211) tree->leaf_emit(tree->root, data);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1212) size = tree->leaf_size(tree->root);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1213) index += size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1214) leaves++;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1215) goto done;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1216) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1217)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1218) assert(tree->childnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1219) node = tree->root;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1220) leftmask = rightmask = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1221) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1222) if (!node->mark)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1223) goto skip;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1224) assert(node->offset != -1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1225) assert(node->index == index);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1226)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1227) byte = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1228) if (node->nextbyte)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1229) byte |= NEXTBYTE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1230) byte |= (node->bitnum & BITNUM);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1231) if (node->left && node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1232) if (node->leftnode == NODE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1233) byte |= LEFTNODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1234) if (node->rightnode == NODE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1235) byte |= RIGHTNODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1236) if (node->offset <= 0xff)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1237) offlen = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1238) else if (node->offset <= 0xffff)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1239) offlen = 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1240) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1241) offlen = 3;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1242) nodes[offlen]++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1243) offset = node->offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1244) byte |= offlen << OFFLEN_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1245) *data++ = byte;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1246) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1247) while (offlen--) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1248) *data++ = offset & 0xff;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1249) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1250) offset >>= 8;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1251) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1252) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1253) if (node->leftnode == NODE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1254) byte |= TRIENODE;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1255) nodes[0]++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1256) *data++ = byte;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1257) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1258) } else if (node->right) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1259) byte |= RIGHTNODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1260) if (node->rightnode == NODE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1261) byte |= TRIENODE;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1262) nodes[0]++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1263) *data++ = byte;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1264) index++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1265) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1266) assert(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1267) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1268) skip:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1269) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1270) bitmask = 1 << node->bitnum;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1271) if (node->mark && (leftmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1272) leftmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1273) if (node->leftnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1274) assert(node->left);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1275) data = tree->leaf_emit(node->left,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1276) data);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1277) size = tree->leaf_size(node->left);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1278) index += size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1279) bytes += size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1280) leaves++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1281) } else if (node->left) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1282) assert(node->leftnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1283) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1284) node = node->left;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1285) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1286) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1287) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1288) if (node->mark && (rightmask & bitmask) == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1289) rightmask |= bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1290) if (node->rightnode == LEAF) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1291) assert(node->right);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1292) data = tree->leaf_emit(node->right,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1293) data);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1294) size = tree->leaf_size(node->right);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1295) index += size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1296) bytes += size;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1297) leaves++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1298) } else if (node->right) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1299) assert(node->rightnode == NODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1300) indent += 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1301) node = node->right;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1302) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1303) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1304) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1305) leftmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1306) rightmask &= ~bitmask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1307) node = node->parent;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1308) indent -= 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1309) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1310) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1311) done:
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1312) if (verbose > 0) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1313) printf("Emitted %d (%d) leaves",
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1314) leaves, bytes);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1315) printf(" %d (%d+%d+%d+%d) nodes",
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1316) nodes[0] + nodes[1] + nodes[2] + nodes[3],
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1317) nodes[0], nodes[1], nodes[2], nodes[3]);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1318) printf(" %d total\n", index - tree->index);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1319) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1320) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1321)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1322) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1323)
/*
 * Unicode data.
 *
 * We need to keep track of the Canonical Combining Class, the Age,
 * and decompositions for a code point.
 *
 * For the Age, we store the index into the ages table. Effectively
 * this is a generation number that the table maps to a unicode
 * version.
 *
 * The correction field is used to indicate that this entry is in the
 * corrections array, which contains decompositions that were
 * corrected in later revisions. The value of the correction field is
 * the Unicode version in which the mapping was corrected.
 */
struct unicode_data {
	/* Code point; the key that corrections_lookup() matches on. */
	unsigned int code;
	/* Canonical Combining Class. */
	int ccc;
	/* Age, stored as an index into the ages table. */
	int gen;
	/* Unicode version in which the mapping was corrected (see above). */
	int correction;
	/*
	 * NOTE(review): presumably the decomposition, and the decomposition
	 * with case folding, as UTF-32 sequences; they are filled in
	 * elsewhere in the file -- confirm there.
	 */
	unsigned int *utf32nfdi;
	unsigned int *utf32nfdicf;
	/*
	 * C strings, or NULL when absent.  They are compared with strcmp()
	 * by nfdi_equal()/nfdicf_equal() and printed by the *_print()
	 * helpers.  The first byte of utf8nfdi may be the HANGUL marker
	 * instead of ordinary text.
	 */
	char *utf8nfdi;
	char *utf8nfdicf;
};
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1349)
/*
 * Table of 0x110000 entries; presumably one per code point and indexed
 * by code point -- TODO confirm against the code that fills it.
 */
struct unicode_data unicode_data[0x110000];
/* Array of corrections_count entries, searched by corrections_lookup(). */
struct unicode_data *corrections;
int corrections_count;

/*
 * NOTE(review): the tree pointers below are not used in this part of
 * the file; presumably they refer to the nfdi and nfdicf tries that
 * are built and emitted elsewhere -- confirm at the point of use.
 */
struct tree *nfdi_tree;
struct tree *nfdicf_tree;

/* NOTE(review): presumably an array of trees_count trees -- confirm. */
struct tree *trees;
int trees_count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1359)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1360) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1361) * Check the corrections array to see if this entry was corrected at
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1362) * some point.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1363) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1364) static struct unicode_data *corrections_lookup(struct unicode_data *u)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1365) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1366) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1367)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1368) for (i = 0; i != corrections_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1369) if (u->code == corrections[i].code)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1370) return &corrections[i];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1371) return u;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1372) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1373)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1374) static int nfdi_equal(void *l, void *r)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1375) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1376) struct unicode_data *left = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1377) struct unicode_data *right = r;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1378)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1379) if (left->gen != right->gen)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1380) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1381) if (left->ccc != right->ccc)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1382) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1383) if (left->utf8nfdi && right->utf8nfdi &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1384) strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1385) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1386) if (left->utf8nfdi || right->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1387) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1388) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1389) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1390)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1391) static int nfdicf_equal(void *l, void *r)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1392) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1393) struct unicode_data *left = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1394) struct unicode_data *right = r;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1395)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1396) if (left->gen != right->gen)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1397) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1398) if (left->ccc != right->ccc)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1399) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1400) if (left->utf8nfdicf && right->utf8nfdicf &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1401) strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1402) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1403) if (left->utf8nfdicf && right->utf8nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1404) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1405) if (left->utf8nfdicf || right->utf8nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1406) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1407) if (left->utf8nfdi && right->utf8nfdi &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1408) strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1409) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1410) if (left->utf8nfdi || right->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1411) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1412) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1413) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1414)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1415) static void nfdi_print(void *l, int indent)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1416) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1417) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1418)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1419) printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1420) leaf->code, leaf->ccc, leaf->gen);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1421)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1422) if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1423) printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1424) else if (leaf->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1425) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1426)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1427) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1428) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1429)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1430) static void nfdicf_print(void *l, int indent)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1431) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1432) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1433)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1434) printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1435) leaf->code, leaf->ccc, leaf->gen);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1436)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1437) if (leaf->utf8nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1438) printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1439) else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1440) printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1441) else if (leaf->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1442) printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1443) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1444) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1445)
/*
 * leaf_mark callout that accepts every leaf.
 *
 * @l: the leaf; it is not examined.
 *
 * Returns 1 unconditionally.
 */
static int nfdi_mark(void *l)
{
	(void)l;

	return 1;
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1450)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1451) static int nfdicf_mark(void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1452) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1453) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1454)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1455) if (leaf->utf8nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1456) return 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1457) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1458) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1459)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1460) static int correction_mark(void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1461) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1462) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1463)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1464) return leaf->correction;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1465) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1466)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1467) static int nfdi_size(void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1468) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1469) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1470) int size = 2;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1471)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1472) if (HANGUL_SYLLABLE(leaf->code))
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1473) size += 1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1474) else if (leaf->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1475) size += strlen(leaf->utf8nfdi) + 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1476) return size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1477) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1478)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1479) static int nfdicf_size(void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1480) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1481) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1482) int size = 2;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1483)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1484) if (HANGUL_SYLLABLE(leaf->code))
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1485) size += 1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1486) else if (leaf->utf8nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1487) size += strlen(leaf->utf8nfdicf) + 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1488) else if (leaf->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1489) size += strlen(leaf->utf8nfdi) + 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1490) return size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1491) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1492)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1493) static int *nfdi_index(struct tree *tree, void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1494) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1495) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1496)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1497) return &tree->leafindex[leaf->code];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1498) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1499)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1500) static int *nfdicf_index(struct tree *tree, void *l)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1501) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1502) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1503)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1504) return &tree->leafindex[leaf->code];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1505) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1506)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1507) static unsigned char *nfdi_emit(void *l, unsigned char *data)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1508) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1509) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1510) unsigned char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1511)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1512) *data++ = leaf->gen;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1513)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1514) if (HANGUL_SYLLABLE(leaf->code)) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1515) *data++ = DECOMPOSE;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1516) *data++ = HANGUL;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1517) } else if (leaf->utf8nfdi) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1518) *data++ = DECOMPOSE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1519) s = (unsigned char*)leaf->utf8nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1520) while ((*data++ = *s++) != 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1521) ;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1522) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1523) *data++ = leaf->ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1524) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1525) return data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1526) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1527)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1528) static unsigned char *nfdicf_emit(void *l, unsigned char *data)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1529) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1530) struct unicode_data *leaf = l;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1531) unsigned char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1532)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1533) *data++ = leaf->gen;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1534)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1535) if (HANGUL_SYLLABLE(leaf->code)) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1536) *data++ = DECOMPOSE;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1537) *data++ = HANGUL;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1538) } else if (leaf->utf8nfdicf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1539) *data++ = DECOMPOSE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1540) s = (unsigned char*)leaf->utf8nfdicf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1541) while ((*data++ = *s++) != 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1542) ;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1543) } else if (leaf->utf8nfdi) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1544) *data++ = DECOMPOSE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1545) s = (unsigned char*)leaf->utf8nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1546) while ((*data++ = *s++) != 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1547) ;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1548) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1549) *data++ = leaf->ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1550) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1551) return data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1552) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1553)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1554) static void utf8_create(struct unicode_data *data)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1555) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1556) char utf[18*4+1];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1557) char *u;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1558) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1559) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1560)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1561) if (data->utf8nfdi) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1562) assert(data->utf8nfdi[0] == HANGUL);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1563) return;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1564) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1565)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1566) u = utf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1567) um = data->utf32nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1568) if (um) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1569) for (i = 0; um[i]; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1570) u += utf8encode(u, um[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1571) *u = '\0';
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1572) data->utf8nfdi = strdup(utf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1573) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1574) u = utf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1575) um = data->utf32nfdicf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1576) if (um) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1577) for (i = 0; um[i]; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1578) u += utf8encode(u, um[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1579) *u = '\0';
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1580) if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1581) data->utf8nfdicf = strdup(utf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1582) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1583) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1584)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1585) static void utf8_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1586) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1587) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1588) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1589)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1590) for (unichar = 0; unichar != 0x110000; unichar++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1591) utf8_create(&unicode_data[unichar]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1592)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1593) for (i = 0; i != corrections_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1594) utf8_create(&corrections[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1595) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1596)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1597) static void trees_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1598) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1599) struct unicode_data *data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1600) unsigned int maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1601) unsigned int nextage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1602) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1603) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1604) int j;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1605)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1606) /* Count the number of different ages. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1607) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1608) nextage = (unsigned int)-1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1609) do {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1610) maxage = nextage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1611) nextage = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1612) for (i = 0; i <= corrections_count; i++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1613) data = &corrections[i];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1614) if (nextage < data->correction &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1615) data->correction < maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1616) nextage = data->correction;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1617) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1618) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1619) } while (nextage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1620)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1621) /* Two trees per age: nfdi and nfdicf */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1622) trees_count = count * 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1623) trees = calloc(trees_count, sizeof(struct tree));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1624)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1625) /* Assign ages to the trees. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1626) count = trees_count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1627) nextage = (unsigned int)-1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1628) do {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1629) maxage = nextage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1630) trees[--count].maxage = maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1631) trees[--count].maxage = maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1632) nextage = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1633) for (i = 0; i <= corrections_count; i++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1634) data = &corrections[i];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1635) if (nextage < data->correction &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1636) data->correction < maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1637) nextage = data->correction;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1638) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1639) } while (nextage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1640)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1641) /* The ages assigned above are off by one. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1642) for (i = 0; i != trees_count; i++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1643) j = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1644) while (ages[j] < trees[i].maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1645) j++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1646) trees[i].maxage = ages[j-1];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1647) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1648)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1649) /* Set up the forwarding between trees. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1650) trees[trees_count-2].next = &trees[trees_count-1];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1651) trees[trees_count-1].leaf_mark = nfdi_mark;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1652) trees[trees_count-2].leaf_mark = nfdicf_mark;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1653) for (i = 0; i != trees_count-2; i += 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1654) trees[i].next = &trees[trees_count-2];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1655) trees[i].leaf_mark = correction_mark;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1656) trees[i+1].next = &trees[trees_count-1];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1657) trees[i+1].leaf_mark = correction_mark;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1658) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1659)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1660) /* Assign the callouts. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1661) for (i = 0; i != trees_count; i += 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1662) trees[i].type = "nfdicf";
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1663) trees[i].leaf_equal = nfdicf_equal;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1664) trees[i].leaf_print = nfdicf_print;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1665) trees[i].leaf_size = nfdicf_size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1666) trees[i].leaf_index = nfdicf_index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1667) trees[i].leaf_emit = nfdicf_emit;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1668)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1669) trees[i+1].type = "nfdi";
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1670) trees[i+1].leaf_equal = nfdi_equal;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1671) trees[i+1].leaf_print = nfdi_print;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1672) trees[i+1].leaf_size = nfdi_size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1673) trees[i+1].leaf_index = nfdi_index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1674) trees[i+1].leaf_emit = nfdi_emit;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1675) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1676)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1677) /* Finish init. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1678) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1679) trees[i].childnode = NODE;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1680) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1681)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1682) static void trees_populate(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1683) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1684) struct unicode_data *data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1685) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1686) char keyval[4];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1687) int keylen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1688) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1689)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1690) for (i = 0; i != trees_count; i++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1691) if (verbose > 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1692) printf("Populating %s_%x\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1693) trees[i].type, trees[i].maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1694) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1695) for (unichar = 0; unichar != 0x110000; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1696) if (unicode_data[unichar].gen < 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1697) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1698) keylen = utf8encode(keyval, unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1699) data = corrections_lookup(&unicode_data[unichar]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1700) if (data->correction <= trees[i].maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1701) data = &unicode_data[unichar];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1702) insert(&trees[i], keyval, keylen, data);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1703) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1704) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1705) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1706)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1707) static void trees_reduce(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1708) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1709) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1710) int size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1711) int changed;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1712)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1713) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1714) prune(&trees[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1715) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1716) mark_nodes(&trees[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1717) do {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1718) size = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1719) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1720) size = index_nodes(&trees[i], size);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1721) changed = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1722) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1723) changed += size_nodes(&trees[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1724) } while (changed);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1725)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1726) utf8data = calloc(size, 1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1727) utf8data_size = size;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1728) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1729) emit(&trees[i], utf8data);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1730)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1731) if (verbose > 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1732) for (i = 0; i != trees_count; i++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1733) printf("%s_%x idx %d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1734) trees[i].type, trees[i].maxage, trees[i].index);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1735) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1736) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1737)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1738) nfdi = utf8data + trees[trees_count-1].index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1739) nfdicf = utf8data + trees[trees_count-2].index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1740)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1741) nfdi_tree = &trees[trees_count-1];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1742) nfdicf_tree = &trees[trees_count-2];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1743) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1744)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1745) static void verify(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1746) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1747) struct unicode_data *data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1748) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1749) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1750) char key[4];
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1751) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1752) int report;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1753) int nocf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1754)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1755) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1756) printf("Verifying %s_%x\n", tree->type, tree->maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1757) nocf = strcmp(tree->type, "nfdicf");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1758)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1759) for (unichar = 0; unichar != 0x110000; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1760) report = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1761) data = corrections_lookup(&unicode_data[unichar]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1762) if (data->correction <= tree->maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1763) data = &unicode_data[unichar];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1764) utf8encode(key,unichar);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1765) leaf = utf8lookup(tree, hangul, key);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1766)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1767) if (!leaf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1768) if (data->gen != -1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1769) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1770) if (unichar < 0xd800 || unichar > 0xdfff)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1771) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1772) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1773) if (unichar >= 0xd800 && unichar <= 0xdfff)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1774) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1775) if (data->gen == -1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1776) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1777) if (data->gen != LEAF_GEN(leaf))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1778) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1779) if (LEAF_CCC(leaf) == DECOMPOSE) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1780) if (HANGUL_SYLLABLE(data->code)) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1781) if (data->utf8nfdi[0] != HANGUL)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1782) report++;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 1783) } else if (nocf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1784) if (!data->utf8nfdi) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1785) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1786) } else if (strcmp(data->utf8nfdi,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1787) LEAF_STR(leaf))) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1788) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1789) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1790) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1791) if (!data->utf8nfdicf &&
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1792) !data->utf8nfdi) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1793) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1794) } else if (data->utf8nfdicf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1795) if (strcmp(data->utf8nfdicf,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1796) LEAF_STR(leaf)))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1797) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1798) } else if (strcmp(data->utf8nfdi,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1799) LEAF_STR(leaf))) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1800) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1801) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1802) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1803) } else if (data->ccc != LEAF_CCC(leaf)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1804) report++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1805) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1806) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1807) if (report) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1808) printf("%X code %X gen %d ccc %d"
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1809) " nfdi -> \"%s\"",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1810) unichar, data->code, data->gen,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1811) data->ccc,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1812) data->utf8nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1813) if (leaf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1814) printf(" gen %d ccc %d"
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1815) " nfdi -> \"%s\"",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1816) LEAF_GEN(leaf),
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1817) LEAF_CCC(leaf),
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1818) LEAF_CCC(leaf) == DECOMPOSE ?
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1819) LEAF_STR(leaf) : "");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1820) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1821) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1822) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1823) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1824) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1825)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1826) static void trees_verify(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1827) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1828) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1829)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1830) for (i = 0; i != trees_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1831) verify(&trees[i]);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1832) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1833)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1834) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1835)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1836) static void help(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1837) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1838) printf("Usage: %s [options]\n", argv0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1839) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1840) printf("This program creates an a data trie used for parsing and\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1841) printf("normalization of UTF-8 strings. The trie is derived from\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1842) printf("a set of input files from the Unicode character database\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1843) printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1844) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1845) printf("The generated tree supports two normalization forms:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1846) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1847) printf("\tnfdi:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1848) printf("\t- Apply unicode normalization form NFD.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1849) printf("\t- Remove any Default_Ignorable_Code_Point.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1850) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1851) printf("\tnfdicf:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1852) printf("\t- Apply unicode normalization form NFD.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1853) printf("\t- Remove any Default_Ignorable_Code_Point.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1854) printf("\t- Apply a full casefold (C + F).\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1855) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1856) printf("These forms were chosen as being most useful when dealing\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1857) printf("with file names: NFD catches most cases where characters\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1858) printf("should be considered equivalent. The ignorables are mostly\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1859) printf("invisible, making names hard to type.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1860) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1861) printf("The options to specify the files to be used are listed\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1862) printf("below with their default values, which are the names used\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1863) printf("by version 11.0.0 of the Unicode Character Database.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1864) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1865) printf("The input files:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1866) printf("\t-a %s\n", AGE_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1867) printf("\t-c %s\n", CCC_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1868) printf("\t-p %s\n", PROP_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1869) printf("\t-d %s\n", DATA_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1870) printf("\t-f %s\n", FOLD_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1871) printf("\t-n %s\n", NORM_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1872) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1873) printf("Additionally, the generated tables are tested using:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1874) printf("\t-t %s\n", TEST_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1875) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1876) printf("Finally, the output file:\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1877) printf("\t-o %s\n", UTF8_NAME);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1878) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1879) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1880)
/* Show the help text, then terminate the program with exit status 1. */
static void usage(void)
{
	help();

	exit(1);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1886)
/*
 * Report a failure to open the file called name and exit with status 1.
 * error is printed both as a number and as its strerror() text.
 */
static void open_fail(const char *name, int error)
{
	const char *reason = strerror(error);

	printf("Error %d opening %s: %s\n", error, name, reason);
	exit(1);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1892)
/* Report that filename could not be parsed and exit with status 1. */
static void file_fail(const char *filename)
{
	printf("Error parsing %s\n",
	       filename);

	exit(1);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1898)
/*
 * Report a parse error in filename, echoing the offending line after
 * a colon, and exit with status 1.
 */
static void line_fail(const char *filename, const char *line)
{
	printf("Error parsing %s:%s\n",
	       filename, line);

	exit(1);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1904)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1905) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1906)
/*
 * Print each element of the zero-terminated array utf32str as
 * uppercase hex, each preceded by a space. The terminating zero is
 * not printed.
 */
static void print_utf32(unsigned int *utf32str)
{
	unsigned int *cp;

	for (cp = utf32str; *cp != 0; cp++)
		printf(" %X", *cp);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1914)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1915) static void print_utf32nfdi(unsigned int unichar)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1916) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1917) printf(" %X ->", unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1918) print_utf32(unicode_data[unichar].utf32nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1919) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1920) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1921)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1922) static void print_utf32nfdicf(unsigned int unichar)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1923) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1924) printf(" %X ->", unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1925) print_utf32(unicode_data[unichar].utf32nfdicf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1926) printf("\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1927) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1928)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1929) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1930)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1931) static void age_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1932) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1933) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1934) unsigned int first;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1935) unsigned int last;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1936) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1937) unsigned int major;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1938) unsigned int minor;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1939) unsigned int revision;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1940) int gen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1941) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1942) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1943)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1944) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1945) printf("Parsing %s\n", age_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1946)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1947) file = fopen(age_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1948) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1949) open_fail(age_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1950) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1951)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1952) gen = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1953) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1954) ret = sscanf(line, "# Age=V%d_%d_%d",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1955) &major, &minor, &revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1956) if (ret == 3) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1957) ages_count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1958) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1959) printf(" Age V%d_%d_%d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1960) major, minor, revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1961) if (!age_valid(major, minor, revision))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1962) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1963) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1964) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1965) ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1966) if (ret == 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1967) ages_count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1968) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1969) printf(" Age V%d_%d\n", major, minor);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1970) if (!age_valid(major, minor, 0))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1971) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1972) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1973) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1974) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1975)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1976) /* We must have found something above. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1977) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1978) printf("%d age entries\n", ages_count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1979) if (ages_count == 0 || ages_count > MAXGEN)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1980) file_fail(age_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1981)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1982) /* There is a 0 entry. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1983) ages_count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1984) ages = calloc(ages_count + 1, sizeof(*ages));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1985) /* And a guard entry. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1986) ages[ages_count] = (unsigned int)-1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1987)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1988) rewind(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1989) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1990) gen = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1991) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1992) ret = sscanf(line, "# Age=V%d_%d_%d",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1993) &major, &minor, &revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1994) if (ret == 3) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1995) ages[++gen] =
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1996) UNICODE_AGE(major, minor, revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1997) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1998) printf(" Age V%d_%d_%d = gen %d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 1999) major, minor, revision, gen);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2000) if (!age_valid(major, minor, revision))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2001) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2002) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2003) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2004) ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2005) if (ret == 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2006) ages[++gen] = UNICODE_AGE(major, minor, 0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2007) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2008) printf(" Age V%d_%d = %d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2009) major, minor, gen);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2010) if (!age_valid(major, minor, 0))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2011) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2012) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2013) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2014) ret = sscanf(line, "%X..%X ; %d.%d #",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2015) &first, &last, &major, &minor);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2016) if (ret == 4) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2017) for (unichar = first; unichar <= last; unichar++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2018) unicode_data[unichar].gen = gen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2019) count += 1 + last - first;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2020) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2021) printf(" %X..%X gen %d\n", first, last, gen);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2022) if (!utf32valid(first) || !utf32valid(last))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2023) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2024) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2025) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2026) ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2027) if (ret == 3) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2028) unicode_data[unichar].gen = gen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2029) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2030) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2031) printf(" %X gen %d\n", unichar, gen);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2032) if (!utf32valid(unichar))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2033) line_fail(age_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2034) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2035) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2036) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2037) unicode_maxage = ages[gen];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2038) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2039)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2040) /* Nix surrogate block */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2041) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2042) printf(" Removing surrogate block D800..DFFF\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2043) for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2044) unicode_data[unichar].gen = -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2045)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2046) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2047) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2048) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2049) file_fail(age_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2050) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2051)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2052) static void ccc_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2053) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2054) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2055) unsigned int first;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2056) unsigned int last;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2057) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2058) unsigned int value;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2059) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2060) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2061)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2062) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2063) printf("Parsing %s\n", ccc_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2064)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2065) file = fopen(ccc_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2066) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2067) open_fail(ccc_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2068)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2069) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2070) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2071) ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2072) if (ret == 3) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2073) for (unichar = first; unichar <= last; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2074) unicode_data[unichar].ccc = value;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2075) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2076) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2077) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2078) printf(" %X..%X ccc %d\n", first, last, value);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2079) if (!utf32valid(first) || !utf32valid(last))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2080) line_fail(ccc_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2081) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2082) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2083) ret = sscanf(line, "%X ; %d #", &unichar, &value);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2084) if (ret == 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2085) unicode_data[unichar].ccc = value;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2086) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2087) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2088) printf(" %X ccc %d\n", unichar, value);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2089) if (!utf32valid(unichar))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2090) line_fail(ccc_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2091) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2092) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2093) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2094) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2095)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2096) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2097) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2098) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2099) file_fail(ccc_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2100) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2101)
/*
 * ignore_compatibility_form - test a decomposition tag against the
 * list of tags that are to be ignored.
 *
 * @type: NUL-terminated tag name, without the surrounding '<' and '>'.
 *
 * Returns 1 when @type exactly matches (case-sensitively) one of the
 * names in the table below, 0 otherwise.
 */
static int ignore_compatibility_form(char *type)
{
	/*
	 * Built once, read-only, and terminated by NULL so the walk
	 * below needs no separate element count.
	 */
	static const char *const ignored_types[] = {
		"font", "noBreak", "initial", "medial",
		"final", "isolated", "circle", "super",
		"sub", "vertical", "wide", "narrow",
		"small", "square", "fraction", "compat",
		NULL
	};
	const char *const *entry;

	for (entry = ignored_types; *entry; entry++)
		if (strcmp(type, *entry) == 0)
			return 1;
	return 0;
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2115)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2116) static void nfdi_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2117) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2118) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2119) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2120) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2121) char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2122) char *type;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2123) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2124) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2125) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2126) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2127)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2128) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2129) printf("Parsing %s\n", data_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2130) file = fopen(data_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2131) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2132) open_fail(data_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2133)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2134) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2135) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2136) ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2137) &unichar, buf0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2138) if (ret != 2)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2139) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2140) if (!utf32valid(unichar))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2141) line_fail(data_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2142)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2143) s = buf0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2144) /* skip over <tag> */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2145) if (*s == '<') {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2146) type = ++s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2147) while (*++s != '>');
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2148) *s++ = '\0';
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2149) if(ignore_compatibility_form(type))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2150) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2151) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2152) /* decode the decomposition into UTF-32 */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2153) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2154) while (*s) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2155) mapping[i] = strtoul(s, &s, 16);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2156) if (!utf32valid(mapping[i]))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2157) line_fail(data_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2158) i++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2159) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2160) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2161)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2162) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2163) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2164) unicode_data[unichar].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2165)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2166) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2167) print_utf32nfdi(unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2168) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2169) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2170) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2171) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2172) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2173) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2174) file_fail(data_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2175) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2176)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2177) static void nfdicf_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2178) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2179) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2180) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2181) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2182) char status;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2183) char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2184) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2185) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2186) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2187) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2188)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2189) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2190) printf("Parsing %s\n", fold_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2191) file = fopen(fold_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2192) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2193) open_fail(fold_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2194)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2195) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2196) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2197) ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2198) if (ret != 3)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2199) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2200) if (!utf32valid(unichar))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2201) line_fail(fold_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2202) /* Use the C+F casefold. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2203) if (status != 'C' && status != 'F')
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2204) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2205) s = buf0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2206) if (*s == '<')
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2207) while (*s++ != ' ')
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2208) ;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2209) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2210) while (*s) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2211) mapping[i] = strtoul(s, &s, 16);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2212) if (!utf32valid(mapping[i]))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2213) line_fail(fold_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2214) i++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2215) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2216) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2217)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2218) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2219) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2220) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2221)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2222) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2223) print_utf32nfdicf(unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2224) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2225) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2226) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2227) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2228) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2229) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2230) file_fail(fold_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2231) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2232)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2233) static void ignore_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2234) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2235) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2236) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2237) unsigned int first;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2238) unsigned int last;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2239) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2240) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2241) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2242)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2243) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2244) printf("Parsing %s\n", prop_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2245) file = fopen(prop_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2246) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2247) open_fail(prop_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2248) assert(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2249) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2250) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2251) ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2252) if (ret == 3) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2253) if (strcmp(buf0, "Default_Ignorable_Code_Point"))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2254) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2255) if (!utf32valid(first) || !utf32valid(last))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2256) line_fail(prop_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2257) for (unichar = first; unichar <= last; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2258) free(unicode_data[unichar].utf32nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2259) um = malloc(sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2260) *um = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2261) unicode_data[unichar].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2262) free(unicode_data[unichar].utf32nfdicf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2263) um = malloc(sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2264) *um = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2265) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2266) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2267) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2268) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2269) printf(" %X..%X Default_Ignorable_Code_Point\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2270) first, last);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2271) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2272) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2273) ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2274) if (ret == 2) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2275) if (strcmp(buf0, "Default_Ignorable_Code_Point"))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2276) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2277) if (!utf32valid(unichar))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2278) line_fail(prop_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2279) free(unicode_data[unichar].utf32nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2280) um = malloc(sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2281) *um = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2282) unicode_data[unichar].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2283) free(unicode_data[unichar].utf32nfdicf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2284) um = malloc(sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2285) *um = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2286) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2287) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2288) printf(" %X Default_Ignorable_Code_Point\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2289) unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2290) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2291) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2292) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2293) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2294) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2295)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2296) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2297) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2298) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2299) file_fail(prop_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2300) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2301)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2302) static void corrections_init(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2303) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2304) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2305) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2306) unsigned int major;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2307) unsigned int minor;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2308) unsigned int revision;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2309) unsigned int age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2310) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2311) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2312) char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2313) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2314) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2315) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2316)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2317) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2318) printf("Parsing %s\n", norm_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2319) file = fopen(norm_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2320) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2321) open_fail(norm_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2322)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2323) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2324) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2325) ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2326) &unichar, buf0, buf1,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2327) &major, &minor, &revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2328) if (ret != 6)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2329) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2330) if (!utf32valid(unichar) || !age_valid(major, minor, revision))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2331) line_fail(norm_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2332) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2333) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2334) corrections = calloc(count, sizeof(struct unicode_data));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2335) corrections_count = count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2336) rewind(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2337)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2338) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2339) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2340) ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2341) &unichar, buf0, buf1,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2342) &major, &minor, &revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2343) if (ret != 6)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2344) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2345) if (!utf32valid(unichar) || !age_valid(major, minor, revision))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2346) line_fail(norm_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2347) corrections[count] = unicode_data[unichar];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2348) assert(corrections[count].code == unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2349) age = UNICODE_AGE(major, minor, revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2350) corrections[count].correction = age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2351)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2352) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2353) s = buf0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2354) while (*s) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2355) mapping[i] = strtoul(s, &s, 16);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2356) if (!utf32valid(mapping[i]))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2357) line_fail(norm_name, line);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2358) i++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2359) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2360) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2361)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2362) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2363) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2364) corrections[count].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2365)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2366) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2367) printf(" %X -> %s -> %s V%d_%d_%d\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2368) unichar, buf0, buf1, major, minor, revision);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2369) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2370) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2371) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2372)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2373) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2374) printf("Found %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2375) if (count == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2376) file_fail(norm_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2377) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2378)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2379) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2380)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2381) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2382) * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2383) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2384) * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2385) * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2386) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2387) * SBase = 0xAC00
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2388) * LBase = 0x1100
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2389) * VBase = 0x1161
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2390) * TBase = 0x11A7
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2391) * LCount = 19
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2392) * VCount = 21
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2393) * TCount = 28
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2394) * NCount = 588 (VCount * TCount)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2395) * SCount = 11172 (LCount * NCount)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2396) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2397) * Decomposition:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2398) * SIndex = s - SBase
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2399) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2400) * LV (Canonical/Full)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2401) * LIndex = SIndex / NCount
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2402) * VIndex = (SIndex % NCount) / TCount
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2403) * LPart = LBase + LIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2404) * VPart = VBase + VIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2405) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2406) * LVT (Canonical)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2407) * LVIndex = (SIndex / TCount) * TCount
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2408) * TIndex = (SIndex % TCount)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2409) * LVPart = SBase + LVIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2410) * TPart = TBase + TIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2411) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2412) * LVT (Full)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2413) * LIndex = SIndex / NCount
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2414) * VIndex = (SIndex % NCount) / TCount
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2415) * TIndex = (SIndex % TCount)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2416) * LPart = LBase + LIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2417) * VPart = VBase + VIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2418) * if (TIndex == 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2419) * d = <LPart, VPart>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2420) * } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2421) * TPart = TBase + TIndex
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2422) * d = <LPart, VPart, TPart>
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2423) * }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2424) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2425) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2426)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2427) static void hangul_decompose(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2428) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2429) unsigned int sb = 0xAC00;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2430) unsigned int lb = 0x1100;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2431) unsigned int vb = 0x1161;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2432) unsigned int tb = 0x11a7;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2433) /* unsigned int lc = 19; */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2434) unsigned int vc = 21;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2435) unsigned int tc = 28;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2436) unsigned int nc = (vc * tc);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2437) /* unsigned int sc = (lc * nc); */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2438) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2439) unsigned int mapping[4];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2440) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2441) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2442) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2443)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2444) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2445) printf("Decomposing hangul\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2446) /* Hangul */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2447) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2448) for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2449) unsigned int si = unichar - sb;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2450) unsigned int li = si / nc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2451) unsigned int vi = (si % nc) / tc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2452) unsigned int ti = si % tc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2453)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2454) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2455) mapping[i++] = lb + li;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2456) mapping[i++] = vb + vi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2457) if (ti)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2458) mapping[i++] = tb + ti;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2459) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2460)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2461) assert(!unicode_data[unichar].utf32nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2462) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2463) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2464) unicode_data[unichar].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2465)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2466) assert(!unicode_data[unichar].utf32nfdicf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2467) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2468) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2469) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2470)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2471) /*
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2472) * Add a cookie as a reminder that the hangul syllable
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2473) * decompositions must not be stored in the generated
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2474) * trie.
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2475) */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2476) unicode_data[unichar].utf8nfdi = malloc(2);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2477) unicode_data[unichar].utf8nfdi[0] = HANGUL;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2478) unicode_data[unichar].utf8nfdi[1] = '\0';
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2479)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2480) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2481) print_utf32nfdi(unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2482)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2483) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2484) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2485) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2486) printf("Created %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2487) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2488)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2489) static void nfdi_decompose(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2490) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2491) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2492) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2493) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2494) unsigned int *dc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2495) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2496) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2497) int j;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2498) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2499)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2500) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2501) printf("Decomposing nfdi\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2502)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2503) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2504) for (unichar = 0; unichar != 0x110000; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2505) if (!unicode_data[unichar].utf32nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2506) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2507) for (;;) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2508) ret = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2509) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2510) um = unicode_data[unichar].utf32nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2511) while (*um) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2512) dc = unicode_data[*um].utf32nfdi;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2513) if (dc) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2514) for (j = 0; dc[j]; j++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2515) mapping[i++] = dc[j];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2516) ret = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2517) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2518) mapping[i++] = *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2519) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2520) um++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2521) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2522) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2523) if (ret)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2524) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2525) free(unicode_data[unichar].utf32nfdi);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2526) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2527) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2528) unicode_data[unichar].utf32nfdi = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2529) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2530) /* Add this decomposition to nfdicf if there is no entry. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2531) if (!unicode_data[unichar].utf32nfdicf) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2532) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2533) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2534) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2535) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2536) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2537) print_utf32nfdi(unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2538) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2539) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2540) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2541) printf("Processed %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2542) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2543)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2544) static void nfdicf_decompose(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2545) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2546) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2547) unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2548) unsigned int *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2549) unsigned int *dc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2550) int count;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2551) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2552) int j;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2553) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2554)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2555) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2556) printf("Decomposing nfdicf\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2557) count = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2558) for (unichar = 0; unichar != 0x110000; unichar++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2559) if (!unicode_data[unichar].utf32nfdicf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2560) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2561) for (;;) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2562) ret = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2563) i = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2564) um = unicode_data[unichar].utf32nfdicf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2565) while (*um) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2566) dc = unicode_data[*um].utf32nfdicf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2567) if (dc) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2568) for (j = 0; dc[j]; j++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2569) mapping[i++] = dc[j];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2570) ret = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2571) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2572) mapping[i++] = *um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2573) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2574) um++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2575) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2576) mapping[i++] = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2577) if (ret)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2578) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2579) free(unicode_data[unichar].utf32nfdicf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2580) um = malloc(i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2581) memcpy(um, mapping, i * sizeof(unsigned int));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2582) unicode_data[unichar].utf32nfdicf = um;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2583) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2584) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2585) print_utf32nfdicf(unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2586) count++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2587) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2588) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2589) printf("Processed %d entries\n", count);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2590) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2591)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2592) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2593)
/* Forward declarations of the trie-based UTF-8 helper functions. */
int utf8agemax(struct tree *tree, const char *s);
int utf8nagemax(struct tree *tree, const char *s, size_t len);
int utf8agemin(struct tree *tree, const char *s);
int utf8nagemin(struct tree *tree, const char *s, size_t len);
ssize_t utf8len(struct tree *tree, const char *s);
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len);
struct utf8cursor;
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s);
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
		size_t len);
int utf8byte(struct utf8cursor *u8c);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2604)
/*
 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
 *
 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
 *
 * SBase = 0xAC00
 * LBase = 0x1100
 * VBase = 0x1161
 * TBase = 0x11A7
 * LCount = 19
 * VCount = 21
 * TCount = 28
 * NCount = 588 (VCount * TCount)
 * SCount = 11172 (LCount * NCount)
 *
 * Decomposition:
 *   SIndex = s - SBase
 *
 * LV (Canonical/Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *
 * LVT (Canonical)
 *   LVIndex = (SIndex / TCount) * TCount
 *   TIndex = (SIndex % TCount)
 *   LVPart = SBase + LVIndex
 *   TPart = TBase + TIndex
 *
 * LVT (Full)
 *   LIndex = SIndex / NCount
 *   VIndex = (SIndex % NCount) / TCount
 *   TIndex = (SIndex % TCount)
 *   LPart = LBase + LIndex
 *   VPart = VBase + VIndex
 *   if (TIndex == 0) {
 *          d = <LPart, VPart>
 *   } else {
 *          TPart = TBase + TIndex
 *          d = <LPart, VPart, TPart>
 *   }
 */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2649)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2650) /* Constants */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2651) #define SB (0xAC00)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2652) #define LB (0x1100)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2653) #define VB (0x1161)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2654) #define TB (0x11A7)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2655) #define LC (19)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2656) #define VC (21)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2657) #define TC (28)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2658) #define NC (VC * TC)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2659) #define SC (LC * NC)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2660)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2661) /* Algorithmic decomposition of hangul syllable. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2662) static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2663) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2664) unsigned int si;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2665) unsigned int li;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2666) unsigned int vi;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2667) unsigned int ti;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2668) unsigned char *h;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2669)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2670) /* Calculate the SI, LI, VI, and TI values. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2671) si = utf8decode(str) - SB;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2672) li = si / NC;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2673) vi = (si % NC) / TC;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2674) ti = si % TC;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2675)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2676) /* Fill in base of leaf. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2677) h = hangul;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2678) LEAF_GEN(h) = 2;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2679) LEAF_CCC(h) = DECOMPOSE;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2680) h += 2;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2681)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2682) /* Add LPart, a 3-byte UTF-8 sequence. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2683) h += utf8encode((char *)h, li + LB);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2684)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2685) /* Add VPart, a 3-byte UTF-8 sequence. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2686) h += utf8encode((char *)h, vi + VB);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2687)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2688) /* Add TPart if required, also a 3-byte UTF-8 sequence. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2689) if (ti)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2690) h += utf8encode((char *)h, ti + TB);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2691)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2692) /* Terminate string. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2693) h[0] = '\0';
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2694)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2695) return hangul;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2696) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2697)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2698) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2699) * Use trie to scan s, touching at most len bytes.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2700) * Returns the leaf if one exists, NULL otherwise.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2701) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2702) * A non-NULL return guarantees that the UTF-8 sequence starting at s
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2703) * is well-formed and corresponds to a known unicode code point. The
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2704) * shorthand for this will be "is valid UTF-8 unicode".
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2705) */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2706) static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2707) const char *s, size_t len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2708) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2709) utf8trie_t *trie;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2710) int offlen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2711) int offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2712) int mask;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2713) int node;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2714)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2715) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2716) return NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2717) if (len == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2718) return NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2719) node = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2720) trie = utf8data + tree->index;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2721) while (node) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2722) offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2723) if (*trie & NEXTBYTE) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2724) if (--len == 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2725) return NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2726) s++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2727) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2728) mask = 1 << (*trie & BITNUM);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2729) if (*s & mask) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2730) /* Right leg */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2731) if (offlen) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2732) /* Right node at offset of trie */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2733) node = (*trie & RIGHTNODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2734) offset = trie[offlen];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2735) while (--offlen) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2736) offset <<= 8;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2737) offset |= trie[offlen];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2738) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2739) trie += offset;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2740) } else if (*trie & RIGHTPATH) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2741) /* Right node after this node */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2742) node = (*trie & TRIENODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2743) trie++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2744) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2745) /* No right node. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2746) return NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2747) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2748) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2749) /* Left leg */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2750) if (offlen) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2751) /* Left node after this node. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2752) node = (*trie & LEFTNODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2753) trie += offlen + 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2754) } else if (*trie & RIGHTPATH) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2755) /* No left node. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2756) return NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2757) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2758) /* Left node after this node */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2759) node = (*trie & TRIENODE);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2760) trie++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2761) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2762) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2763) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2764) /*
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2765) * Hangul decomposition is done algorithmically. These are the
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2766) * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2767) * always 3 bytes long, so s has been advanced twice, and the
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2768) * start of the sequence is at s-2.
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2769) */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2770) if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2771) trie = utf8hangul(s - 2, hangul);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2772) return trie;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2773) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2774)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2775) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2776) * Use trie to scan s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2777) * Returns the leaf if one exists, NULL otherwise.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2778) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2779) * Forwards to trie_nlookup().
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2780) */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2781) static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2782) const char *s)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2783) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2784) return utf8nlookup(tree, hangul, s, (size_t)-1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2785) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2786)
/*
 * Length in bytes of the UTF-8 sequence that starts at s, derived from
 * its first byte only.  The caller must pass a pointer to the first
 * byte of a valid UTF-8 sequence.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)*s;
	int nbytes = 1;

	/* Each threshold the lead byte reaches adds one more byte. */
	if (lead >= 0xC0)
		nbytes++;
	if (lead >= 0xE0)
		nbytes++;
	if (lead >= 0xF0)
		nbytes++;
	return nbytes;
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2797)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2798) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2799) * Maximum age of any character in s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2800) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2801) * Return 0 if only non-assigned code points are used.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2802) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2803) int utf8agemax(struct tree *tree, const char *s)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2804) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2805) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2806) int age = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2807) int leaf_age;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2808) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2809)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2810) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2811) return -1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2812)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2813) while (*s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2814) leaf = utf8lookup(tree, hangul, s);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2815) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2816) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2817) leaf_age = ages[LEAF_GEN(leaf)];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2818) if (leaf_age <= tree->maxage && leaf_age > age)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2819) age = leaf_age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2820) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2821) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2822) return age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2823) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2824)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2825) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2826) * Minimum age of any character in s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2827) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2828) * Return 0 if non-assigned code points are used.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2829) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2830) int utf8agemin(struct tree *tree, const char *s)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2831) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2832) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2833) int age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2834) int leaf_age;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2835) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2836)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2837) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2838) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2839) age = tree->maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2840) while (*s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2841) leaf = utf8lookup(tree, hangul, s);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2842) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2843) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2844) leaf_age = ages[LEAF_GEN(leaf)];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2845) if (leaf_age <= tree->maxage && leaf_age < age)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2846) age = leaf_age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2847) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2848) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2849) return age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2850) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2851)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2852) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2853) * Maximum age of any character in s, touch at most len bytes.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2854) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2855) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2856) int utf8nagemax(struct tree *tree, const char *s, size_t len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2857) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2858) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2859) int age = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2860) int leaf_age;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2861) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2862)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2863) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2864) return -1;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2865)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2866) while (len && *s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2867) leaf = utf8nlookup(tree, hangul, s, len);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2868) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2869) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2870) leaf_age = ages[LEAF_GEN(leaf)];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2871) if (leaf_age <= tree->maxage && leaf_age > age)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2872) age = leaf_age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2873) len -= utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2874) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2875) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2876) return age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2877) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2878)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2879) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2880) * Maximum age of any character in s, touch at most len bytes.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2881) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2882) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2883) int utf8nagemin(struct tree *tree, const char *s, size_t len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2884) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2885) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2886) int leaf_age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2887) int age;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2888) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2889)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2890) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2891) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2892) age = tree->maxage;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2893) while (len && *s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2894) leaf = utf8nlookup(tree, hangul, s, len);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2895) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2896) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2897) leaf_age = ages[LEAF_GEN(leaf)];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2898) if (leaf_age <= tree->maxage && leaf_age < age)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2899) age = leaf_age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2900) len -= utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2901) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2902) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2903) return age;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2904) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2905)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2906) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2907) * Length of the normalization of s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2908) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2909) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2910) * A string of Default_Ignorable_Code_Point has length 0.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2911) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2912) ssize_t utf8len(struct tree *tree, const char *s)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2913) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2914) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2915) size_t ret = 0;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2916) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2917)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2918) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2919) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2920) while (*s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2921) leaf = utf8lookup(tree, hangul, s);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2922) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2923) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2924) if (ages[LEAF_GEN(leaf)] > tree->maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2925) ret += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2926) else if (LEAF_CCC(leaf) == DECOMPOSE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2927) ret += strlen(LEAF_STR(leaf));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2928) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2929) ret += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2930) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2931) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2932) return ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2933) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2934)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2935) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2936) * Length of the normalization of s, touch at most len bytes.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2937) * Return -1 if s is not valid UTF-8 unicode.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2938) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2939) ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2940) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2941) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2942) size_t ret = 0;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2943) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2944)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2945) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2946) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2947) while (len && *s) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2948) leaf = utf8nlookup(tree, hangul, s, len);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2949) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2950) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2951) if (ages[LEAF_GEN(leaf)] > tree->maxage)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2952) ret += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2953) else if (LEAF_CCC(leaf) == DECOMPOSE)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2954) ret += strlen(LEAF_STR(leaf));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2955) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2956) ret += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2957) len -= utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2958) s += utf8clen(s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2959) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2960) return ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2961) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2962)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2963) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2964) * Cursor structure used by the normalizer.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2965) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2966) struct utf8cursor {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2967) struct tree *tree;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2968) const char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2969) const char *p;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2970) const char *ss;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2971) const char *sp;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2972) unsigned int len;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2973) unsigned int slen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2974) short int ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2975) short int nccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2976) unsigned int unichar;
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 2977) unsigned char hangul[UTF8HANGULLEAF];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2978) };
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2979)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2980) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2981) * Set up an utf8cursor for use by utf8byte().
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2982) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2983) * s : string.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2984) * len : length of s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2985) * u8c : pointer to cursor.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2986) * trie : utf8trie_t to use for normalization.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2987) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2988) * Returns -1 on error, 0 on success.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2989) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2990) int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2991) size_t len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2992) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2993) if (!tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2994) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2995) if (!s)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2996) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2997) u8c->tree = tree;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2998) u8c->s = s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 2999) u8c->p = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3000) u8c->ss = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3001) u8c->sp = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3002) u8c->len = len;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3003) u8c->slen = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3004) u8c->ccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3005) u8c->nccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3006) u8c->unichar = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3007) /* Check we didn't clobber the maximum length. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3008) if (u8c->len != len)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3009) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3010) /* The first byte of s may not be an utf8 continuation. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3011) if (len > 0 && (*s & 0xC0) == 0x80)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3012) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3013) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3014) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3015)
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c  : pointer to cursor.
 *   tree : tree to use for normalization.
 *   s    : NUL-terminated string.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
	/* Largest unsigned int value, passed as the length of s. */
	const size_t unbounded = (unsigned int)-1;

	return utf8ncursor(u8c, tree, s, unbounded);
}
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3029)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3030) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3031) * Get one byte from the normalized form of the string described by u8c.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3032) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3033) * Returns the byte cast to an unsigned char on success, and -1 on failure.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3034) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3035) * The cursor keeps track of the location in the string in u8c->s.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3036) * When a character is decomposed, the current location is stored in
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3037) * u8c->p, and u8c->s is set to the start of the decomposition. Note
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3038) * that bytes from a decomposition do not count against u8c->len.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3039) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3040) * Characters are emitted if they match the current CCC in u8c->ccc.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3041) * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3042) * and the function returns 0 in that case.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3043) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3044) * Sorting by CCC is done by repeatedly scanning the string. The
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3045) * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3046) * the start of the scan. The first pass finds the lowest CCC to be
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3047) * emitted and stores it in u8c->nccc, the second pass emits the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3048) * characters with this CCC and finds the next lowest CCC. This limits
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3049) * the number of passes to 1 + the number of different CCCs in the
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3050) * sequence being scanned.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3051) *
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3052) * Therefore:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3053) * u8c->p != NULL -> a decomposition is being scanned.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3054) * u8c->ss != NULL -> this is a repeating scan.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3055) * u8c->ccc == -1 -> this is the first scan of a repeating scan.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3056) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3057) int utf8byte(struct utf8cursor *u8c)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3058) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3059) utf8leaf_t *leaf;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3060) int ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3061)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3062) for (;;) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3063) /* Check for the end of a decomposed character. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3064) if (u8c->p && *u8c->s == '\0') {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3065) u8c->s = u8c->p;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3066) u8c->p = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3067) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3068)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3069) /* Check for end-of-string. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3070) if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3071) /* There is no next byte. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3072) if (u8c->ccc == STOPPER)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3073) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3074) /* End-of-string during a scan counts as a stopper. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3075) ccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3076) goto ccc_mismatch;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3077) } else if ((*u8c->s & 0xC0) == 0x80) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3078) /* This is a continuation of the current character. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3079) if (!u8c->p)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3080) u8c->len--;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3081) return (unsigned char)*u8c->s++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3082) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3083)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3084) /* Look up the data for the current character. */
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3085) if (u8c->p) {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3086) leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3087) } else {
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3088) leaf = utf8nlookup(u8c->tree, u8c->hangul,
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3089) u8c->s, u8c->len);
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3090) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3091)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3092) /* No leaf found implies that the input is a binary blob. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3093) if (!leaf)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3094) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3095)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3096) /* Characters that are too new have CCC 0. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3097) if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3098) ccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3099) } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3100) u8c->len -= utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3101) u8c->p = u8c->s + utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3102) u8c->s = LEAF_STR(leaf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3103) /* Empty decomposition implies CCC 0. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3104) if (*u8c->s == '\0') {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3105) if (u8c->ccc == STOPPER)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3106) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3107) ccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3108) goto ccc_mismatch;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3109) }
a8384c68797ee scripts/mkutf8data.c (Olaf Weber 2019-04-25 13:49:18 -0400 3110) leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3111) ccc = LEAF_CCC(leaf);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3112) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3113) u8c->unichar = utf8decode(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3114)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3115) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3116) * If this is not a stopper, then see if it updates
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3117) * the next canonical class to be emitted.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3118) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3119) if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3120) u8c->nccc = ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3121)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3122) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3123) * Return the current byte if this is the current
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3124) * combining class.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3125) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3126) if (ccc == u8c->ccc) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3127) if (!u8c->p)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3128) u8c->len--;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3129) return (unsigned char)*u8c->s++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3130) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3131)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3132) /* Current combining class mismatch. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3133) ccc_mismatch:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3134) if (u8c->nccc == STOPPER) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3135) /*
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3136) * Scan forward for the first canonical class
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3137) * to be emitted. Save the position from
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3138) * which to restart.
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3139) */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3140) assert(u8c->ccc == STOPPER);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3141) u8c->ccc = MINCCC - 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3142) u8c->nccc = ccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3143) u8c->sp = u8c->p;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3144) u8c->ss = u8c->s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3145) u8c->slen = u8c->len;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3146) if (!u8c->p)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3147) u8c->len -= utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3148) u8c->s += utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3149) } else if (ccc != STOPPER) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3150) /* Not a stopper, and not the ccc we're emitting. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3151) if (!u8c->p)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3152) u8c->len -= utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3153) u8c->s += utf8clen(u8c->s);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3154) } else if (u8c->nccc != MAXCCC + 1) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3155) /* At a stopper, restart for next ccc. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3156) u8c->ccc = u8c->nccc;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3157) u8c->nccc = MAXCCC + 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3158) u8c->s = u8c->ss;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3159) u8c->p = u8c->sp;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3160) u8c->len = u8c->slen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3161) } else {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3162) /* All done, proceed from here. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3163) u8c->ccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3164) u8c->nccc = STOPPER;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3165) u8c->sp = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3166) u8c->ss = NULL;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3167) u8c->slen = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3168) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3169) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3170) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3171)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3172) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3173)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3174) static int normalize_line(struct tree *tree)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3175) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3176) char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3177) char *t;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3178) int c;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3179) struct utf8cursor u8c;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3180)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3181) /* First test: null-terminated string. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3182) s = buf2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3183) t = buf3;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3184) if (utf8cursor(&u8c, tree, s))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3185) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3186) while ((c = utf8byte(&u8c)) > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3187) if (c != (unsigned char)*t++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3188) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3189) if (c < 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3190) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3191) if (*t != 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3192) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3193)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3194) /* Second test: length-limited string. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3195) s = buf2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3196) /* Replace NUL with a value that will cause an error if seen. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3197) s[strlen(s) + 1] = -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3198) t = buf3;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3199) if (utf8cursor(&u8c, tree, s))
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3200) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3201) while ((c = utf8byte(&u8c)) > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3202) if (c != (unsigned char)*t++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3203) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3204) if (c < 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3205) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3206) if (*t != 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3207) return -1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3208)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3209) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3210) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3211)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3212) static void normalization_test(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3213) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3214) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3215) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3216) struct unicode_data *data;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3217) char *s;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3218) char *t;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3219) int ret;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3220) int ignorables;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3221) int tests = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3222) int failures = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3223)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3224) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3225) printf("Parsing %s\n", test_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3226) /* Step one, read data from file. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3227) file = fopen(test_name, "r");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3228) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3229) open_fail(test_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3230)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3231) while (fgets(line, LINESIZE, file)) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3232) ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3233) buf0, buf1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3234) if (ret != 2 || *line == '#')
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3235) continue;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3236) s = buf0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3237) t = buf2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3238) while (*s) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3239) unichar = strtoul(s, &s, 16);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3240) t += utf8encode(t, unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3241) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3242) *t = '\0';
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3243)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3244) ignorables = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3245) s = buf1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3246) t = buf3;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3247) while (*s) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3248) unichar = strtoul(s, &s, 16);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3249) data = &unicode_data[unichar];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3250) if (data->utf8nfdi && !*data->utf8nfdi)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3251) ignorables = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3252) else
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3253) t += utf8encode(t, unichar);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3254) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3255) *t = '\0';
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3256)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3257) tests++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3258) if (normalize_line(nfdi_tree) < 0) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3259) printf("Line %s -> %s", buf0, buf1);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3260) if (ignorables)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3261) printf(" (ignorables removed)");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3262) printf(" failure\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3263) failures++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3264) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3265) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3266) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3267) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3268) printf("Ran %d tests with %d failures\n", tests, failures);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3269) if (failures)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3270) file_fail(test_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3271) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3272)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3273) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3274)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3275) static void write_file(void)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3276) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3277) FILE *file;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3278) int i;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3279) int j;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3280) int t;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3281) int gen;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3282)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3283) if (verbose > 0)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3284) printf("Writing %s\n", utf8_name);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3285) file = fopen(utf8_name, "w");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3286) if (!file)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3287) open_fail(utf8_name, errno);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3288)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3289) fprintf(file, "/* This file is generated code, do not edit. */\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3290) fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3291) fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3292) fprintf(file, "#endif\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3293) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3294) fprintf(file, "static const unsigned int utf8vers = %#x;\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3295) unicode_maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3296) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3297) fprintf(file, "static const unsigned int utf8agetab[] = {\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3298) for (i = 0; i != ages_count; i++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3299) fprintf(file, "\t%#x%s\n", ages[i],
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3300) ages[i] == unicode_maxage ? "" : ",");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3301) fprintf(file, "};\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3302) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3303) fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3304) t = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3305) for (gen = 0; gen < ages_count; gen++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3306) fprintf(file, "\t{ %#x, %d }%s\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3307) ages[gen], trees[t].index,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3308) ages[gen] == unicode_maxage ? "" : ",");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3309) if (trees[t].maxage == ages[gen])
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3310) t += 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3311) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3312) fprintf(file, "};\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3313) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3314) fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3315) t = 1;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3316) for (gen = 0; gen < ages_count; gen++) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3317) fprintf(file, "\t{ %#x, %d }%s\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3318) ages[gen], trees[t].index,
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3319) ages[gen] == unicode_maxage ? "" : ",");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3320) if (trees[t].maxage == ages[gen])
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3321) t += 2;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3322) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3323) fprintf(file, "};\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3324) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3325) fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3326) utf8data_size);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3327) t = 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3328) for (i = 0; i != utf8data_size; i += 16) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3329) if (i == trees[t].index) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3330) fprintf(file, "\t/* %s_%x */\n",
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3331) trees[t].type, trees[t].maxage);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3332) if (t < trees_count-1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3333) t++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3334) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3335) fprintf(file, "\t");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3336) for (j = i; j != i + 16; j++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3337) fprintf(file, "0x%.2x%s", utf8data[j],
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3338) (j < utf8data_size -1 ? "," : ""));
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3339) fprintf(file, "\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3340) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3341) fprintf(file, "};\n");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3342) fclose(file);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3343) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3344)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3345) /* ------------------------------------------------------------------ */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3346)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3347) int main(int argc, char *argv[])
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3348) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3349) unsigned int unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3350) int opt;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3351)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3352) argv0 = argv[0];
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3353)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3354) while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3355) switch (opt) {
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3356) case 'a':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3357) age_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3358) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3359) case 'c':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3360) ccc_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3361) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3362) case 'd':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3363) data_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3364) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3365) case 'f':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3366) fold_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3367) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3368) case 'n':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3369) norm_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3370) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3371) case 'o':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3372) utf8_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3373) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3374) case 'p':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3375) prop_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3376) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3377) case 't':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3378) test_name = optarg;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3379) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3380) case 'v':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3381) verbose++;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3382) break;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3383) case 'h':
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3384) help();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3385) exit(0);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3386) default:
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3387) usage();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3388) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3389) }
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3390)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3391) if (verbose > 1)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3392) help();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3393) for (unichar = 0; unichar != 0x110000; unichar++)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3394) unicode_data[unichar].code = unichar;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3395) age_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3396) ccc_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3397) nfdi_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3398) nfdicf_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3399) ignore_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3400) corrections_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3401) hangul_decompose();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3402) nfdi_decompose();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3403) nfdicf_decompose();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3404) utf8_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3405) trees_init();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3406) trees_populate();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3407) trees_reduce();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3408) trees_verify();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3409) /* Prevent "unused function" warning. */
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3410) (void)lookup(nfdi_tree, " ");
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3411) if (verbose > 2)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3412) tree_walk(nfdi_tree);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3413) if (verbose > 2)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3414) tree_walk(nfdicf_tree);
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3415) normalization_test();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3416) write_file();
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3417)
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3418) return 0;
955405d1174ee scripts/mkutf8data.c (Gabriel Krisman Bertazi 2019-04-25 13:38:44 -0400 3419) }