9f8068503d637 (Thomas Gleixner 2019-05-29 07:18:08 -0700 1) /* SPDX-License-Identifier: GPL-2.0-only */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 2) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 3) * Copyright (c) 2014 SGI.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 4) * All rights reserved.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 5) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 6)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 7) #ifndef UTF8NORM_H
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 8) #define UTF8NORM_H
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 9)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 10) #include <linux/types.h>
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 11) #include <linux/export.h>
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 12) #include <linux/string.h>
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 13) #include <linux/module.h>
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 14)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 15) /* Encode a unicode version number as a single unsigned int: (MAJ << 16) | (MIN << 8) | REV. */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 16) #define UNICODE_MAJ_SHIFT (16) /* left shift applied to the major version */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 17) #define UNICODE_MIN_SHIFT (8) /* left shift applied to the minor version */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 18)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 19) #define UNICODE_AGE(MAJ, MIN, REV) \
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 20) (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 21) ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 22) ((unsigned int)(REV))) /* no masking: MIN and REV must each fit in 8 bits or the fields overlap */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 23)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 24) /* Version queries against the data tables: a check for one maj.min.rev, and the highest unicode version supported. */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 25) extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); /* NOTE(review): presumably non-zero when supported -- confirm against the definition */
9d53690f0d4e5 (Gabriel Krisman Bertazi 2019-04-25 13:51:22 -0400 26) extern int utf8version_latest(void); /* NOTE(review): presumably a UNICODE_AGE() style value -- confirm against the definition */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 27)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 28) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 29) * Look up the const struct utf8data to use for a unicode version (maxage).
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 30) * Returns NULL if the version requested is too new.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 31) *
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 32) * Two normalization forms are supported: nfdi and nfdicf.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 33) *
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 34) * nfdi:
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 35) * - Apply unicode normalization form NFD.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 36) * - Remove any Default_Ignorable_Code_Point.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 37) *
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 38) * nfdicf:
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 39) * - Apply unicode normalization form NFD.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 40) * - Remove any Default_Ignorable_Code_Point.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 41) * - Apply a full casefold (C + F).
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 42) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 43) extern const struct utf8data *utf8nfdi(unsigned int maxage); /* nfdi form; NOTE(review): maxage presumably a UNICODE_AGE() value -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 44) extern const struct utf8data *utf8nfdicf(unsigned int maxage); /* nfdicf form (nfdi plus casefold) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 45)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 46) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 47) * Determine the maximum age of any unicode character in the string s.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 48) * Returns 0 if only unassigned code points are present.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 49) * Returns -1 if the input is not valid UTF-8.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 50) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 51) extern int utf8agemax(const struct utf8data *data, const char *s);
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 52) extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len); /* same contract, with an explicit length argument for s */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 53)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 54) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 55) * Determine the minimum age of any unicode character in the string s.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 56) * Returns 0 if any unassigned code points are present.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 57) * Returns -1 if the input is not valid UTF-8.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 58) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 59) extern int utf8agemin(const struct utf8data *data, const char *s);
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 60) extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); /* same contract, with an explicit length argument for s */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 61)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 62) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 63) * Determine the length of the normalized form of the string,
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 64) * excluding any terminating NUL byte.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 65) * Returns 0 if only ignorable code points are present.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 66) * Returns -1 if the input is not valid UTF-8.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 67) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 68) extern ssize_t utf8len(const struct utf8data *data, const char *s);
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 69) extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); /* same contract, with an explicit length argument for s */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 70)
a8384c68797ee (Olaf Weber 2019-04-25 13:49:18 -0400 71) /* Size in bytes of the hangul[] buffer in struct utf8cursor below. */
a8384c68797ee (Olaf Weber 2019-04-25 13:49:18 -0400 72) #define UTF8HANGULLEAF (12)
a8384c68797ee (Olaf Weber 2019-04-25 13:49:18 -0400 73)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 74) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 75) * Cursor structure used by the normalizer: set up by utf8cursor()/utf8ncursor() and passed to utf8byte().
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 76) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 77) struct utf8cursor {
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 78) const struct utf8data *data; /* NOTE(review): presumably the table given to utf8cursor()/utf8ncursor() -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 79) const char *s; /* NOTE(review): presumably a position in the string being normalized -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 80) const char *p; /* NOTE(review): meaning not visible in this header -- see the implementation */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 81) const char *ss; /* NOTE(review): presumably a saved copy of s -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 82) const char *sp; /* NOTE(review): presumably a saved copy of p -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 83) unsigned int len; /* NOTE(review): presumably the length associated with s -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 84) unsigned int slen; /* NOTE(review): presumably a saved copy of len -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 85) short int ccc; /* NOTE(review): presumably a canonical combining class value -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 86) short int nccc; /* NOTE(review): presumably a second combining class value used alongside ccc -- confirm */
a8384c68797ee (Olaf Weber 2019-04-25 13:49:18 -0400 87) unsigned char hangul[UTF8HANGULLEAF]; /* UTF8HANGULLEAF-byte buffer embedded in the cursor; NOTE(review): presumably for Hangul handling -- confirm */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 88) };
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 89)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 90) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 91) * Initialize the utf8cursor u8c to normalize the string s using data.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 92) * Returns 0 on success.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 93) * Returns -1 on failure.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 94) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 95) extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 96) const char *s);
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 97) extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 98) const char *s, size_t len); /* same contract, with an explicit length argument for s */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 99)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 100) /*
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 101) * Get the next byte in the normalization, using cursor u8c.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 102) * Returns the byte as a value > 0 && < 256 on success.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 103) * Returns 0 when the end of the normalization is reached.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 104) * Returns -1 if the string being normalized is not valid UTF-8.
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 105) */
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 106) extern int utf8byte(struct utf8cursor *u8c);
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 107)
44594c2fbf425 (Olaf Weber 2019-04-25 13:45:46 -0400 108) #endif /* UTF8NORM_H */