unicode: implement higher level API for string handling
This patch builds on the utf8n patches by adding a higher-level API for UTF-8 string comparison, normalization and casefolding. Normalization implements a variation of NFD, and casefolding is performed by applying full casefold on top of NFD. These algorithms are based on the core implemented by Olaf Weber from SGI. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.co.uk> Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Родитель
a8384c6879
Коммит
9d53690f0d
|
@ -1,6 +1,8 @@
|
|||
# SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
obj-$(CONFIG_UNICODE) += utf8-norm.o
|
||||
obj-$(CONFIG_UNICODE) += unicode.o
|
||||
|
||||
unicode-y := utf8-norm.o utf8-core.o
|
||||
|
||||
# This rule is not invoked during the kernel compilation. It is used to
|
||||
# regenerate the utf8data.h header file.
|
||||
|
|
|
@ -0,0 +1,187 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/parser.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/unicode.h>
|
||||
|
||||
#include "utf8n.h"
|
||||
|
||||
int utf8_validate(const struct unicode_map *um, const struct qstr *str)
|
||||
{
|
||||
const struct utf8data *data = utf8nfdi(um->version);
|
||||
|
||||
if (utf8nlen(data, str->name, str->len) < 0)
|
||||
return -1;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(utf8_validate);
|
||||
|
||||
int utf8_strncmp(const struct unicode_map *um,
|
||||
const struct qstr *s1, const struct qstr *s2)
|
||||
{
|
||||
const struct utf8data *data = utf8nfdi(um->version);
|
||||
struct utf8cursor cur1, cur2;
|
||||
int c1, c2;
|
||||
|
||||
if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
c1 = utf8byte(&cur1);
|
||||
c2 = utf8byte(&cur2);
|
||||
|
||||
if (c1 < 0 || c2 < 0)
|
||||
return -EINVAL;
|
||||
if (c1 != c2)
|
||||
return 1;
|
||||
} while (c1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(utf8_strncmp);
|
||||
|
||||
int utf8_strncasecmp(const struct unicode_map *um,
|
||||
const struct qstr *s1, const struct qstr *s2)
|
||||
{
|
||||
const struct utf8data *data = utf8nfdicf(um->version);
|
||||
struct utf8cursor cur1, cur2;
|
||||
int c1, c2;
|
||||
|
||||
if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
|
||||
return -EINVAL;
|
||||
|
||||
if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
c1 = utf8byte(&cur1);
|
||||
c2 = utf8byte(&cur2);
|
||||
|
||||
if (c1 < 0 || c2 < 0)
|
||||
return -EINVAL;
|
||||
if (c1 != c2)
|
||||
return 1;
|
||||
} while (c1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(utf8_strncasecmp);
|
||||
|
||||
/*
 * utf8_casefold - write the utf8nfdicf()-based form of @str into @dest.
 *
 * @um:   unicode map; its version selects the table from utf8nfdicf()
 * @str:  input string (name/len pair)
 * @dest: output buffer
 * @dlen: size of @dest in bytes
 *
 * Bytes produced by utf8byte() are copied to @dest until the terminating
 * zero byte has been stored.
 *
 * Return: number of bytes stored, not counting the terminating zero;
 * -EINVAL if the cursor cannot be set up, if utf8byte() reports an error,
 * or if @dest fills up before the terminating zero is stored.
 */
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
		  unsigned char *dest, size_t dlen)
{
	const struct utf8data *data = utf8nfdicf(um->version);
	struct utf8cursor cur;
	size_t nlen;

	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		return -EINVAL;

	for (nlen = 0; nlen < dlen; nlen++) {
		int c = utf8byte(&cur);

		/*
		 * Check for failure before storing: an error code is not
		 * output data.  Any negative value is treated as an error,
		 * matching utf8_strncmp()/utf8_strncasecmp().
		 */
		if (c < 0)
			return -EINVAL;

		dest[nlen] = c;
		if (!c)
			return nlen;
	}

	/* Ran out of room before the terminating zero was written. */
	return -EINVAL;
}
|
||||
|
||||
EXPORT_SYMBOL(utf8_casefold);
|
||||
|
||||
/*
 * utf8_normalize - write the utf8nfdi()-based form of @str into @dest.
 *
 * @um:   unicode map; its version selects the table from utf8nfdi()
 * @str:  input string (name/len pair)
 * @dest: output buffer
 * @dlen: size of @dest in bytes
 *
 * Bytes produced by utf8byte() are copied to @dest until the terminating
 * zero byte has been stored.
 *
 * Return: number of bytes stored, not counting the terminating zero;
 * -EINVAL if the cursor cannot be set up, if utf8byte() reports an error,
 * or if @dest fills up before the terminating zero is stored.
 */
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
		   unsigned char *dest, size_t dlen)
{
	const struct utf8data *data = utf8nfdi(um->version);
	struct utf8cursor cur;
	/* Same unsigned type as @dlen, so the loop bound compares like with like. */
	size_t nlen;

	if (utf8ncursor(&cur, data, str->name, str->len) < 0)
		return -EINVAL;

	for (nlen = 0; nlen < dlen; nlen++) {
		int c = utf8byte(&cur);

		/*
		 * Check for failure before storing: an error code is not
		 * output data.  Any negative value is treated as an error,
		 * matching utf8_strncmp()/utf8_strncasecmp().
		 */
		if (c < 0)
			return -EINVAL;

		dest[nlen] = c;
		if (!c)
			return nlen;
	}

	/* Ran out of room before the terminating zero was written. */
	return -EINVAL;
}
|
||||
|
||||
EXPORT_SYMBOL(utf8_normalize);
|
||||
|
||||
static int utf8_parse_version(const char *version, unsigned int *maj,
|
||||
unsigned int *min, unsigned int *rev)
|
||||
{
|
||||
substring_t args[3];
|
||||
char version_string[12];
|
||||
const struct match_token token[] = {
|
||||
{1, "%d.%d.%d"},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
strncpy(version_string, version, sizeof(version_string));
|
||||
|
||||
if (match_token(version_string, token, args) != 1)
|
||||
return -EINVAL;
|
||||
|
||||
if (match_int(&args[0], maj) || match_int(&args[1], min) ||
|
||||
match_int(&args[2], rev))
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct unicode_map *utf8_load(const char *version)
|
||||
{
|
||||
struct unicode_map *um = NULL;
|
||||
int unicode_version;
|
||||
|
||||
if (version) {
|
||||
unsigned int maj, min, rev;
|
||||
|
||||
if (utf8_parse_version(version, &maj, &min, &rev) < 0)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
if (!utf8version_is_supported(maj, min, rev))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
unicode_version = UNICODE_AGE(maj, min, rev);
|
||||
} else {
|
||||
unicode_version = utf8version_latest();
|
||||
printk(KERN_WARNING"UTF-8 version not specified. "
|
||||
"Assuming latest supported version (%d.%d.%d).",
|
||||
(unicode_version >> 16) & 0xff,
|
||||
(unicode_version >> 8) & 0xff,
|
||||
(unicode_version & 0xff));
|
||||
}
|
||||
|
||||
um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
|
||||
if (!um)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
um->charset = "UTF-8";
|
||||
um->version = unicode_version;
|
||||
|
||||
return um;
|
||||
}
|
||||
EXPORT_SYMBOL(utf8_load);
|
||||
|
||||
/*
 * utf8_unload - free a map with kfree().
 *
 * @um: map to free; utf8_load() allocates its maps with kzalloc()
 */
void utf8_unload(struct unicode_map *um)
{
	kfree(um);
}
|
||||
EXPORT_SYMBOL(utf8_unload);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
|
@ -38,6 +38,12 @@ int utf8version_is_supported(u8 maj, u8 min, u8 rev)
|
|||
}
|
||||
EXPORT_SYMBOL(utf8version_is_supported);
|
||||
|
||||
int utf8version_latest(void)
|
||||
{
|
||||
return utf8vers;
|
||||
}
|
||||
EXPORT_SYMBOL(utf8version_latest);
|
||||
|
||||
/*
|
||||
* UTF-8 valid ranges.
|
||||
*
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
|
||||
/* Highest unicode version supported by the data tables. */
|
||||
extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
|
||||
extern int utf8version_latest(void);
|
||||
|
||||
/*
|
||||
* Look for the correct const struct utf8data for a unicode version.
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
#ifndef _LINUX_UNICODE_H
|
||||
#define _LINUX_UNICODE_H
|
||||
|
||||
#include <linux/init.h>
|
||||
#include <linux/dcache.h>
|
||||
|
||||
/*
 * struct unicode_map - handle describing which encoding tables to use.
 */
struct unicode_map {
	/* Character set name; utf8_load() stores "UTF-8" here. */
	const char *charset;
	/* Table version; utf8_load() stores a UNICODE_AGE() style value. */
	int version;
};
|
||||
|
||||
int utf8_validate(const struct unicode_map *um, const struct qstr *str);
|
||||
|
||||
int utf8_strncmp(const struct unicode_map *um,
|
||||
const struct qstr *s1, const struct qstr *s2);
|
||||
|
||||
int utf8_strncasecmp(const struct unicode_map *um,
|
||||
const struct qstr *s1, const struct qstr *s2);
|
||||
|
||||
int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
|
||||
unsigned char *dest, size_t dlen);
|
||||
|
||||
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
|
||||
unsigned char *dest, size_t dlen);
|
||||
|
||||
struct unicode_map *utf8_load(const char *version);
|
||||
void utf8_unload(struct unicode_map *um);
|
||||
|
||||
#endif /* _LINUX_UNICODE_H */
|
Загрузка…
Ссылка в новой задаче