All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL
@ 2018-08-11 15:28 Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 01/15] lib: build charset.o only if needed Heinrich Schuchardt
                   ` (14 more replies)
  0 siblings, 15 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

This patchset implements the EFI_UNICODE_COLLATION_PROTOCOL and provides
unit tests.

Additionally some errors when handling Unicode strings are fixed.
* possible stack overflow
* incorrect handling of precision attribute in printf()
* incorrect limit on variable length

Some old Unicode functions are replaced.

An error in Unicode handling in the FAT file system shall be fixed after
merging Takahiro's FAT patches. Until then some redundancy between old and
new Unicode functions remains.

Heinrich Schuchardt (15):
  lib: build charset.o only if needed
  efi_loader: rename utf16_strlen, utf16_strnlen
  lib: charset: utility functions for Unicode
  test: unit tests for Unicode functions
  lib: vsprintf: correct printing of Unicode strings
  test: test printing Unicode
  efi_loader: remove limit on variable length
  efi_loader: don't use unlimited stack as buffer
  efi_loader: buffer size for load options
  lib: charset: remove obsolete functions
  efi_loader: capitalization table
  lib: charset: upper/lower case conversion
  test: tests for utf_to_lower() utf_to_upper().
  efi_loader: EFI_UNICODE_COLLATION_PROTOCOL
  efi_selftest: EFI_UNICODE_COLLATION_PROTOCOL

 MAINTAINERS                                   |    4 +
 cmd/bootefi.c                                 |    6 +-
 include/capitalization.h                      | 1909 +++++++++++++++++
 include/charset.h                             |  181 +-
 include/cp1250.h                              |   40 +
 include/cp437.h                               |   40 +
 include/efi_api.h                             |   21 +
 include/efi_loader.h                          |    5 +
 include/test/suites.h                         |    3 +-
 lib/Makefile                                  |    5 +
 lib/charset.c                                 |  359 +++-
 lib/efi_loader/Makefile                       |   18 +-
 lib/efi_loader/efi_bootmgr.c                  |    2 +-
 lib/efi_loader/efi_boottime.c                 |    6 +
 lib/efi_loader/efi_console.c                  |   20 +-
 lib/efi_loader/efi_file.c                     |    2 +-
 lib/efi_loader/efi_unicode_collation.c        |  218 ++
 lib/efi_loader/efi_variable.c                 |   52 +-
 lib/efi_selftest/Makefile                     |    1 +
 .../efi_selftest_unicode_collation.c          |  257 +++
 lib/vsprintf.c                                |   34 +-
 test/Kconfig                                  |    8 +
 test/Makefile                                 |    1 +
 test/cmd_ut.c                                 |   14 +-
 test/unicode_ut.c                             |  545 +++++
 25 files changed, 3587 insertions(+), 164 deletions(-)
 create mode 100644 include/capitalization.h
 create mode 100644 include/cp1250.h
 create mode 100644 include/cp437.h
 create mode 100644 lib/efi_loader/efi_unicode_collation.c
 create mode 100644 lib/efi_selftest/efi_selftest_unicode_collation.c
 create mode 100644 test/unicode_ut.c

-- 
2.18.0

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 01/15] lib: build charset.o only if needed
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 17:45   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen Heinrich Schuchardt
                   ` (13 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

charset.o is only needed for the EFI subsystem

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/Makefile   |  5 ++++-
 lib/vsprintf.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/Makefile b/lib/Makefile
index 5f583aed37..2fd32798a0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -19,7 +19,10 @@ obj-$(CONFIG_ARCH_AT91) += at91/
 obj-$(CONFIG_OPTEE) += optee/
 
 obj-$(CONFIG_AES) += aes.o
-obj-y += charset.o
+
+ifndef API_BUILD
+obj-$(CONFIG_EFI_LOADER) += charset.o
+endif
 obj-$(CONFIG_USB_TTY) += circbuf.o
 obj-y += crc7.o
 obj-y += crc8.o
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 914fbd30cb..6100357858 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -274,6 +274,8 @@ static char *string(char *buf, char *end, char *s, int field_width,
 	return buf;
 }
 
+#if defined(CONFIG_EFI_LOADER) && \
+	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
 static char *string16(char *buf, char *end, u16 *s, int field_width,
 		int precision, int flags)
 {
@@ -294,8 +296,6 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
 	return buf;
 }
 
-#if defined(CONFIG_EFI_LOADER) && \
-	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
 static char *device_path_string(char *buf, char *end, void *dp, int field_width,
 				int precision, int flags)
 {
@@ -612,10 +612,14 @@ repeat:
 			continue;
 
 		case 's':
-			if (qualifier == 'l' && !IS_ENABLED(CONFIG_SPL_BUILD)) {
+#if defined(CONFIG_EFI_LOADER) && \
+	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
+			if (qualifier == 'l') {
 				str = string16(str, end, va_arg(args, u16 *),
 					       field_width, precision, flags);
-			} else {
+			} else
+#endif
+			{
 				str = string(str, end, va_arg(args, char *),
 					     field_width, precision, flags);
 			}
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 01/15] lib: build charset.o only if needed Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 17:52   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode Heinrich Schuchardt
                   ` (12 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The function names utf16_strlen() and utf16_strnlen() are misnomers.
The functions do not count utf-16 characters but non-zero words.
So let's rename them to u16_strlen and u16_strnlen().

In utf16_dup() avoid assignment in if clause.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 include/charset.h             | 28 +++++++++++-----------------
 lib/charset.c                 | 10 +++++++---
 lib/efi_loader/efi_bootmgr.c  |  2 +-
 lib/efi_loader/efi_console.c  |  2 +-
 lib/efi_loader/efi_file.c     |  2 +-
 lib/efi_loader/efi_variable.c |  2 +-
 lib/vsprintf.c                |  2 +-
 7 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/include/charset.h b/include/charset.h
index 11832cbd12..2307559890 100644
--- a/include/charset.h
+++ b/include/charset.h
@@ -13,29 +13,23 @@
 #define MAX_UTF8_PER_UTF16 3
 
 /**
- * utf16_strlen() - Get the length of an utf16 string
+ * u16_strlen - count non-zero words
  *
- * Returns the number of 16 bit characters in an utf16 string, not
- * including the terminating NULL character.
- *
- * @in     the string to measure
- * @return the string length
+ * @in:			utf-16 string
+ * ReturnValue:		number of non-zero words.
+ *			This is not the number of utf-16 letters!
  */
-size_t utf16_strlen(const uint16_t *in);
+size_t u16_strlen(const u16 *in);
 
 /**
- * utf16_strnlen() - Get the length of a fixed-size utf16 string.
- *
- * Returns the number of 16 bit characters in an utf16 string,
- * not including the terminating NULL character, but at most
- * 'count' number of characters.  In doing this, utf16_strnlen()
- * looks at only the first 'count' characters.
+ * u16_strlen - count non-zero words
  *
- * @in     the string to measure
- * @count  the maximum number of characters to count
- * @return the string length, up to a maximum of 'count'
+ * @in:			utf-16 string
+ * @count:		maximum number of words to count
+ * ReturnValue:		number of non-zero words.
+ *			This is not the number of utf-16 letters!
  */
-size_t utf16_strnlen(const uint16_t *in, size_t count);
+size_t u16_strnlen(const u16 *in, size_t count);
 
 /**
  * utf16_strcpy() - UTF16 equivalent of strcpy()
diff --git a/lib/charset.c b/lib/charset.c
index cd186a5a5a..8ff8d59957 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -12,14 +12,14 @@
  * utf8/utf16 conversion mostly lifted from grub
  */
 
-size_t utf16_strlen(const uint16_t *in)
+size_t u16_strlen(const u16 *in)
 {
 	size_t i;
 	for (i = 0; in[i]; i++);
 	return i;
 }
 
-size_t utf16_strnlen(const uint16_t *in, size_t count)
+size_t u16_strnlen(const u16 *in, size_t count)
 {
 	size_t i;
 	for (i = 0; count-- && in[i]; i++);
@@ -39,7 +39,11 @@ uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src)
 uint16_t *utf16_strdup(const uint16_t *s)
 {
 	uint16_t *new;
-	if (!s || !(new = malloc((utf16_strlen(s) + 1) * 2)))
+
+	if (!s)
+		return NULL;
+	new = malloc((u16_strlen(s) + 1) * 2);
+	if (!new)
 		return NULL;
 	utf16_strcpy(new, s);
 	return new;
diff --git a/lib/efi_loader/efi_bootmgr.c b/lib/efi_loader/efi_bootmgr.c
index 853358ab93..0c5764db12 100644
--- a/lib/efi_loader/efi_bootmgr.c
+++ b/lib/efi_loader/efi_bootmgr.c
@@ -60,7 +60,7 @@ static void parse_load_option(struct load_option *lo, void *ptr)
 	ptr += sizeof(u16);
 
 	lo->label = ptr;
-	ptr += (utf16_strlen(lo->label) + 1) * 2;
+	ptr += (u16_strlen(lo->label) + 1) * 2;
 
 	lo->file_path = ptr;
 	ptr += lo->file_path_length;
diff --git a/lib/efi_loader/efi_console.c b/lib/efi_loader/efi_console.c
index b487288785..f3d612880c 100644
--- a/lib/efi_loader/efi_console.c
+++ b/lib/efi_loader/efi_console.c
@@ -114,7 +114,7 @@ static efi_status_t EFIAPI efi_cout_output_string(
 
 	EFI_ENTRY("%p, %p", this, string);
 
-	unsigned int n16 = utf16_strlen(string);
+	unsigned int n16 = u16_strlen(string);
 	char buf[MAX_UTF8_PER_UTF16 * n16 + 1];
 	u16 *p;
 
diff --git a/lib/efi_loader/efi_file.c b/lib/efi_loader/efi_file.c
index e6a15bcb52..c21881b32c 100644
--- a/lib/efi_loader/efi_file.c
+++ b/lib/efi_loader/efi_file.c
@@ -139,7 +139,7 @@ static struct efi_file_handle *file_open(struct file_system *fs,
 
 	if (file_name) {
 		utf16_to_utf8((u8 *)f0, (u16 *)file_name, 1);
-		flen = utf16_strlen((u16 *)file_name);
+		flen = u16_strlen((u16 *)file_name);
 	}
 
 	/* we could have a parent, but also an absolute path: */
diff --git a/lib/efi_loader/efi_variable.c b/lib/efi_loader/efi_variable.c
index 90b637215e..770c67abb9 100644
--- a/lib/efi_loader/efi_variable.c
+++ b/lib/efi_loader/efi_variable.c
@@ -106,7 +106,7 @@ static efi_status_t efi_to_native(char *native, u16 *variable_name,
 {
 	size_t len;
 
-	len = utf16_strlen((u16 *)variable_name);
+	len = u16_strlen((u16 *)variable_name);
 	if (len >= MAX_VAR_NAME)
 		return EFI_DEVICE_ERROR;
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 6100357858..a07128ad96 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -280,7 +280,7 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
 		int precision, int flags)
 {
 	u16 *str = s ? s : L"<NULL>";
-	int utf16_len = utf16_strnlen(str, precision);
+	int utf16_len = u16_strnlen(str, precision);
 	u8 utf8[utf16_len * MAX_UTF8_PER_UTF16];
 	int utf8_len, i;
 
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 01/15] lib: build charset.o only if needed Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 17:59   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions Heinrich Schuchardt
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

utf8_get() - get next UTF-8 code point from buffer
utf8_put() - write UTF-8 code point to buffer
utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16
utf8_utf16_strncpy() - copy a utf-8 string to utf-16
utf16_get() - get next UTF-16 code point from buffer
utf16_put() - write UTF-16 code point to buffer
utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8
utf16_utf8_strncpy() - copy a utf-16 string to utf-8

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 include/charset.h | 130 +++++++++++++++++++++++++
 lib/Makefile      |   4 +-
 lib/charset.c     | 243 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 373 insertions(+), 4 deletions(-)

diff --git a/include/charset.h b/include/charset.h
index 2307559890..81e31d1b26 100644
--- a/include/charset.h
+++ b/include/charset.h
@@ -8,10 +8,140 @@
 #ifndef __CHARSET_H_
 #define __CHARSET_H_
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 
 #define MAX_UTF8_PER_UTF16 3
 
+/**
+ * utf8_get() - get next UTF-8 code point from buffer
+ *
+ * @src:		pointer to current byte, updated to point to next byte
+ * Return:		code point, or 0 for end of string, or -1 if no legal
+ *			code point is found. In case of an error src points to
+ *			the incorrect byte.
+ */
+s32 utf8_get(const char **src);
+
+/**
+ * utf8_put() - write UTF-8 code point to buffer
+ *
+ * @code:		code point
+ * @dst:		pointer to destination buffer, updated to next position
+ * Return:		-1 if the input parameters are invalid
+ */
+int utf8_put(s32 code, char **dst);
+
+/**
+ * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
+ *			  to utf-16
+ *
+ * @src:		utf-8 string
+ * @count:		maximum number of code points to convert
+ * Return:		length in bytes after conversion to utf-16 without the
+ *			trailing \0. If an invalid UTF-8 sequence is hit one
+ *			word will be reserved for a replacement character.
+ */
+size_t utf8_utf16_strnlen(const char *src, size_t count);
+
+/**
+ * utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16
+ *
+ * @src:		utf-8 string
+ * Return:		length in bytes after conversion to utf-16 without the
+ *			trailing \0. -1 if the utf-8 string is not valid.
+ */
+#define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX)
+
+/**
+ * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
+ *
+ * @dst:		destination buffer
+ * @src:		source buffer
+ * @count:		maximum number of code points to copy
+ * Return:		-1 if the input parameters are invalid
+ */
+int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count);
+
+/**
+ * utf8_utf16_strcpy() - copy utf-8 string to utf-16 string
+ *
+ * @dst:		destination buffer
+ * @src:		source buffer
+ * Return:		-1 if the input parameters are invalid
+ */
+#define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX)
+
+/**
+ * utf16_get() - get next UTF-16 code point from buffer
+ *
+ * @src:		pointer to current word, updated to point to next word
+ * Return:		code point, or 0 for end of string, or -1 if no legal
+ *			code point is found. In case of an error src points to
+ *			the incorrect word.
+ */
+s32 utf16_get(const u16 **src);
+
+/**
+ * utf16_put() - write UTF-16 code point to buffer
+ *
+ * @code:		code point
+ * @dst:		pointer to destination buffer, updated to next position
+ * Return:		-1 if the input parameters are invalid
+ */
+int utf16_put(s32 code, u16 **dst);
+
+/**
+ * utf16_strnlen() - length of a truncated utf-16 string
+ *
+ * @src:		utf-16 string
+ * @count:		maximum number of code points to convert
+ * Return:		length in code points. If an invalid UTF-16 sequence is
+ *			hit one position will be reserved for a replacement
+ *			character.
+ */
+size_t utf16_strnlen(const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
+ *			  to utf-8
+ *
+ * @src:		utf-16 string
+ * @count:		maximum number of code points to convert
+ * Return:		length in bytes after conversion to utf-8 without the
+ *			trailing \0. If an invalid UTF-16 sequence is hit one
+ *			byte will be reserved for a replacement character.
+ */
+size_t utf16_utf8_strnlen(const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8
+ *
+ * @src:		utf-16 string
+ * Return:		length in bytes after conversion to utf-8 without the
+ *			trailing \0. -1 if the utf-16 string is not valid.
+ */
+#define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX)
+
+/**
+ * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
+ *
+ * @dst:		destination buffer
+ * @src:		source buffer
+ * @count:		maximum number of code points to copy
+ * Return:		-1 if the input parameters are invalid
+ */
+int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strcpy() - copy utf-16 string to utf-8 string
+ *
+ * @dst:		destination buffer
+ * @src:		source buffer
+ * Return:		-1 if the input parameters are invalid
+ */
+#define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX)
+
 /**
  * u16_strlen - count non-zero words
  *
diff --git a/lib/Makefile b/lib/Makefile
index 2fd32798a0..f169644850 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -21,7 +21,9 @@ obj-$(CONFIG_OPTEE) += optee/
 obj-$(CONFIG_AES) += aes.o
 
 ifndef API_BUILD
-obj-$(CONFIG_EFI_LOADER) += charset.o
+ifneq ($(CONFIG_UT_UNICODE)$(CONFIG_EFI_LOADER),)
+obj-y += charset.o
+endif
 endif
 obj-$(CONFIG_USB_TTY) += circbuf.o
 obj-y += crc7.o
diff --git a/lib/charset.c b/lib/charset.c
index 8ff8d59957..0f4c6f26eb 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -8,9 +8,246 @@
 #include <charset.h>
 #include <malloc.h>
 
-/*
- * utf8/utf16 conversion mostly lifted from grub
- */
+s32 utf8_get(const char **src)
+{
+	s32 code = 0;
+	unsigned char c;
+
+	if (!src || !*src)
+		return -1;
+	if (!**src)
+		return 0;
+	c = **src;
+	if (c >= 0x80) {
+		/*
+		 * We do not expect a continuation byte (0x80 - 0xbf).
+		 * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
+		 * here.
+		 * The highest code point is 0x10ffff which is coded as
+		 * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
+		 */
+		if (c < 0xc2 || c > 0xf4)
+			return -1;
+		if (c >= 0xe0) {
+			if (c >= 0xf0) {
+				/* 0xf0 - 0xf4 */
+				c &= 0x07;
+				code = c << 18;
+				++*src;
+				c = **src;
+				if (c < 0x80 || c > 0xbf)
+					return -1;
+				c &= 0x3f;
+			} else {
+				/* 0xe0 - 0xef */
+				c &= 0x0f;
+			}
+			code += c << 12;
+			if ((code >= 0xD800 && code <= 0xDFFF) ||
+			    code >= 0x110000)
+				return -1;
+			++*src;
+			c = **src;
+			if (c < 0x80 || c > 0xbf)
+				return -1;
+		}
+		/* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
+		c &= 0x3f;
+		code += c << 6;
+		++*src;
+		c = **src;
+		if (c < 0x80 || c > 0xbf)
+			return -1;
+		c &= 0x3f;
+	}
+	code += c;
+	++*src;
+	return code;
+}
+
+int utf8_put(s32 code, char **dst)
+{
+	if (!dst || !*dst)
+		return -1;
+	if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
+		return -1;
+	if (code <= 0x007F) {
+		**dst = code;
+	} else {
+		if (code <= 0x07FF) {
+			**dst = code >> 6 | 0xC0;
+		} else {
+			if (code < 0x10000) {
+				**dst = code >> 12 | 0xE0;
+			} else {
+				**dst = code >> 18 | 0xF0;
+				++*dst;
+				**dst = (code >> 12 & 0x3F) | 0x80;
+			}
+			++*dst;
+			**dst = (code >> 6 & 0x3F) | 0x80;
+		}
+		++*dst;
+		**dst = (code & 0x3F) | 0x80;
+	}
+	++*dst;
+	return 0;
+}
+
+size_t utf8_utf16_strnlen(const char *src, size_t count)
+{
+	size_t len = 0;
+
+	for (; *src && count; --count)  {
+		s32 code = utf8_get(&src);
+
+		if (!code)
+			break;
+		if (code < 0) {
+			/* Reserve space for a replacement character */
+			len += 1;
+			if (*src)
+				++src;
+		} else if (code < 0x10000) {
+			len += 1;
+		} else {
+			len += 2;
+		}
+	}
+	return len;
+}
+
+int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
+{
+	if (!src || !dst || !*dst)
+		return -1;
+
+	for (; count && *src; --count) {
+		s32 code = utf8_get(&src);
+
+		if (code < 0) {
+			code = '?';
+			if (*src)
+				++src;
+		}
+		utf16_put(code, dst);
+	}
+	**dst = 0;
+	return 0;
+}
+
+s32 utf16_get(const u16 **src)
+{
+	s32 code, code2;
+
+	if (!src || !*src)
+		return -1;
+	if (!**src)
+		return 0;
+	code = **src;
+	++*src;
+	if (code >= 0xD800 && code <= 0xDBFF) {
+		if (!**src)
+			return -1;
+		code &= 0x3ff;
+		code <<= 10;
+		code += 0x10000;
+		code2 = **src;
+		if (code2 < 0xDC00 || code2 > 0xDFFF)
+			return -1;
+		++*src;
+		code2 &= 0x3ff;
+		code += code2;
+	}
+	return code;
+}
+
+int utf16_put(s32 code, u16 **dst)
+{
+	if (!dst || !*dst)
+		return -1;
+	if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
+		return -1;
+	if (code < 0x10000) {
+		**dst = code;
+	} else {
+		code -= 0x10000;
+		**dst = code >> 10 | 0xD800;
+		++*dst;
+		**dst = (code & 0x3ff) | 0xDC00;
+	}
+	++*dst;
+	return 0;
+}
+
+size_t utf16_strnlen(const u16 *src, size_t count)
+{
+	size_t len = 0;
+
+	for (; *src && count; --count)  {
+		s32 code = utf16_get(&src);
+
+		if (!code)
+			break;
+		if (code < 0) {
+			if (*src)
+				++src;
+		}
+		/*
+		 * In case of an illegal sequence still reserve space for a
+		 * replacement character.
+		 */
+		++len;
+	}
+	return len;
+}
+
+size_t utf16_utf8_strnlen(const u16 *src, size_t count)
+{
+	size_t len = 0;
+
+	for (; *src && count; --count)  {
+		s32 code = utf16_get(&src);
+
+		if (!code)
+			break;
+		if (code < 0) {
+			/* Reserve space for a replacement character */
+			len += 1;
+			if (*src)
+				++src;
+		} else if (code < 0x80) {
+			len += 1;
+		} else if (code < 0x800) {
+			len += 2;
+		} else if (code < 0x10000) {
+			len += 3;
+		} else {
+			len += 4;
+		}
+	}
+	return len;
+}
+
+int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
+{
+	if (!src || !dst || !*dst)
+		return -1;
+
+	for (; count && *src; --count) {
+		s32 code = utf16_get(&src);
+
+		if (code < 0) {
+			code = '?';
+			if (*src)
+				++src;
+		}
+		utf8_put(code, dst);
+	}
+	**dst = 0;
+	return 0;
+}
+
 
 size_t u16_strlen(const u16 *in)
 {
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (2 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:02   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings Heinrich Schuchardt
                   ` (10 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Provide unit tests for Unicode functions.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 MAINTAINERS           |   1 +
 include/test/suites.h |   3 +-
 test/Kconfig          |   8 +
 test/Makefile         |   1 +
 test/cmd_ut.c         |  14 +-
 test/unicode_ut.c     | 470 ++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 493 insertions(+), 4 deletions(-)
 create mode 100644 test/unicode_ut.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 51a1472cf8..a324139471 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -374,6 +374,7 @@ F:	include/asm-generic/pe.h
 F:	lib/charset.c
 F:	lib/efi*/
 F:	test/py/tests/test_efi*
+F:	test/unicode_ut.c
 F:	cmd/bootefi.c
 F:	tools/file2include.c
 
diff --git a/include/test/suites.h b/include/test/suites.h
index b5019a7cd2..8e4eac60eb 100644
--- a/include/test/suites.h
+++ b/include/test/suites.h
@@ -23,10 +23,11 @@ struct unit_test;
 int cmd_ut_category(const char *name, struct unit_test *tests, int n_ents,
 		    int argc, char * const argv[]);
 
+int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
 int do_ut_dm(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
 int do_ut_env(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
 int do_ut_overlay(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
 int do_ut_time(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
-int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
+int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
 
 #endif /* __TEST_SUITES_H__ */
diff --git a/test/Kconfig b/test/Kconfig
index 3643761bc6..de16d179d0 100644
--- a/test/Kconfig
+++ b/test/Kconfig
@@ -15,6 +15,14 @@ config UT_TIME
 	  problems. But if you are having problems with udelay() and the like,
 	  this is a good place to start.
 
+config UT_UNICODE
+	bool "Unit tests for Unicode functions"
+	depends on UNIT_TEST
+	default y
+	help
+	  Enables the 'ut unicode' command which tests that the functions for
+	  manipulating Unicode strings work correctly.
+
 source "test/dm/Kconfig"
 source "test/env/Kconfig"
 source "test/overlay/Kconfig"
diff --git a/test/Makefile b/test/Makefile
index 1092011fdb..a5f52fd5ad 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -8,4 +8,5 @@ obj-$(CONFIG_SANDBOX) += command_ut.o
 obj-$(CONFIG_SANDBOX) += compression.o
 obj-$(CONFIG_SANDBOX) += print_ut.o
 obj-$(CONFIG_UT_TIME) += time_ut.o
+obj-$(CONFIG_UT_UNICODE) += unicode_ut.o
 obj-$(CONFIG_$(SPL_)LOG) += log/
diff --git a/test/cmd_ut.c b/test/cmd_ut.c
index 934a5a931b..d6a2593850 100644
--- a/test/cmd_ut.c
+++ b/test/cmd_ut.c
@@ -49,6 +49,10 @@ static cmd_tbl_t cmd_ut_sub[] = {
 #ifdef CONFIG_UT_TIME
 	U_BOOT_CMD_MKENT(time, CONFIG_SYS_MAXARGS, 1, do_ut_time, "", ""),
 #endif
+#if defined(CONFIG_UT_UNICODE) && \
+	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
+	U_BOOT_CMD_MKENT(unicode, CONFIG_SYS_MAXARGS, 1, do_ut_unicode, "", ""),
+#endif
 #ifdef CONFIG_SANDBOX
 	U_BOOT_CMD_MKENT(compression, CONFIG_SYS_MAXARGS, 1, do_ut_compression,
 			 "", ""),
@@ -93,6 +97,9 @@ static int do_ut(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 #ifdef CONFIG_SYS_LONGHELP
 static char ut_help_text[] =
 	"all - execute all enabled tests\n"
+#ifdef CONFIG_SANDBOX
+	"ut compression - Test compressors and bootm decompression\n"
+#endif
 #ifdef CONFIG_UT_DM
 	"ut dm [test-name]\n"
 #endif
@@ -105,11 +112,12 @@ static char ut_help_text[] =
 #ifdef CONFIG_UT_TIME
 	"ut time - Very basic test of time functions\n"
 #endif
-#ifdef CONFIG_SANDBOX
-	"ut compression - Test compressors and bootm decompression\n"
+#if defined(CONFIG_UT_UNICODE) && \
+	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
+	"ut unicode - test Unicode functions\n"
 #endif
 	;
-#endif
+#endif /* CONFIG_SYS_LONGHELP */
 
 U_BOOT_CMD(
 	ut, CONFIG_SYS_MAXARGS, 1, do_ut,
diff --git a/test/unicode_ut.c b/test/unicode_ut.c
new file mode 100644
index 0000000000..29316606c4
--- /dev/null
+++ b/test/unicode_ut.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Unit tests for Unicode functions
+ *
+ * Copyright (c) 2018 Heinrich Schuchardt <xypron.glpk@gmx.de>
+ */
+
+#include <common.h>
+#include <charset.h>
+#include <command.h>
+#include <errno.h>
+
+/* Constants c1-c4 and d1-d4 encode the same letters */
+
+/* Six characters translating to one utf-8 byte each. */
+static const u16 c1[] = {0x55, 0x2d, 0x42, 0x6f, 0x6f, 0x74, 0x00};
+/* One character translating to two utf-8 bytes */
+static const u16 c2[] = {0x6b, 0x61, 0x66, 0x62, 0xe1, 0x74, 0x75, 0x72, 0x00};
+/* Three characters translating to three utf-8 bytes each */
+static const u16 c3[] = {0x6f5c, 0x6c34, 0x8266, 0x00};
+/* Three letters translating to four utf-8 bytes each */
+static const u16 c4[] = {0xd801, 0xdc8d, 0xd801, 0xdc96, 0xd801, 0xdc87,
+			 0x0000};
+
+/* Six characters translating to one utf-16 word each. */
+static const char d1[] = {0x55, 0x2d, 0x42, 0x6f, 0x6f, 0x74, 0x00};
+/* Eight characters translating to one utf-16 word each */
+static const char d2[] = {0x6b, 0x61, 0x66, 0x62, 0xc3, 0xa1, 0x74, 0x75,
+			  0x72, 0x00};
+/* Three characters translating to one utf-16 word each */
+static const char d3[] = {0xe6, 0xbd, 0x9c, 0xe6, 0xb0, 0xb4, 0xe8, 0x89,
+			  0xa6, 0x00};
+/* Three letters translating to two utf-16 word each */
+static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
+			  0xf0, 0x90, 0x92, 0x87, 0x00};
+
+static int ut_utf8_get(void)
+{
+	const char *s;
+	s32 code;
+	int i;
+
+	/* Check characters less than 0x800 */
+	s = d2;
+	for (i = 0; i < 8; ++i) {
+		code = utf8_get((const char **)&s);
+		/* d2 is the utf-8 encoding of c2 */
+		if (code != c2[i])
+			return -1;
+		if (!code)
+			break;
+	}
+	if (s != d2 + 9)
+		return -1;
+
+	/* Check characters less than 0x10000 */
+	s = d3;
+	for (i = 0; i < 4; ++i) {
+		code = utf8_get((const char **)&s);
+		/* d3 is the utf-8 encoding of c3 */
+		if (code != c3[i])
+			return -1;
+		if (!code)
+			break;
+	}
+	if (s != d3 + 9)
+		return -1;
+
+	/* Check a character greater than 0xffff */
+	s = d4;
+	code = utf8_get((const char **)&s);
+	if (code != 0x0001048d)
+		return -1;
+	if (s != d4 + 4)
+		return -1;
+
+	return 0;
+}
+
+static int ut_utf8_put(void)
+{
+	char buffer[8] = { 0, };
+	char *pos;
+
+	/* Commercial at, translates to one byte */
+	pos = buffer;
+	if (utf8_put('@', &pos))
+		return -1;
+	if (pos - buffer != 1)
+		return -1;
+	if (buffer[0] != '@' || buffer[1])
+		return -1;
+
+	/* Latin letter G with acute, translates to two bytes */
+	pos = buffer;
+	if (utf8_put(0x1f4, &pos))
+		return -1;
+	if (pos - buffer != 2)
+		return -1;
+	if (buffer[0] != (char)0xc7 || buffer[1] != (char)0xb4 || buffer[2])
+		return -1;
+
+	/* Tagalog letter i, translates to three bytes */
+	pos = buffer;
+	if (utf8_put(0x1701, &pos))
+		return -1;
+	if (pos - buffer != 3)
+		return -1;
+	if (buffer[0] != (char)0xe1 || buffer[1] != (char)0x9c ||
+	    buffer[2] != (char)0x81 || buffer[3])
+		return -1;
+
+	/* Hamster face, translates to four bytes */
+	pos = buffer;
+	if (utf8_put(0x1f439, &pos))
+		return -1;
+	if (pos - buffer != 4)
+		return -1;
+	if (buffer[0] != (char)0xf0 || buffer[1] != (char)0x9f ||
+	    buffer[2] != (char)0x90 || buffer[3] != (char)0xb9 || buffer[4])
+		return -1;
+
+	/* Illegal code (surrogate), must be rejected */
+	pos = buffer;
+	if (utf8_put(0xd888, &pos) != -1)
+		return -1;
+
+	return 0;
+}
+
+int ut_utf8_utf16_strlen(void)
+{
+	if (utf8_utf16_strlen(d1) != 6)
+		return 1;
+	if (utf8_utf16_strlen(d2) != 8)
+		return 1;
+	if (utf8_utf16_strlen(d3) != 3)
+		return 1;
+	if (utf8_utf16_strlen(d4) != 6)
+		return 1;
+	return 0;
+}
+
+int ut_utf8_utf16_strnlen(void)
+{
+	if (utf8_utf16_strnlen(d1, 3) != 3)
+		return 1;
+	if (utf8_utf16_strnlen(d1, 13) != 6)
+		return 1;
+	if (utf8_utf16_strnlen(d2, 6) != 6)
+		return 1;
+	if (utf8_utf16_strnlen(d3, 2) != 2)
+		return 1;
+	if (utf8_utf16_strnlen(d4, 2) != 4)
+		return 1;
+	if (utf8_utf16_strnlen(d4, 3) != 6)
+		return 1;
+	return 0;
+}
+
+int ut_u16_strcmp(const u16 *a1, const u16 *a2, size_t count)
+{
+	for (; (*a1 || *a2) && count; ++a1, ++a2, --count) {
+		if (*a1 < *a2)
+			return -1;
+		if (*a1 > *a2)
+			return 1;
+	}
+	return 0;
+}
+
+int ut_utf8_utf16_strcpy(void)
+{
+	u16 buf[16];
+	u16 *pos;
+
+	pos = buf;
+	utf8_utf16_strcpy(&pos, d1);
+	if (pos - buf != 6)
+		return 1;
+	if (ut_u16_strcmp(buf, c1, 16))
+		return 1;
+
+	pos = buf;
+	utf8_utf16_strcpy(&pos, d2);
+	if (pos - buf != 8)
+		return 1;
+	if (ut_u16_strcmp(buf, c2, 16))
+		return 1;
+
+	pos = buf;
+	utf8_utf16_strcpy(&pos, d3);
+	if (pos - buf != 3)
+		return 1;
+	if (ut_u16_strcmp(buf, c3, 16))
+		return 1;
+
+	pos = buf;
+	utf8_utf16_strcpy(&pos, d4);
+	if (pos - buf != 6)
+		return 1;
+	if (ut_u16_strcmp(buf, c4, 16))
+		return 1;
+
+	return 0;
+}
+
+int ut_utf8_utf16_strncpy(void)
+{
+	u16 buf[16];
+	u16 *pos;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf8_utf16_strncpy(&pos, d1, 4);
+	if (pos - buf != 4)
+		return 1;
+	if (buf[4])
+		return 1;
+	if (ut_u16_strcmp(buf, c1, 4))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf8_utf16_strncpy(&pos, d2, 10);
+	if (pos - buf != 8)
+		return 1;
+	if (!buf[4])
+		return 1;
+	if (ut_u16_strcmp(buf, c2, SIZE_MAX))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf8_utf16_strncpy(&pos, d3, 2);
+	if (pos - buf != 2)
+		return 1;
+	if (buf[2])
+		return 1;
+	if (ut_u16_strcmp(buf, c3, 2))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf8_utf16_strncpy(&pos, d4, 2);
+	if (pos - buf != 4)
+		return 1;
+	if (buf[4])
+		return 1;
+	if (ut_u16_strcmp(buf, c4, 4))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf8_utf16_strncpy(&pos, d4, 10);
+	if (pos - buf != 6)
+		return 1;
+	if (!buf[5])
+		return 1;
+	if (ut_u16_strcmp(buf, c4, SIZE_MAX))
+		return 1;
+
+	return 0;
+}
+
+static int ut_utf16_get(void)
+{
+	const u16 *s;
+	s32 code;
+	int i;
+
+	/* Check characters less than 0x10000 */
+	s = c2;
+	for (i = 0; i < 9; ++i) {
+		code = utf16_get((const u16 **)&s);
+		if (code != c2[i])
+			return -1;
+		if (!code)
+			break;
+	}
+	if (s != c2 + 8)
+		return -1;
+
+	/* Check character greater 0xffff */
+	s = c4;
+	code = utf16_get((const u16 **)&s);
+	if (code != 0x0001048d)
+		return -1;
+	if (s != c4 + 2)
+		return -1;
+
+	return 0;
+}
+
+static int ut_utf16_put(void)
+{
+	u16 buffer[4] = { 0, };
+	u16 *pos;
+
+	/* Commercial at, translates to one word */
+	pos = buffer;
+	if (utf16_put('@', &pos))
+		return -1;
+	if (pos - buffer != 1)
+		return -1;
+	if (buffer[0] != (u16)'@' || buffer[1])
+		return -1;
+
+	/* Hamster face, translates to two words */
+	pos = buffer;
+	if (utf16_put(0x1f439, &pos))
+		return -1;
+	if (pos - buffer != 2)
+		return -1;
+	if (buffer[0] != (u16)0xd83d || buffer[1] != (u16)0xdc39 || buffer[2])
+		return -1;
+
+	/* Illegal code */
+	pos = buffer;
+	if (utf16_put(0xd888, &pos) != -1)
+		return -1;
+
+	return 0;
+}
+
+int ut_utf16_utf8_strlen(void)
+{
+	if (utf16_utf8_strlen(c1) != 6)
+		return 1;
+	if (utf16_utf8_strlen(c2) != 9)
+		return 1;
+	if (utf16_utf8_strlen(c3) != 9)
+		return 1;
+	if (utf16_utf8_strlen(c4) != 12)
+		return 1;
+	return 0;
+}
+
+int ut_utf16_utf8_strnlen(void)
+{
+	if (utf16_utf8_strnlen(c1, 3) != 3)
+		return 1;
+	if (utf16_utf8_strnlen(c1, 13) != 6)
+		return 1;
+	if (utf16_utf8_strnlen(c2, 6) != 7)
+		return 1;
+	if (utf16_utf8_strnlen(c3, 2) != 6)
+		return 1;
+	if (utf16_utf8_strnlen(c4, 2) != 8)
+		return 1;
+	if (utf16_utf8_strnlen(c4, 3) != 12)
+		return 1;
+	return 0;
+}
+
+int ut_utf16_utf8_strcpy(void)
+{
+	char buf[16];
+	char *pos;
+
+	pos = buf;
+	utf16_utf8_strcpy(&pos, c1);
+	if (pos - buf != 6)
+		return 1;
+	if (strcmp(buf, d1))
+		return 1;
+
+	pos = buf;
+	utf16_utf8_strcpy(&pos, c2);
+	if (pos - buf != 9)
+		return 1;
+	if (strcmp(buf, d2))
+		return 1;
+
+	pos = buf;
+	utf16_utf8_strcpy(&pos, c3);
+	if (pos - buf != 9)
+		return 1;
+	if (strcmp(buf, d3))
+		return 1;
+
+	pos = buf;
+	utf16_utf8_strcpy(&pos, c4);
+	if (pos - buf != 12)
+		return 1;
+	if (strcmp(buf, d4))
+		return 1;
+
+	return 0;
+}
+
+int ut_utf16_utf8_strncpy(void)
+{
+	char buf[16];
+	char *pos;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf16_utf8_strncpy(&pos, c1, 4);
+	if (pos - buf != 4)
+		return 1;
+	if (buf[4])
+		return 1;
+	if (strncmp(buf, d1, 4))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf16_utf8_strncpy(&pos, c2, 10);
+	if (pos - buf != 9)
+		return 1;
+	if (!buf[4])
+		return 1;
+	if (strncmp(buf, d2, SIZE_MAX))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf16_utf8_strncpy(&pos, c3, 2);
+	if (pos - buf != 6)
+		return 1;
+	if (buf[6])
+		return 1;
+	if (strncmp(buf, d3, 6))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf16_utf8_strncpy(&pos, c4, 2);
+	if (pos - buf != 8)
+		return 1;
+	if (buf[8])
+		return 1;
+	if (strncmp(buf, d4, 8))
+		return 1;
+
+	pos = buf;
+	memset(buf, 0, sizeof(buf));
+	utf16_utf8_strncpy(&pos, c4, 10);
+	if (pos - buf != 12)
+		return 1;
+	if (!buf[5])
+		return 1;
+	if (strncmp(buf, d4, SIZE_MAX))
+		return 1;
+
+	return 0;
+}
+
+int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
+{
+	int ret = 0;
+
+	ret |= ut_utf8_get();
+	ret |= ut_utf8_put();
+	ret |= ut_utf8_utf16_strlen();
+	ret |= ut_utf8_utf16_strnlen();
+	ret |= ut_utf8_utf16_strcpy();
+	ret |= ut_utf8_utf16_strncpy();
+	ret |= ut_utf16_get();
+	ret |= ut_utf16_put();
+	ret |= ut_utf16_utf8_strlen();
+	ret |= ut_utf16_utf8_strnlen();
+	ret |= ut_utf16_utf8_strcpy();
+	ret |= ut_utf16_utf8_strncpy();
+
+	printf("Test %s\n", ret ? "failed" : "passed");
+
+	return ret ? CMD_RET_FAILURE : CMD_RET_SUCCESS;
+}
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (3 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:05   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 06/15] test: test printing Unicode Heinrich Schuchardt
                   ` (9 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The width and precision of the printf() function refer to the number of
characters, not to the number of bytes printed.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/vsprintf.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index a07128ad96..b7eb9d5f5e 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -280,18 +280,22 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
 		int precision, int flags)
 {
 	u16 *str = s ? s : L"<NULL>";
-	int utf16_len = u16_strnlen(str, precision);
-	u8 utf8[utf16_len * MAX_UTF8_PER_UTF16];
-	int utf8_len, i;
-
-	utf8_len = utf16_to_utf8(utf8, str, utf16_len) - utf8;
+	ssize_t i, len = utf16_strnlen(str, precision);
 
 	if (!(flags & LEFT))
-		while (utf8_len < field_width--)
+		for (; len < field_width; --field_width)
 			ADDCH(buf, ' ');
-	for (i = 0; i < utf8_len; ++i)
-		ADDCH(buf, utf8[i]);
-	while (utf8_len < field_width--)
+	for (i = 0; i < len; ++i) {
+		s32 code = utf16_get((const u16 **)&str);
+
+		if (code < 0) {
+			code = '?';
+			if (*str)
+				++str;
+		}
+		utf8_put(code, &buf);
+	}
+	for (; i < field_width; --field_width)
 		ADDCH(buf, ' ');
 	return buf;
 }
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 06/15] test: test printing Unicode
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (4 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:06   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length Heinrich Schuchardt
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Test printing of Unicode strings

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 test/unicode_ut.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/test/unicode_ut.c b/test/unicode_ut.c
index 29316606c4..8e8c4d189e 100644
--- a/test/unicode_ut.c
+++ b/test/unicode_ut.c
@@ -34,6 +34,42 @@ static const char d3[] = {0xe6, 0xbd, 0x9c, 0xe6, 0xb0, 0xb4, 0xe8, 0x89,
 static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
 			  0xf0, 0x90, 0x92, 0x87, 0x00};
 
+static int ut_string16(void)
+{
+#if defined(CONFIG_EFI_LOADER) && \
+	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
+	char buf[20];
+
+	memset(buf, 0xff, sizeof(buf));
+	sprintf(buf, "%8.6ls", c2);
+	if (buf[1] != ' ')
+		return -1;
+	if (strncmp(&buf[2], d2, 7))
+		return -1;
+	if (buf[9])
+		return -1;
+
+	memset(buf, 0xff, sizeof(buf));
+	sprintf(buf, "%8.6ls", c4);
+	if (buf[4] != ' ')
+		return -1;
+	if (strncmp(&buf[5], d4, 12))
+		return -1;
+	if (buf[17])
+		return -1;
+
+	memset(buf, 0xff, sizeof(buf));
+	sprintf(buf, "%-8.2ls", c4);
+	if (strncmp(buf, d4, 8))
+		return -1;
+	if (buf[8] != ' ')
+		return -1;
+	if (buf[14])
+		return -1;
+#endif
+	return 0;
+}
+
 static int ut_utf8_get(void)
 {
 	const char *s;
@@ -451,6 +487,7 @@ int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 {
 	int ret = 0;
 
+	ret |= ut_string16();
 	ret |= ut_utf8_get();
 	ret |= ut_utf8_put();
 	ret |= ut_utf8_utf16_strlen();
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (5 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 06/15] test: test printing Unicode Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:13   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer Heinrich Schuchardt
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The EFI spec does not provide a length limit for variable names.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/efi_loader/efi_variable.c | 52 ++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/lib/efi_loader/efi_variable.c b/lib/efi_loader/efi_variable.c
index 770c67abb9..495738884b 100644
--- a/lib/efi_loader/efi_variable.c
+++ b/lib/efi_loader/efi_variable.c
@@ -44,10 +44,7 @@
  * converted to utf16?
  */
 
-#define MAX_VAR_NAME 31
-#define MAX_NATIVE_VAR_NAME \
-	(strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_") + \
-		(MAX_VAR_NAME * MAX_UTF8_PER_UTF16))
+#define PREFIX_LEN (strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_"))
 
 static int hex(int ch)
 {
@@ -101,18 +98,20 @@ static char *mem2hex(char *hexstr, const u8 *mem, int count)
 	return hexstr;
 }
 
-static efi_status_t efi_to_native(char *native, u16 *variable_name,
+static efi_status_t efi_to_native(char **native, const u16 *variable_name,
 				  efi_guid_t *vendor)
 {
 	size_t len;
+	char *pos;
 
-	len = u16_strlen((u16 *)variable_name);
-	if (len >= MAX_VAR_NAME)
-		return EFI_DEVICE_ERROR;
+	len = PREFIX_LEN + utf16_utf8_strlen(variable_name) + 1;
+	*native = malloc(len);
+	if (!*native)
+		return EFI_OUT_OF_RESOURCES;
 
-	native += sprintf(native, "efi_%pUl_", vendor);
-	native  = (char *)utf16_to_utf8((u8 *)native, (u16 *)variable_name, len);
-	*native = '\0';
+	pos = *native;
+	pos += sprintf(pos, "efi_%pUl_", vendor);
+	utf16_utf8_strcpy(&pos, variable_name);
 
 	return EFI_SUCCESS;
 }
@@ -168,7 +167,7 @@ efi_status_t EFIAPI efi_get_variable(u16 *variable_name, efi_guid_t *vendor,
 				     u32 *attributes, efi_uintn_t *data_size,
 				     void *data)
 {
-	char native_name[MAX_NATIVE_VAR_NAME + 1];
+	char *native_name;
 	efi_status_t ret;
 	unsigned long in_size;
 	const char *val, *s;
@@ -180,13 +179,14 @@ efi_status_t EFIAPI efi_get_variable(u16 *variable_name, efi_guid_t *vendor,
 	if (!variable_name || !vendor || !data_size)
 		return EFI_EXIT(EFI_INVALID_PARAMETER);
 
-	ret = efi_to_native(native_name, variable_name, vendor);
+	ret = efi_to_native(&native_name, variable_name, vendor);
 	if (ret)
 		return EFI_EXIT(ret);
 
 	debug("%s: get '%s'\n", __func__, native_name);
 
 	val = env_get(native_name);
+	free(native_name);
 	if (!val)
 		return EFI_EXIT(EFI_NOT_FOUND);
 
@@ -256,35 +256,41 @@ efi_status_t EFIAPI efi_set_variable(u16 *variable_name, efi_guid_t *vendor,
 				     u32 attributes, efi_uintn_t data_size,
 				     void *data)
 {
-	char native_name[MAX_NATIVE_VAR_NAME + 1];
+	char *native_name = NULL, *val = NULL, *s;
 	efi_status_t ret = EFI_SUCCESS;
-	char *val, *s;
 	u32 attr;
 
 	EFI_ENTRY("\"%ls\" %pUl %x %zu %p", variable_name, vendor, attributes,
 		  data_size, data);
 
-	if (!variable_name || !vendor)
-		return EFI_EXIT(EFI_INVALID_PARAMETER);
+	if (!variable_name || !vendor) {
+		ret = EFI_INVALID_PARAMETER;
+		goto out;
+	}
 
-	ret = efi_to_native(native_name, variable_name, vendor);
+	ret = efi_to_native(&native_name, variable_name, vendor);
 	if (ret)
-		return EFI_EXIT(ret);
+		goto out;
 
 #define ACCESS_ATTR (EFI_VARIABLE_RUNTIME_ACCESS | EFI_VARIABLE_BOOTSERVICE_ACCESS)
 
 	if ((data_size == 0) || !(attributes & ACCESS_ATTR)) {
 		/* delete the variable: */
 		env_set(native_name, NULL);
-		return EFI_EXIT(EFI_SUCCESS);
+		ret = EFI_SUCCESS;
+		goto out;
 	}
 
 	val = env_get(native_name);
 	if (val) {
 		parse_attr(val, &attr);
 
-		if (attr & READ_ONLY)
-			return EFI_EXIT(EFI_WRITE_PROTECTED);
+		if (attr & READ_ONLY) {
+			/* We should not free val */
+			val = NULL;
+			ret = EFI_WRITE_PROTECTED;
+			goto out;
+		}
 	}
 
 	val = malloc(2 * data_size + strlen("{ro,run,boot}(blob)") + 1);
@@ -320,6 +326,8 @@ efi_status_t EFIAPI efi_set_variable(u16 *variable_name, efi_guid_t *vendor,
 	if (env_set(native_name, val))
 		ret = EFI_DEVICE_ERROR;
 
+out:
+	free(native_name);
 	free(val);
 
 	return EFI_EXIT(ret);
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (6 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:16   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options Heinrich Schuchardt
                   ` (6 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The length of a string printed to the console by the
EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL is not limited by the UEFI spec.
Hence we should not allocate a buffer for it on the stack.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/efi_loader/efi_console.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/lib/efi_loader/efi_console.c b/lib/efi_loader/efi_console.c
index f3d612880c..f5f3f256dd 100644
--- a/lib/efi_loader/efi_console.c
+++ b/lib/efi_loader/efi_console.c
@@ -111,16 +111,21 @@ static efi_status_t EFIAPI efi_cout_output_string(
 {
 	struct simple_text_output_mode *con = &efi_con_mode;
 	struct cout_mode *mode = &efi_cout_modes[con->mode];
-
-	EFI_ENTRY("%p, %p", this, string);
-
-	unsigned int n16 = u16_strlen(string);
-	char buf[MAX_UTF8_PER_UTF16 * n16 + 1];
+	char *buf, *pos;
 	u16 *p;
+	efi_status_t ret = EFI_SUCCESS;
 
-	*utf16_to_utf8((u8 *)buf, string, n16) = '\0';
+	EFI_ENTRY("%p, %p", this, string);
 
+	buf = malloc(utf16_utf8_strlen(string) + 1);
+	if (!buf) {
+		ret = EFI_OUT_OF_RESOURCES;
+		goto out;
+	}
+	pos = buf;
+	utf16_utf8_strcpy(&pos, string);
 	fputs(stdout, buf);
+	free(buf);
 
 	/*
 	 * Update the cursor position.
@@ -158,7 +163,8 @@ static efi_status_t EFIAPI efi_cout_output_string(
 		con->cursor_row = min(con->cursor_row, (s32)mode->rows - 1);
 	}
 
-	return EFI_EXIT(EFI_SUCCESS);
+out:
+	return EFI_EXIT(ret);
 }
 
 static efi_status_t EFIAPI efi_cout_test_string(
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (7 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:17   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 10/15] lib: charset: remove obsolete functions Heinrich Schuchardt
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The number of bytes in a UTF-8 string is an upper limit for the number of
words in the equivalent UTF-16 string. In this respect the incumbent code
works correctly. For non-ASCII characters the UTF-16 string is shorter.
With this patch only the necessary buffer size is allocated for the load
options.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 cmd/bootefi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cmd/bootefi.c b/cmd/bootefi.c
index b60c151fb4..708113b16e 100644
--- a/cmd/bootefi.c
+++ b/cmd/bootefi.c
@@ -116,18 +116,20 @@ static void set_load_options(struct efi_loaded_image *loaded_image_info,
 {
 	size_t size;
 	const char *env = env_get(env_var);
+	u16 *pos;
 
 	loaded_image_info->load_options = NULL;
 	loaded_image_info->load_options_size = 0;
 	if (!env)
 		return;
-	size = strlen(env) + 1;
+	size = utf8_utf16_strlen(env) + 1;
 	loaded_image_info->load_options = calloc(size, sizeof(u16));
 	if (!loaded_image_info->load_options) {
 		printf("ERROR: Out of memory\n");
 		return;
 	}
-	utf8_to_utf16(loaded_image_info->load_options, (u8 *)env, size);
+	pos = loaded_image_info->load_options;
+	utf8_utf16_strcpy(&pos, env);
 	loaded_image_info->load_options_size = size * 2;
 }
 
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 10/15] lib: charset: remove obsolete functions
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (8 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 11/15] efi_loader: capitalization table Heinrich Schuchardt
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Remove functions:
- utf8_to_utf16()
- utf16_strcpy()
- utf16_strdup()

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 include/charset.h | 23 --------------
 lib/charset.c     | 79 -----------------------------------------------
 2 files changed, 102 deletions(-)

diff --git a/include/charset.h b/include/charset.h
index 81e31d1b26..1c0976dde6 100644
--- a/include/charset.h
+++ b/include/charset.h
@@ -161,16 +161,6 @@ size_t u16_strlen(const u16 *in);
  */
 size_t u16_strnlen(const u16 *in, size_t count);
 
-/**
- * utf16_strcpy() - UTF16 equivalent of strcpy()
- */
-uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src);
-
-/**
- * utf16_strdup() - UTF16 equivalent of strdup()
- */
-uint16_t *utf16_strdup(const uint16_t *s);
-
 /**
  * utf16_to_utf8() - Convert an utf16 string to utf8
  *
@@ -187,17 +177,4 @@ uint16_t *utf16_strdup(const uint16_t *s);
  */
 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size);
 
-/**
- * utf8_to_utf16() - Convert an utf8 string to utf16
- *
- * Converts up to 'size' characters of the utf16 string 'src' to utf8
- * written to the 'dest' buffer. Stops at 0x00.
- *
- * @dest   the destination buffer to write the utf8 characters
- * @src    the source utf16 string
- * @size   maximum number of utf16 characters to convert
- * @return the pointer to the first unwritten byte in 'dest'
- */
-uint16_t *utf8_to_utf16(uint16_t *dest, const uint8_t *src, size_t size);
-
 #endif /* __CHARSET_H_ */
diff --git a/lib/charset.c b/lib/charset.c
index 0f4c6f26eb..d2c723be3c 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -263,29 +263,6 @@ size_t u16_strnlen(const u16 *in, size_t count)
 	return i;
 }
 
-uint16_t *utf16_strcpy(uint16_t *dest, const uint16_t *src)
-{
-	uint16_t *tmp = dest;
-
-	while ((*dest++ = *src++) != '\0')
-		/* nothing */;
-	return tmp;
-
-}
-
-uint16_t *utf16_strdup(const uint16_t *s)
-{
-	uint16_t *new;
-
-	if (!s)
-		return NULL;
-	new = malloc((u16_strlen(s) + 1) * 2);
-	if (!new)
-		return NULL;
-	utf16_strcpy(new, s);
-	return new;
-}
-
 /* Convert UTF-16 to UTF-8.  */
 uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
 {
@@ -338,59 +315,3 @@ uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
 
 	return dest;
 }
-
-uint16_t *utf8_to_utf16(uint16_t *dest, const uint8_t *src, size_t size)
-{
-	while (size--) {
-		int extension_bytes;
-		uint32_t code;
-
-		extension_bytes = 0;
-		if (*src <= 0x7f) {
-			code = *src++;
-			/* Exit on zero byte */
-			if (!code)
-				size = 0;
-		} else if (*src <= 0xbf) {
-			/* Illegal code */
-			code = '?';
-		} else if (*src <= 0xdf) {
-			code = *src++ & 0x1f;
-			extension_bytes = 1;
-		} else if (*src <= 0xef) {
-			code = *src++ & 0x0f;
-			extension_bytes = 2;
-		} else if (*src <= 0xf7) {
-			code = *src++ & 0x07;
-			extension_bytes = 3;
-		} else {
-			/* Illegal code */
-			code = '?';
-		}
-
-		for (; extension_bytes && size; --size, --extension_bytes) {
-			if ((*src & 0xc0) == 0x80) {
-				code <<= 6;
-				code |= *src++ & 0x3f;
-			} else {
-				/* Illegal code */
-				code = '?';
-				++src;
-				--size;
-				break;
-			}
-		}
-
-		if (code < 0x10000) {
-			*dest++ = code;
-		} else {
-			/*
-			 * Simplified expression for
-			 * (((code - 0x10000) >> 10) & 0x3ff) | 0xd800
-			 */
-			*dest++ = (code >> 10) + 0xd7c0;
-			*dest++ = (code & 0x3ff) | 0xdc00;
-		}
-	}
-	return dest;
-}
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (9 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 10/15] lib: charset: remove obsolete functions Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:22   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 12/15] lib: charset: upper/lower case conversion Heinrich Schuchardt
                   ` (3 subsequent siblings)
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

This patch provides a define to initialize a table that maps lower case to
capital letters for the Unicode code points 0x0000 - 0xffff.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 MAINTAINERS              |    1 +
 include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 1910 insertions(+)
 create mode 100644 include/capitalization.h

diff --git a/MAINTAINERS b/MAINTAINERS
index a324139471..0a543309f2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -368,6 +368,7 @@ F:	doc/DocBook/efi.tmpl
 F:	doc/README.uefi
 F:	doc/README.iscsi
 F:	Documentation/efi.rst
+F:	include/capitalization.h
 F:	include/efi*
 F:	include/pe.h
 F:	include/asm-generic/pe.h
diff --git a/include/capitalization.h b/include/capitalization.h
new file mode 100644
index 0000000000..50d5108f98
--- /dev/null
+++ b/include/capitalization.h
@@ -0,0 +1,1909 @@
+/* SPDX-License-Identifier: Unicode-DFS-2016 */
+/*
+ * Correspondence table for small and capital Unicode letters in the range of
+ * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
+ */
+
+struct capitalization_table {	/* one capital/small letter pair */
+	u16 upper;	/* code point of the capital letter */
+	u16 lower;	/* code point of the corresponding small letter */
+};
+
+#define UNICODE_CAPITALIZATION_TABLE { \
+	{ 0x0531, /* ARMENIAN CAPITAL LETTER AYB */ \
+	  0x0561, /* ARMENIAN SMALL LETTER AYB */ }, \
+	{ 0x0532, /* ARMENIAN CAPITAL LETTER BEN */ \
+	  0x0562, /* ARMENIAN SMALL LETTER BEN */ }, \
+	{ 0x053E, /* ARMENIAN CAPITAL LETTER CA */ \
+	  0x056E, /* ARMENIAN SMALL LETTER CA */ }, \
+	{ 0x0549, /* ARMENIAN CAPITAL LETTER CHA */ \
+	  0x0579, /* ARMENIAN SMALL LETTER CHA */ }, \
+	{ 0x0543, /* ARMENIAN CAPITAL LETTER CHEH */ \
+	  0x0573, /* ARMENIAN SMALL LETTER CHEH */ }, \
+	{ 0x0551, /* ARMENIAN CAPITAL LETTER CO */ \
+	  0x0581, /* ARMENIAN SMALL LETTER CO */ }, \
+	{ 0x0534, /* ARMENIAN CAPITAL LETTER DA */ \
+	  0x0564, /* ARMENIAN SMALL LETTER DA */ }, \
+	{ 0x0535, /* ARMENIAN CAPITAL LETTER ECH */ \
+	  0x0565, /* ARMENIAN SMALL LETTER ECH */ }, \
+	{ 0x0537, /* ARMENIAN CAPITAL LETTER EH */ \
+	  0x0567, /* ARMENIAN SMALL LETTER EH */ }, \
+	{ 0x0538, /* ARMENIAN CAPITAL LETTER ET */ \
+	  0x0568, /* ARMENIAN SMALL LETTER ET */ }, \
+	{ 0x0556, /* ARMENIAN CAPITAL LETTER FEH */ \
+	  0x0586, /* ARMENIAN SMALL LETTER FEH */ }, \
+	{ 0x0542, /* ARMENIAN CAPITAL LETTER GHAD */ \
+	  0x0572, /* ARMENIAN SMALL LETTER GHAD */ }, \
+	{ 0x0533, /* ARMENIAN CAPITAL LETTER GIM */ \
+	  0x0563, /* ARMENIAN SMALL LETTER GIM */ }, \
+	{ 0x0540, /* ARMENIAN CAPITAL LETTER HO */ \
+	  0x0570, /* ARMENIAN SMALL LETTER HO */ }, \
+	{ 0x053B, /* ARMENIAN CAPITAL LETTER INI */ \
+	  0x056B, /* ARMENIAN SMALL LETTER INI */ }, \
+	{ 0x0541, /* ARMENIAN CAPITAL LETTER JA */ \
+	  0x0571, /* ARMENIAN SMALL LETTER JA */ }, \
+	{ 0x054B, /* ARMENIAN CAPITAL LETTER JHEH */ \
+	  0x057B, /* ARMENIAN SMALL LETTER JHEH */ }, \
+	{ 0x0554, /* ARMENIAN CAPITAL LETTER KEH */ \
+	  0x0584, /* ARMENIAN SMALL LETTER KEH */ }, \
+	{ 0x053F, /* ARMENIAN CAPITAL LETTER KEN */ \
+	  0x056F, /* ARMENIAN SMALL LETTER KEN */ }, \
+	{ 0x053C, /* ARMENIAN CAPITAL LETTER LIWN */ \
+	  0x056C, /* ARMENIAN SMALL LETTER LIWN */ }, \
+	{ 0x0544, /* ARMENIAN CAPITAL LETTER MEN */ \
+	  0x0574, /* ARMENIAN SMALL LETTER MEN */ }, \
+	{ 0x0546, /* ARMENIAN CAPITAL LETTER NOW */ \
+	  0x0576, /* ARMENIAN SMALL LETTER NOW */ }, \
+	{ 0x0555, /* ARMENIAN CAPITAL LETTER OH */ \
+	  0x0585, /* ARMENIAN SMALL LETTER OH */ }, \
+	{ 0x054A, /* ARMENIAN CAPITAL LETTER PEH */ \
+	  0x057A, /* ARMENIAN SMALL LETTER PEH */ }, \
+	{ 0x0553, /* ARMENIAN CAPITAL LETTER PIWR */ \
+	  0x0583, /* ARMENIAN SMALL LETTER PIWR */ }, \
+	{ 0x054C, /* ARMENIAN CAPITAL LETTER RA */ \
+	  0x057C, /* ARMENIAN SMALL LETTER RA */ }, \
+	{ 0x0550, /* ARMENIAN CAPITAL LETTER REH */ \
+	  0x0580, /* ARMENIAN SMALL LETTER REH */ }, \
+	{ 0x054D, /* ARMENIAN CAPITAL LETTER SEH */ \
+	  0x057D, /* ARMENIAN SMALL LETTER SEH */ }, \
+	{ 0x0547, /* ARMENIAN CAPITAL LETTER SHA */ \
+	  0x0577, /* ARMENIAN SMALL LETTER SHA */ }, \
+	{ 0x054F, /* ARMENIAN CAPITAL LETTER TIWN */ \
+	  0x057F, /* ARMENIAN SMALL LETTER TIWN */ }, \
+	{ 0x0539, /* ARMENIAN CAPITAL LETTER TO */ \
+	  0x0569, /* ARMENIAN SMALL LETTER TO */ }, \
+	{ 0x054E, /* ARMENIAN CAPITAL LETTER VEW */ \
+	  0x057E, /* ARMENIAN SMALL LETTER VEW */ }, \
+	{ 0x0548, /* ARMENIAN CAPITAL LETTER VO */ \
+	  0x0578, /* ARMENIAN SMALL LETTER VO */ }, \
+	{ 0x053D, /* ARMENIAN CAPITAL LETTER XEH */ \
+	  0x056D, /* ARMENIAN SMALL LETTER XEH */ }, \
+	{ 0x0545, /* ARMENIAN CAPITAL LETTER YI */ \
+	  0x0575, /* ARMENIAN SMALL LETTER YI */ }, \
+	{ 0x0552, /* ARMENIAN CAPITAL LETTER YIWN */ \
+	  0x0582, /* ARMENIAN SMALL LETTER YIWN */ }, \
+	{ 0x0536, /* ARMENIAN CAPITAL LETTER ZA */ \
+	  0x0566, /* ARMENIAN SMALL LETTER ZA */ }, \
+	{ 0x053A, /* ARMENIAN CAPITAL LETTER ZHE */ \
+	  0x056A, /* ARMENIAN SMALL LETTER ZHE */ }, \
+	{ 0x24B6, /* CIRCLED LATIN CAPITAL LETTER A */ \
+	  0x24D0, /* CIRCLED LATIN SMALL LETTER A */ }, \
+	{ 0x24B7, /* CIRCLED LATIN CAPITAL LETTER B */ \
+	  0x24D1, /* CIRCLED LATIN SMALL LETTER B */ }, \
+	{ 0x24B8, /* CIRCLED LATIN CAPITAL LETTER C */ \
+	  0x24D2, /* CIRCLED LATIN SMALL LETTER C */ }, \
+	{ 0x24B9, /* CIRCLED LATIN CAPITAL LETTER D */ \
+	  0x24D3, /* CIRCLED LATIN SMALL LETTER D */ }, \
+	{ 0x24BA, /* CIRCLED LATIN CAPITAL LETTER E */ \
+	  0x24D4, /* CIRCLED LATIN SMALL LETTER E */ }, \
+	{ 0x24BB, /* CIRCLED LATIN CAPITAL LETTER F */ \
+	  0x24D5, /* CIRCLED LATIN SMALL LETTER F */ }, \
+	{ 0x24BC, /* CIRCLED LATIN CAPITAL LETTER G */ \
+	  0x24D6, /* CIRCLED LATIN SMALL LETTER G */ }, \
+	{ 0x24BD, /* CIRCLED LATIN CAPITAL LETTER H */ \
+	  0x24D7, /* CIRCLED LATIN SMALL LETTER H */ }, \
+	{ 0x24BE, /* CIRCLED LATIN CAPITAL LETTER I */ \
+	  0x24D8, /* CIRCLED LATIN SMALL LETTER I */ }, \
+	{ 0x24BF, /* CIRCLED LATIN CAPITAL LETTER J */ \
+	  0x24D9, /* CIRCLED LATIN SMALL LETTER J */ }, \
+	{ 0x24C0, /* CIRCLED LATIN CAPITAL LETTER K */ \
+	  0x24DA, /* CIRCLED LATIN SMALL LETTER K */ }, \
+	{ 0x24C1, /* CIRCLED LATIN CAPITAL LETTER L */ \
+	  0x24DB, /* CIRCLED LATIN SMALL LETTER L */ }, \
+	{ 0x24C2, /* CIRCLED LATIN CAPITAL LETTER M */ \
+	  0x24DC, /* CIRCLED LATIN SMALL LETTER M */ }, \
+	{ 0x24C3, /* CIRCLED LATIN CAPITAL LETTER N */ \
+	  0x24DD, /* CIRCLED LATIN SMALL LETTER N */ }, \
+	{ 0x24C4, /* CIRCLED LATIN CAPITAL LETTER O */ \
+	  0x24DE, /* CIRCLED LATIN SMALL LETTER O */ }, \
+	{ 0x24C5, /* CIRCLED LATIN CAPITAL LETTER P */ \
+	  0x24DF, /* CIRCLED LATIN SMALL LETTER P */ }, \
+	{ 0x24C6, /* CIRCLED LATIN CAPITAL LETTER Q */ \
+	  0x24E0, /* CIRCLED LATIN SMALL LETTER Q */ }, \
+	{ 0x24C7, /* CIRCLED LATIN CAPITAL LETTER R */ \
+	  0x24E1, /* CIRCLED LATIN SMALL LETTER R */ }, \
+	{ 0x24C8, /* CIRCLED LATIN CAPITAL LETTER S */ \
+	  0x24E2, /* CIRCLED LATIN SMALL LETTER S */ }, \
+	{ 0x24C9, /* CIRCLED LATIN CAPITAL LETTER T */ \
+	  0x24E3, /* CIRCLED LATIN SMALL LETTER T */ }, \
+	{ 0x24CA, /* CIRCLED LATIN CAPITAL LETTER U */ \
+	  0x24E4, /* CIRCLED LATIN SMALL LETTER U */ }, \
+	{ 0x24CB, /* CIRCLED LATIN CAPITAL LETTER V */ \
+	  0x24E5, /* CIRCLED LATIN SMALL LETTER V */ }, \
+	{ 0x24CC, /* CIRCLED LATIN CAPITAL LETTER W */ \
+	  0x24E6, /* CIRCLED LATIN SMALL LETTER W */ }, \
+	{ 0x24CD, /* CIRCLED LATIN CAPITAL LETTER X */ \
+	  0x24E7, /* CIRCLED LATIN SMALL LETTER X */ }, \
+	{ 0x24CE, /* CIRCLED LATIN CAPITAL LETTER Y */ \
+	  0x24E8, /* CIRCLED LATIN SMALL LETTER Y */ }, \
+	{ 0x24CF, /* CIRCLED LATIN CAPITAL LETTER Z */ \
+	  0x24E9, /* CIRCLED LATIN SMALL LETTER Z */ }, \
+	{ 0x2CC8, /* COPTIC CAPITAL LETTER AKHMIMIC KHEI */ \
+	  0x2CC9, /* COPTIC SMALL LETTER AKHMIMIC KHEI */ }, \
+	{ 0x2C80, /* COPTIC CAPITAL LETTER ALFA */ \
+	  0x2C81, /* COPTIC SMALL LETTER ALFA */ }, \
+	{ 0x2CF2, /* COPTIC CAPITAL LETTER BOHAIRIC KHEI */ \
+	  0x2CF3, /* COPTIC SMALL LETTER BOHAIRIC KHEI */ }, \
+	{ 0x2CC2, /* COPTIC CAPITAL LETTER CROSSED SHEI */ \
+	  0x2CC3, /* COPTIC SMALL LETTER CROSSED SHEI */ }, \
+	{ 0x2CB6, /* COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE */ \
+	  0x2CB7, /* COPTIC SMALL LETTER CRYPTOGRAMMIC EIE */ }, \
+	{ 0x2CED, /* COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA */ \
+	  0x2CEE, /* COPTIC SMALL LETTER CRYPTOGRAMMIC GANGIA */ }, \
+	{ 0x2CBC, /* COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI */ \
+	  0x2CBD, /* COPTIC SMALL LETTER CRYPTOGRAMMIC NI */ }, \
+	{ 0x2CEB, /* COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI */ \
+	  0x2CEC, /* COPTIC SMALL LETTER CRYPTOGRAMMIC SHEI */ }, \
+	{ 0x2C86, /* COPTIC CAPITAL LETTER DALDA */ \
+	  0x2C87, /* COPTIC SMALL LETTER DALDA */ }, \
+	{ 0x03EE, /* COPTIC CAPITAL LETTER DEI */ \
+	  0x03EF, /* COPTIC SMALL LETTER DEI */ }, \
+	{ 0x2CB2, /* COPTIC CAPITAL LETTER DIALECT-P ALEF */ \
+	  0x2CB3, /* COPTIC SMALL LETTER DIALECT-P ALEF */ }, \
+	{ 0x2CCA, /* COPTIC CAPITAL LETTER DIALECT-P HORI */ \
+	  0x2CCB, /* COPTIC SMALL LETTER DIALECT-P HORI */ }, \
+	{ 0x2CB8, /* COPTIC CAPITAL LETTER DIALECT-P KAPA */ \
+	  0x2CB9, /* COPTIC SMALL LETTER DIALECT-P KAPA */ }, \
+	{ 0x2CBA, /* COPTIC CAPITAL LETTER DIALECT-P NI */ \
+	  0x2CBB, /* COPTIC SMALL LETTER DIALECT-P NI */ }, \
+	{ 0x2C88, /* COPTIC CAPITAL LETTER EIE */ \
+	  0x2C89, /* COPTIC SMALL LETTER EIE */ }, \
+	{ 0x03E4, /* COPTIC CAPITAL LETTER FEI */ \
+	  0x03E5, /* COPTIC SMALL LETTER FEI */ }, \
+	{ 0x2CAA, /* COPTIC CAPITAL LETTER FI */ \
+	  0x2CAB, /* COPTIC SMALL LETTER FI */ }, \
+	{ 0x2C84, /* COPTIC CAPITAL LETTER GAMMA */ \
+	  0x2C85, /* COPTIC SMALL LETTER GAMMA */ }, \
+	{ 0x03EA, /* COPTIC CAPITAL LETTER GANGIA */ \
+	  0x03EB, /* COPTIC SMALL LETTER GANGIA */ }, \
+	{ 0x2C8E, /* COPTIC CAPITAL LETTER HATE */ \
+	  0x2C8F, /* COPTIC SMALL LETTER HATE */ }, \
+	{ 0x03E8, /* COPTIC CAPITAL LETTER HORI */ \
+	  0x03E9, /* COPTIC SMALL LETTER HORI */ }, \
+	{ 0x2C92, /* COPTIC CAPITAL LETTER IAUDA */ \
+	  0x2C93, /* COPTIC SMALL LETTER IAUDA */ }, \
+	{ 0x2C94, /* COPTIC CAPITAL LETTER KAPA */ \
+	  0x2C95, /* COPTIC SMALL LETTER KAPA */ }, \
+	{ 0x03E6, /* COPTIC CAPITAL LETTER KHEI */ \
+	  0x03E7, /* COPTIC SMALL LETTER KHEI */ }, \
+	{ 0x2CAC, /* COPTIC CAPITAL LETTER KHI */ \
+	  0x2CAD, /* COPTIC SMALL LETTER KHI */ }, \
+	{ 0x2C9C, /* COPTIC CAPITAL LETTER KSI */ \
+	  0x2C9D, /* COPTIC SMALL LETTER KSI */ }, \
+	{ 0x2C96, /* COPTIC CAPITAL LETTER LAULA */ \
+	  0x2C97, /* COPTIC SMALL LETTER LAULA */ }, \
+	{ 0x2CD0, /* COPTIC CAPITAL LETTER L-SHAPED HA */ \
+	  0x2CD1, /* COPTIC SMALL LETTER L-SHAPED HA */ }, \
+	{ 0x2C98, /* COPTIC CAPITAL LETTER MI */ \
+	  0x2C99, /* COPTIC SMALL LETTER MI */ }, \
+	{ 0x2C9A, /* COPTIC CAPITAL LETTER NI */ \
+	  0x2C9B, /* COPTIC SMALL LETTER NI */ }, \
+	{ 0x2C9E, /* COPTIC CAPITAL LETTER O */ \
+	  0x2C9F, /* COPTIC SMALL LETTER O */ }, \
+	{ 0x2CB4, /* COPTIC CAPITAL LETTER OLD COPTIC AIN */ \
+	  0x2CB5, /* COPTIC SMALL LETTER OLD COPTIC AIN */ }, \
+	{ 0x2CD8, /* COPTIC CAPITAL LETTER OLD COPTIC DJA */ \
+	  0x2CD9, /* COPTIC SMALL LETTER OLD COPTIC DJA */ }, \
+	{ 0x2CC6, /* COPTIC CAPITAL LETTER OLD COPTIC ESH */ \
+	  0x2CC7, /* COPTIC SMALL LETTER OLD COPTIC ESH */ }, \
+	{ 0x2CD6, /* COPTIC CAPITAL LETTER OLD COPTIC GANGIA */ \
+	  0x2CD7, /* COPTIC SMALL LETTER OLD COPTIC GANGIA */ }, \
+	{ 0x2CCE, /* COPTIC CAPITAL LETTER OLD COPTIC HA */ \
+	  0x2CCF, /* COPTIC SMALL LETTER OLD COPTIC HA */ }, \
+	{ 0x2CD4, /* COPTIC CAPITAL LETTER OLD COPTIC HAT */ \
+	  0x2CD5, /* COPTIC SMALL LETTER OLD COPTIC HAT */ }, \
+	{ 0x2CD2, /* COPTIC CAPITAL LETTER OLD COPTIC HEI */ \
+	  0x2CD3, /* COPTIC SMALL LETTER OLD COPTIC HEI */ }, \
+	{ 0x2CCC, /* COPTIC CAPITAL LETTER OLD COPTIC HORI */ \
+	  0x2CCD, /* COPTIC SMALL LETTER OLD COPTIC HORI */ }, \
+	{ 0x2CBE, /* COPTIC CAPITAL LETTER OLD COPTIC OOU */ \
+	  0x2CBF, /* COPTIC SMALL LETTER OLD COPTIC OOU */ }, \
+	{ 0x2CC4, /* COPTIC CAPITAL LETTER OLD COPTIC SHEI */ \
+	  0x2CC5, /* COPTIC SMALL LETTER OLD COPTIC SHEI */ }, \
+	{ 0x2CDA, /* COPTIC CAPITAL LETTER OLD COPTIC SHIMA */ \
+	  0x2CDB, /* COPTIC SMALL LETTER OLD COPTIC SHIMA */ }, \
+	{ 0x2CDE, /* COPTIC CAPITAL LETTER OLD NUBIAN NGI */ \
+	  0x2CDF, /* COPTIC SMALL LETTER OLD NUBIAN NGI */ }, \
+	{ 0x2CE0, /* COPTIC CAPITAL LETTER OLD NUBIAN NYI */ \
+	  0x2CE1, /* COPTIC SMALL LETTER OLD NUBIAN NYI */ }, \
+	{ 0x2CDC, /* COPTIC CAPITAL LETTER OLD NUBIAN SHIMA */ \
+	  0x2CDD, /* COPTIC SMALL LETTER OLD NUBIAN SHIMA */ }, \
+	{ 0x2CE2, /* COPTIC CAPITAL LETTER OLD NUBIAN WAU */ \
+	  0x2CE3, /* COPTIC SMALL LETTER OLD NUBIAN WAU */ }, \
+	{ 0x2CB0, /* COPTIC CAPITAL LETTER OOU */ \
+	  0x2CB1, /* COPTIC SMALL LETTER OOU */ }, \
+	{ 0x2CA0, /* COPTIC CAPITAL LETTER PI */ \
+	  0x2CA1, /* COPTIC SMALL LETTER PI */ }, \
+	{ 0x2CAE, /* COPTIC CAPITAL LETTER PSI */ \
+	  0x2CAF, /* COPTIC SMALL LETTER PSI */ }, \
+	{ 0x2CA2, /* COPTIC CAPITAL LETTER RO */ \
+	  0x2CA3, /* COPTIC SMALL LETTER RO */ }, \
+	{ 0x2CC0, /* COPTIC CAPITAL LETTER SAMPI */ \
+	  0x2CC1, /* COPTIC SMALL LETTER SAMPI */ }, \
+	{ 0x03E2, /* COPTIC CAPITAL LETTER SHEI */ \
+	  0x03E3, /* COPTIC SMALL LETTER SHEI */ }, \
+	{ 0x03EC, /* COPTIC CAPITAL LETTER SHIMA */ \
+	  0x03ED, /* COPTIC SMALL LETTER SHIMA */ }, \
+	{ 0x2CA4, /* COPTIC CAPITAL LETTER SIMA */ \
+	  0x2CA5, /* COPTIC SMALL LETTER SIMA */ }, \
+	{ 0x2C8A, /* COPTIC CAPITAL LETTER SOU */ \
+	  0x2C8B, /* COPTIC SMALL LETTER SOU */ }, \
+	{ 0x2CA6, /* COPTIC CAPITAL LETTER TAU */ \
+	  0x2CA7, /* COPTIC SMALL LETTER TAU */ }, \
+	{ 0x2C90, /* COPTIC CAPITAL LETTER THETHE */ \
+	  0x2C91, /* COPTIC SMALL LETTER THETHE */ }, \
+	{ 0x2CA8, /* COPTIC CAPITAL LETTER UA */ \
+	  0x2CA9, /* COPTIC SMALL LETTER UA */ }, \
+	{ 0x2C82, /* COPTIC CAPITAL LETTER VIDA */ \
+	  0x2C83, /* COPTIC SMALL LETTER VIDA */ }, \
+	{ 0x2C8C, /* COPTIC CAPITAL LETTER ZATA */ \
+	  0x2C8D, /* COPTIC SMALL LETTER ZATA */ }, \
+	{ 0x0410, /* CYRILLIC CAPITAL LETTER A */ \
+	  0x0430, /* CYRILLIC SMALL LETTER A */ }, \
+	{ 0x04D0, /* CYRILLIC CAPITAL LETTER A WITH BREVE */ \
+	  0x04D1, /* CYRILLIC SMALL LETTER A WITH BREVE */ }, \
+	{ 0x04D2, /* CYRILLIC CAPITAL LETTER A WITH DIAERESIS */ \
+	  0x04D3, /* CYRILLIC SMALL LETTER A WITH DIAERESIS */ }, \
+	{ 0x04BC, /* CYRILLIC CAPITAL LETTER ABKHASIAN CHE */ \
+	  0x04BD, /* CYRILLIC SMALL LETTER ABKHASIAN CHE */ }, \
+	{ 0x04BE, /* CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER */ \
+	  0x04BF, /* CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER */ }, \
+	{ 0x04E0, /* CYRILLIC CAPITAL LETTER ABKHASIAN DZE */ \
+	  0x04E1, /* CYRILLIC SMALL LETTER ABKHASIAN DZE */ }, \
+	{ 0x04A8, /* CYRILLIC CAPITAL LETTER ABKHASIAN HA */ \
+	  0x04A9, /* CYRILLIC SMALL LETTER ABKHASIAN HA */ }, \
+	{ 0x051E, /* CYRILLIC CAPITAL LETTER ALEUT KA */ \
+	  0x051F, /* CYRILLIC SMALL LETTER ALEUT KA */ }, \
+	{ 0x04E8, /* CYRILLIC CAPITAL LETTER BARRED O */ \
+	  0x04E9, /* CYRILLIC SMALL LETTER BARRED O */ }, \
+	{ 0x04EA, /* CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS */ \
+	  0x04EB, /* CYRILLIC SMALL LETTER BARRED O WITH DIAERESIS */ }, \
+	{ 0x04A0, /* CYRILLIC CAPITAL LETTER BASHKIR KA */ \
+	  0x04A1, /* CYRILLIC SMALL LETTER BASHKIR KA */ }, \
+	{ 0x0411, /* CYRILLIC CAPITAL LETTER BE */ \
+	  0x0431, /* CYRILLIC SMALL LETTER BE */ }, \
+	{ 0x046A, /* CYRILLIC CAPITAL LETTER BIG YUS */ \
+	  0x046B, /* CYRILLIC SMALL LETTER BIG YUS */ }, \
+	{ 0xA66A, /* CYRILLIC CAPITAL LETTER BINOCULAR O */ \
+	  0xA66B, /* CYRILLIC SMALL LETTER BINOCULAR O */ }, \
+	{ 0xA65A, /* CYRILLIC CAPITAL LETTER BLENDED YUS */ \
+	  0xA65B, /* CYRILLIC SMALL LETTER BLENDED YUS */ }, \
+	{ 0xA64C, /* CYRILLIC CAPITAL LETTER BROAD OMEGA */ \
+	  0xA64D, /* CYRILLIC SMALL LETTER BROAD OMEGA */ }, \
+	{ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */ \
+	  0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */ }, \
+	{ 0xA686, /* CYRILLIC CAPITAL LETTER CCHE */ \
+	  0xA687, /* CYRILLIC SMALL LETTER CCHE */ }, \
+	{ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */ \
+	  0x0447, /* CYRILLIC SMALL LETTER CHE */ }, \
+	{ 0x04B6, /* CYRILLIC CAPITAL LETTER CHE WITH DESCENDER */ \
+	  0x04B7, /* CYRILLIC SMALL LETTER CHE WITH DESCENDER */ }, \
+	{ 0x04F4, /* CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS */ \
+	  0x04F5, /* CYRILLIC SMALL LETTER CHE WITH DIAERESIS */ }, \
+	{ 0x04B8, /* CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE */ \
+	  0x04B9, /* CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE */ }, \
+	{ 0xA658, /* CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS */ \
+	  0xA659, /* CYRILLIC SMALL LETTER CLOSED LITTLE YUS */ }, \
+	{ 0xA69A, /* CYRILLIC CAPITAL LETTER CROSSED O */ \
+	  0xA69B, /* CYRILLIC SMALL LETTER CROSSED O */ }, \
+	{ 0x052C, /* CYRILLIC CAPITAL LETTER DCHE */ \
+	  0x052D, /* CYRILLIC SMALL LETTER DCHE */ }, \
+	{ 0x0414, /* CYRILLIC CAPITAL LETTER DE */ \
+	  0x0434, /* CYRILLIC SMALL LETTER DE */ }, \
+	{ 0x0402, /* CYRILLIC CAPITAL LETTER DJE */ \
+	  0x0452, /* CYRILLIC SMALL LETTER DJE */ }, \
+	{ 0xA648, /* CYRILLIC CAPITAL LETTER DJERV */ \
+	  0xA649, /* CYRILLIC SMALL LETTER DJERV */ }, \
+	{ 0xA66C, /* CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O */ \
+	  0xA66D, /* CYRILLIC SMALL LETTER DOUBLE MONOCULAR O */ }, \
+	{ 0xA698, /* CYRILLIC CAPITAL LETTER DOUBLE O */ \
+	  0xA699, /* CYRILLIC SMALL LETTER DOUBLE O */ }, \
+	{ 0xA680, /* CYRILLIC CAPITAL LETTER DWE */ \
+	  0xA681, /* CYRILLIC SMALL LETTER DWE */ }, \
+	{ 0x0405, /* CYRILLIC CAPITAL LETTER DZE */ \
+	  0x0455, /* CYRILLIC SMALL LETTER DZE */ }, \
+	{ 0xA642, /* CYRILLIC CAPITAL LETTER DZELO */ \
+	  0xA643, /* CYRILLIC SMALL LETTER DZELO */ }, \
+	{ 0x040F, /* CYRILLIC CAPITAL LETTER DZHE */ \
+	  0x045F, /* CYRILLIC SMALL LETTER DZHE */ }, \
+	{ 0xA682, /* CYRILLIC CAPITAL LETTER DZWE */ \
+	  0xA683, /* CYRILLIC SMALL LETTER DZWE */ }, \
+	{ 0xA688, /* CYRILLIC CAPITAL LETTER DZZE */ \
+	  0xA689, /* CYRILLIC SMALL LETTER DZZE */ }, \
+	{ 0x052A, /* CYRILLIC CAPITAL LETTER DZZHE */ \
+	  0x052B, /* CYRILLIC SMALL LETTER DZZHE */ }, \
+	{ 0x042D, /* CYRILLIC CAPITAL LETTER E */ \
+	  0x044D, /* CYRILLIC SMALL LETTER E */ }, \
+	{ 0x04EC, /* CYRILLIC CAPITAL LETTER E WITH DIAERESIS */ \
+	  0x04ED, /* CYRILLIC SMALL LETTER E WITH DIAERESIS */ }, \
+	{ 0x0424, /* CYRILLIC CAPITAL LETTER EF */ \
+	  0x0444, /* CYRILLIC SMALL LETTER EF */ }, \
+	{ 0x041B, /* CYRILLIC CAPITAL LETTER EL */ \
+	  0x043B, /* CYRILLIC SMALL LETTER EL */ }, \
+	{ 0x052E, /* CYRILLIC CAPITAL LETTER EL WITH DESCENDER */ \
+	  0x052F, /* CYRILLIC SMALL LETTER EL WITH DESCENDER */ }, \
+	{ 0x0512, /* CYRILLIC CAPITAL LETTER EL WITH HOOK */ \
+	  0x0513, /* CYRILLIC SMALL LETTER EL WITH HOOK */ }, \
+	{ 0x0520, /* CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK */ \
+	  0x0521, /* CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK */ }, \
+	{ 0x04C5, /* CYRILLIC CAPITAL LETTER EL WITH TAIL */ \
+	  0x04C6, /* CYRILLIC SMALL LETTER EL WITH TAIL */ }, \
+	{ 0x041C, /* CYRILLIC CAPITAL LETTER EM */ \
+	  0x043C, /* CYRILLIC SMALL LETTER EM */ }, \
+	{ 0x04CD, /* CYRILLIC CAPITAL LETTER EM WITH TAIL */ \
+	  0x04CE, /* CYRILLIC SMALL LETTER EM WITH TAIL */ }, \
+	{ 0x041D, /* CYRILLIC CAPITAL LETTER EN */ \
+	  0x043D, /* CYRILLIC SMALL LETTER EN */ }, \
+	{ 0x04A2, /* CYRILLIC CAPITAL LETTER EN WITH DESCENDER */ \
+	  0x04A3, /* CYRILLIC SMALL LETTER EN WITH DESCENDER */ }, \
+	{ 0x04C7, /* CYRILLIC CAPITAL LETTER EN WITH HOOK */ \
+	  0x04C8, /* CYRILLIC SMALL LETTER EN WITH HOOK */ }, \
+	{ 0x0528, /* CYRILLIC CAPITAL LETTER EN WITH LEFT HOOK */ \
+	  0x0529, /* CYRILLIC SMALL LETTER EN WITH LEFT HOOK */ }, \
+	{ 0x0522, /* CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK */ \
+	  0x0523, /* CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK */ }, \
+	{ 0x04C9, /* CYRILLIC CAPITAL LETTER EN WITH TAIL */ \
+	  0x04CA, /* CYRILLIC SMALL LETTER EN WITH TAIL */ }, \
+	{ 0x0420, /* CYRILLIC CAPITAL LETTER ER */ \
+	  0x0440, /* CYRILLIC SMALL LETTER ER */ }, \
+	{ 0x048E, /* CYRILLIC CAPITAL LETTER ER WITH TICK */ \
+	  0x048F, /* CYRILLIC SMALL LETTER ER WITH TICK */ }, \
+	{ 0x0421, /* CYRILLIC CAPITAL LETTER ES */ \
+	  0x0441, /* CYRILLIC SMALL LETTER ES */ }, \
+	{ 0x04AA, /* CYRILLIC CAPITAL LETTER ES WITH DESCENDER */ \
+	  0x04AB, /* CYRILLIC SMALL LETTER ES WITH DESCENDER */ }, \
+	{ 0x0472, /* CYRILLIC CAPITAL LETTER FITA */ \
+	  0x0473, /* CYRILLIC SMALL LETTER FITA */ }, \
+	{ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */ \
+	  0x0433, /* CYRILLIC SMALL LETTER GHE */ }, \
+	{ 0x04F6, /* CYRILLIC CAPITAL LETTER GHE WITH DESCENDER */ \
+	  0x04F7, /* CYRILLIC SMALL LETTER GHE WITH DESCENDER */ }, \
+	{ 0x0494, /* CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK */ \
+	  0x0495, /* CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK */ }, \
+	{ 0x0492, /* CYRILLIC CAPITAL LETTER GHE WITH STROKE */ \
+	  0x0493, /* CYRILLIC SMALL LETTER GHE WITH STROKE */ }, \
+	{ 0x04FA, /* CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK */ \
+	  0x04FB, /* CYRILLIC SMALL LETTER GHE WITH STROKE AND HOOK */ }, \
+	{ 0x0490, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */ \
+	  0x0491, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */ }, \
+	{ 0x0403, /* CYRILLIC CAPITAL LETTER GJE */ \
+	  0x0453, /* CYRILLIC SMALL LETTER GJE */ }, \
+	{ 0x0425, /* CYRILLIC CAPITAL LETTER HA */ \
+	  0x0445, /* CYRILLIC SMALL LETTER HA */ }, \
+	{ 0x04B2, /* CYRILLIC CAPITAL LETTER HA WITH DESCENDER */ \
+	  0x04B3, /* CYRILLIC SMALL LETTER HA WITH DESCENDER */ }, \
+	{ 0x04FC, /* CYRILLIC CAPITAL LETTER HA WITH HOOK */ \
+	  0x04FD, /* CYRILLIC SMALL LETTER HA WITH HOOK */ }, \
+	{ 0x04FE, /* CYRILLIC CAPITAL LETTER HA WITH STROKE */ \
+	  0x04FF, /* CYRILLIC SMALL LETTER HA WITH STROKE */ }, \
+	{ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */ \
+	  0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */ }, \
+	{ 0xA694, /* CYRILLIC CAPITAL LETTER HWE */ \
+	  0xA695, /* CYRILLIC SMALL LETTER HWE */ }, \
+	{ 0x0418, /* CYRILLIC CAPITAL LETTER I */ \
+	  0x0438, /* CYRILLIC SMALL LETTER I */ }, \
+	{ 0x04E4, /* CYRILLIC CAPITAL LETTER I WITH DIAERESIS */ \
+	  0x04E5, /* CYRILLIC SMALL LETTER I WITH DIAERESIS */ }, \
+	{ 0x040D, /* CYRILLIC CAPITAL LETTER I WITH GRAVE */ \
+	  0x045D, /* CYRILLIC SMALL LETTER I WITH GRAVE */ }, \
+	{ 0x04E2, /* CYRILLIC CAPITAL LETTER I WITH MACRON */ \
+	  0x04E3, /* CYRILLIC SMALL LETTER I WITH MACRON */ }, \
+	{ 0x0415, /* CYRILLIC CAPITAL LETTER IE */ \
+	  0x0435, /* CYRILLIC SMALL LETTER IE */ }, \
+	{ 0x04D6, /* CYRILLIC CAPITAL LETTER IE WITH BREVE */ \
+	  0x04D7, /* CYRILLIC SMALL LETTER IE WITH BREVE */ }, \
+	{ 0x0400, /* CYRILLIC CAPITAL LETTER IE WITH GRAVE */ \
+	  0x0450, /* CYRILLIC SMALL LETTER IE WITH GRAVE */ }, \
+	{ 0x0401, /* CYRILLIC CAPITAL LETTER IO */ \
+	  0x0451, /* CYRILLIC SMALL LETTER IO */ }, \
+	{ 0xA646, /* CYRILLIC CAPITAL LETTER IOTA */ \
+	  0xA647, /* CYRILLIC SMALL LETTER IOTA */ }, \
+	{ 0xA656, /* CYRILLIC CAPITAL LETTER IOTIFIED A */ \
+	  0xA657, /* CYRILLIC SMALL LETTER IOTIFIED A */ }, \
+	{ 0x046C, /* CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS */ \
+	  0x046D, /* CYRILLIC SMALL LETTER IOTIFIED BIG YUS */ }, \
+	{ 0xA65C, /* CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS */ \
+	  0xA65D, /* CYRILLIC SMALL LETTER IOTIFIED CLOSED LITTLE YUS */ }, \
+	{ 0x0464, /* CYRILLIC CAPITAL LETTER IOTIFIED E */ \
+	  0x0465, /* CYRILLIC SMALL LETTER IOTIFIED E */ }, \
+	{ 0x0468, /* CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS */ \
+	  0x0469, /* CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS */ }, \
+	{ 0xA652, /* CYRILLIC CAPITAL LETTER IOTIFIED YAT */ \
+	  0xA653, /* CYRILLIC SMALL LETTER IOTIFIED YAT */ }, \
+	{ 0x0474, /* CYRILLIC CAPITAL LETTER IZHITSA */ \
+	  0x0475, /* CYRILLIC SMALL LETTER IZHITSA */ }, \
+	{ 0x0476, /* CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT */ \
+	  0x0477, /* CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT */ }, \
+	{ 0x0408, /* CYRILLIC CAPITAL LETTER JE */ \
+	  0x0458, /* CYRILLIC SMALL LETTER JE */ }, \
+	{ 0x041A, /* CYRILLIC CAPITAL LETTER KA */ \
+	  0x043A, /* CYRILLIC SMALL LETTER KA */ }, \
+	{ 0x049A, /* CYRILLIC CAPITAL LETTER KA WITH DESCENDER */ \
+	  0x049B, /* CYRILLIC SMALL LETTER KA WITH DESCENDER */ }, \
+	{ 0x04C3, /* CYRILLIC CAPITAL LETTER KA WITH HOOK */ \
+	  0x04C4, /* CYRILLIC SMALL LETTER KA WITH HOOK */ }, \
+	{ 0x049E, /* CYRILLIC CAPITAL LETTER KA WITH STROKE */ \
+	  0x049F, /* CYRILLIC SMALL LETTER KA WITH STROKE */ }, \
+	{ 0x049C, /* CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE */ \
+	  0x049D, /* CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE */ }, \
+	{ 0x04CB, /* CYRILLIC CAPITAL LETTER KHAKASSIAN CHE */ \
+	  0x04CC, /* CYRILLIC SMALL LETTER KHAKASSIAN CHE */ }, \
+	{ 0x040C, /* CYRILLIC CAPITAL LETTER KJE */ \
+	  0x045C, /* CYRILLIC SMALL LETTER KJE */ }, \
+	{ 0x0500, /* CYRILLIC CAPITAL LETTER KOMI DE */ \
+	  0x0501, /* CYRILLIC SMALL LETTER KOMI DE */ }, \
+	{ 0x0502, /* CYRILLIC CAPITAL LETTER KOMI DJE */ \
+	  0x0503, /* CYRILLIC SMALL LETTER KOMI DJE */ }, \
+	{ 0x0506, /* CYRILLIC CAPITAL LETTER KOMI DZJE */ \
+	  0x0507, /* CYRILLIC SMALL LETTER KOMI DZJE */ }, \
+	{ 0x0508, /* CYRILLIC CAPITAL LETTER KOMI LJE */ \
+	  0x0509, /* CYRILLIC SMALL LETTER KOMI LJE */ }, \
+	{ 0x050A, /* CYRILLIC CAPITAL LETTER KOMI NJE */ \
+	  0x050B, /* CYRILLIC SMALL LETTER KOMI NJE */ }, \
+	{ 0x050C, /* CYRILLIC CAPITAL LETTER KOMI SJE */ \
+	  0x050D, /* CYRILLIC SMALL LETTER KOMI SJE */ }, \
+	{ 0x050E, /* CYRILLIC CAPITAL LETTER KOMI TJE */ \
+	  0x050F, /* CYRILLIC SMALL LETTER KOMI TJE */ }, \
+	{ 0x0504, /* CYRILLIC CAPITAL LETTER KOMI ZJE */ \
+	  0x0505, /* CYRILLIC SMALL LETTER KOMI ZJE */ }, \
+	{ 0x0480, /* CYRILLIC CAPITAL LETTER KOPPA */ \
+	  0x0481, /* CYRILLIC SMALL LETTER KOPPA */ }, \
+	{ 0x046E, /* CYRILLIC CAPITAL LETTER KSI */ \
+	  0x046F, /* CYRILLIC SMALL LETTER KSI */ }, \
+	{ 0x0514, /* CYRILLIC CAPITAL LETTER LHA */ \
+	  0x0515, /* CYRILLIC SMALL LETTER LHA */ }, \
+	{ 0x0466, /* CYRILLIC CAPITAL LETTER LITTLE YUS */ \
+	  0x0467, /* CYRILLIC SMALL LETTER LITTLE YUS */ }, \
+	{ 0x0409, /* CYRILLIC CAPITAL LETTER LJE */ \
+	  0x0459, /* CYRILLIC SMALL LETTER LJE */ }, \
+	{ 0xA668, /* CYRILLIC CAPITAL LETTER MONOCULAR O */ \
+	  0xA669, /* CYRILLIC SMALL LETTER MONOCULAR O */ }, \
+	{ 0xA64A, /* CYRILLIC CAPITAL LETTER MONOGRAPH UK */ \
+	  0xA64B, /* CYRILLIC SMALL LETTER MONOGRAPH UK */ }, \
+	{ 0xA64E, /* CYRILLIC CAPITAL LETTER NEUTRAL YER */ \
+	  0xA64F, /* CYRILLIC SMALL LETTER NEUTRAL YER */ }, \
+	{ 0x040A, /* CYRILLIC CAPITAL LETTER NJE */ \
+	  0x045A, /* CYRILLIC SMALL LETTER NJE */ }, \
+	{ 0x041E, /* CYRILLIC CAPITAL LETTER O */ \
+	  0x043E, /* CYRILLIC SMALL LETTER O */ }, \
+	{ 0x04E6, /* CYRILLIC CAPITAL LETTER O WITH DIAERESIS */ \
+	  0x04E7, /* CYRILLIC SMALL LETTER O WITH DIAERESIS */ }, \
+	{ 0x0460, /* CYRILLIC CAPITAL LETTER OMEGA */ \
+	  0x0461, /* CYRILLIC SMALL LETTER OMEGA */ }, \
+	{ 0x047C, /* CYRILLIC CAPITAL LETTER OMEGA WITH TITLO */ \
+	  0x047D, /* CYRILLIC SMALL LETTER OMEGA WITH TITLO */ }, \
+	{ 0x047E, /* CYRILLIC CAPITAL LETTER OT */ \
+	  0x047F, /* CYRILLIC SMALL LETTER OT */ }, \
+	{ 0x041F, /* CYRILLIC CAPITAL LETTER PE */ \
+	  0x043F, /* CYRILLIC SMALL LETTER PE */ }, \
+	{ 0x0524, /* CYRILLIC CAPITAL LETTER PE WITH DESCENDER */ \
+	  0x0525, /* CYRILLIC SMALL LETTER PE WITH DESCENDER */ }, \
+	{ 0x04A6, /* CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK */ \
+	  0x04A7, /* CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK */ }, \
+	{ 0x0470, /* CYRILLIC CAPITAL LETTER PSI */ \
+	  0x0471, /* CYRILLIC SMALL LETTER PSI */ }, \
+	{ 0x051A, /* CYRILLIC CAPITAL LETTER QA */ \
+	  0x051B, /* CYRILLIC SMALL LETTER QA */ }, \
+	{ 0xA644, /* CYRILLIC CAPITAL LETTER REVERSED DZE */ \
+	  0xA645, /* CYRILLIC SMALL LETTER REVERSED DZE */ }, \
+	{ 0xA660, /* CYRILLIC CAPITAL LETTER REVERSED TSE */ \
+	  0xA661, /* CYRILLIC SMALL LETTER REVERSED TSE */ }, \
+	{ 0xA654, /* CYRILLIC CAPITAL LETTER REVERSED YU */ \
+	  0xA655, /* CYRILLIC SMALL LETTER REVERSED YU */ }, \
+	{ 0x0510, /* CYRILLIC CAPITAL LETTER REVERSED ZE */ \
+	  0x0511, /* CYRILLIC SMALL LETTER REVERSED ZE */ }, \
+	{ 0x0516, /* CYRILLIC CAPITAL LETTER RHA */ \
+	  0x0517, /* CYRILLIC SMALL LETTER RHA */ }, \
+	{ 0x047A, /* CYRILLIC CAPITAL LETTER ROUND OMEGA */ \
+	  0x047B, /* CYRILLIC SMALL LETTER ROUND OMEGA */ }, \
+	{ 0x04D8, /* CYRILLIC CAPITAL LETTER SCHWA */ \
+	  0x04D9, /* CYRILLIC SMALL LETTER SCHWA */ }, \
+	{ 0x04DA, /* CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS */ \
+	  0x04DB, /* CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS */ }, \
+	{ 0x048C, /* CYRILLIC CAPITAL LETTER SEMISOFT SIGN */ \
+	  0x048D, /* CYRILLIC SMALL LETTER SEMISOFT SIGN */ }, \
+	{ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */ \
+	  0x0448, /* CYRILLIC SMALL LETTER SHA */ }, \
+	{ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */ \
+	  0x0449, /* CYRILLIC SMALL LETTER SHCHA */ }, \
+	{ 0x04BA, /* CYRILLIC CAPITAL LETTER SHHA */ \
+	  0x04BB, /* CYRILLIC SMALL LETTER SHHA */ }, \
+	{ 0x0526, /* CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER */ \
+	  0x0527, /* CYRILLIC SMALL LETTER SHHA WITH DESCENDER */ }, \
+	{ 0x048A, /* CYRILLIC CAPITAL LETTER SHORT I WITH TAIL */ \
+	  0x048B, /* CYRILLIC SMALL LETTER SHORT I WITH TAIL */ }, \
+	{ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */ \
+	  0x045E, /* CYRILLIC SMALL LETTER SHORT U */ }, \
+	{ 0xA696, /* CYRILLIC CAPITAL LETTER SHWE */ \
+	  0xA697, /* CYRILLIC SMALL LETTER SHWE */ }, \
+	{ 0xA662, /* CYRILLIC CAPITAL LETTER SOFT DE */ \
+	  0xA663, /* CYRILLIC SMALL LETTER SOFT DE */ }, \
+	{ 0xA664, /* CYRILLIC CAPITAL LETTER SOFT EL */ \
+	  0xA665, /* CYRILLIC SMALL LETTER SOFT EL */ }, \
+	{ 0xA666, /* CYRILLIC CAPITAL LETTER SOFT EM */ \
+	  0xA667, /* CYRILLIC SMALL LETTER SOFT EM */ }, \
+	{ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */ \
+	  0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */ }, \
+	{ 0x04AE, /* CYRILLIC CAPITAL LETTER STRAIGHT U */ \
+	  0x04AF, /* CYRILLIC SMALL LETTER STRAIGHT U */ }, \
+	{ 0x04B0, /* CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE */ \
+	  0x04B1, /* CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE */ }, \
+	{ 0xA692, /* CYRILLIC CAPITAL LETTER TCHE */ \
+	  0xA693, /* CYRILLIC SMALL LETTER TCHE */ }, \
+	{ 0x0422, /* CYRILLIC CAPITAL LETTER TE */ \
+	  0x0442, /* CYRILLIC SMALL LETTER TE */ }, \
+	{ 0x04AC, /* CYRILLIC CAPITAL LETTER TE WITH DESCENDER */ \
+	  0x04AD, /* CYRILLIC SMALL LETTER TE WITH DESCENDER */ }, \
+	{ 0xA68A, /* CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK */ \
+	  0xA68B, /* CYRILLIC SMALL LETTER TE WITH MIDDLE HOOK */ }, \
+	{ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */ \
+	  0x0446, /* CYRILLIC SMALL LETTER TSE */ }, \
+	{ 0x040B, /* CYRILLIC CAPITAL LETTER TSHE */ \
+	  0x045B, /* CYRILLIC SMALL LETTER TSHE */ }, \
+	{ 0xA690, /* CYRILLIC CAPITAL LETTER TSSE */ \
+	  0xA691, /* CYRILLIC SMALL LETTER TSSE */ }, \
+	{ 0xA68E, /* CYRILLIC CAPITAL LETTER TSWE */ \
+	  0xA68F, /* CYRILLIC SMALL LETTER TSWE */ }, \
+	{ 0xA68C, /* CYRILLIC CAPITAL LETTER TWE */ \
+	  0xA68D, /* CYRILLIC SMALL LETTER TWE */ }, \
+	{ 0x0423, /* CYRILLIC CAPITAL LETTER U */ \
+	  0x0443, /* CYRILLIC SMALL LETTER U */ }, \
+	{ 0x04F0, /* CYRILLIC CAPITAL LETTER U WITH DIAERESIS */ \
+	  0x04F1, /* CYRILLIC SMALL LETTER U WITH DIAERESIS */ }, \
+	{ 0x04F2, /* CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE */ \
+	  0x04F3, /* CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE */ }, \
+	{ 0x04EE, /* CYRILLIC CAPITAL LETTER U WITH MACRON */ \
+	  0x04EF, /* CYRILLIC SMALL LETTER U WITH MACRON */ }, \
+	{ 0x0478, /* CYRILLIC CAPITAL LETTER UK */ \
+	  0x0479, /* CYRILLIC SMALL LETTER UK */ }, \
+	{ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */ \
+	  0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */ }, \
+	{ 0x0412, /* CYRILLIC CAPITAL LETTER VE */ \
+	  0x0432, /* CYRILLIC SMALL LETTER VE */ }, \
+	{ 0x051C, /* CYRILLIC CAPITAL LETTER WE */ \
+	  0x051D, /* CYRILLIC SMALL LETTER WE */ }, \
+	{ 0x042F, /* CYRILLIC CAPITAL LETTER YA */ \
+	  0x044F, /* CYRILLIC SMALL LETTER YA */ }, \
+	{ 0x0518, /* CYRILLIC CAPITAL LETTER YAE */ \
+	  0x0519, /* CYRILLIC SMALL LETTER YAE */ }, \
+	{ 0x0462, /* CYRILLIC CAPITAL LETTER YAT */ \
+	  0x0463, /* CYRILLIC SMALL LETTER YAT */ }, \
+	{ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */ \
+	  0x044B, /* CYRILLIC SMALL LETTER YERU */ }, \
+	{ 0xA650, /* CYRILLIC CAPITAL LETTER YERU WITH BACK YER */ \
+	  0xA651, /* CYRILLIC SMALL LETTER YERU WITH BACK YER */ }, \
+	{ 0x04F8, /* CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS */ \
+	  0x04F9, /* CYRILLIC SMALL LETTER YERU WITH DIAERESIS */ }, \
+	{ 0x0407, /* CYRILLIC CAPITAL LETTER YI */ \
+	  0x0457, /* CYRILLIC SMALL LETTER YI */ }, \
+	{ 0xA65E, /* CYRILLIC CAPITAL LETTER YN */ \
+	  0xA65F, /* CYRILLIC SMALL LETTER YN */ }, \
+	{ 0x042E, /* CYRILLIC CAPITAL LETTER YU */ \
+	  0x044E, /* CYRILLIC SMALL LETTER YU */ }, \
+	{ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */ \
+	  0x0437, /* CYRILLIC SMALL LETTER ZE */ }, \
+	{ 0x0498, /* CYRILLIC CAPITAL LETTER ZE WITH DESCENDER */ \
+	  0x0499, /* CYRILLIC SMALL LETTER ZE WITH DESCENDER */ }, \
+	{ 0x04DE, /* CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS */ \
+	  0x04DF, /* CYRILLIC SMALL LETTER ZE WITH DIAERESIS */ }, \
+	{ 0xA640, /* CYRILLIC CAPITAL LETTER ZEMLYA */ \
+	  0xA641, /* CYRILLIC SMALL LETTER ZEMLYA */ }, \
+	{ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */ \
+	  0x0436, /* CYRILLIC SMALL LETTER ZHE */ }, \
+	{ 0x04C1, /* CYRILLIC CAPITAL LETTER ZHE WITH BREVE */ \
+	  0x04C2, /* CYRILLIC SMALL LETTER ZHE WITH BREVE */ }, \
+	{ 0x0496, /* CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER */ \
+	  0x0497, /* CYRILLIC SMALL LETTER ZHE WITH DESCENDER */ }, \
+	{ 0x04DC, /* CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS */ \
+	  0x04DD, /* CYRILLIC SMALL LETTER ZHE WITH DIAERESIS */ }, \
+	{ 0xA684, /* CYRILLIC CAPITAL LETTER ZHWE */ \
+	  0xA685, /* CYRILLIC SMALL LETTER ZHWE */ }, \
+	{ 0xFF21, /* FULLWIDTH LATIN CAPITAL LETTER A */ \
+	  0xFF41, /* FULLWIDTH LATIN SMALL LETTER A */ }, \
+	{ 0xFF22, /* FULLWIDTH LATIN CAPITAL LETTER B */ \
+	  0xFF42, /* FULLWIDTH LATIN SMALL LETTER B */ }, \
+	{ 0xFF23, /* FULLWIDTH LATIN CAPITAL LETTER C */ \
+	  0xFF43, /* FULLWIDTH LATIN SMALL LETTER C */ }, \
+	{ 0xFF24, /* FULLWIDTH LATIN CAPITAL LETTER D */ \
+	  0xFF44, /* FULLWIDTH LATIN SMALL LETTER D */ }, \
+	{ 0xFF25, /* FULLWIDTH LATIN CAPITAL LETTER E */ \
+	  0xFF45, /* FULLWIDTH LATIN SMALL LETTER E */ }, \
+	{ 0xFF26, /* FULLWIDTH LATIN CAPITAL LETTER F */ \
+	  0xFF46, /* FULLWIDTH LATIN SMALL LETTER F */ }, \
+	{ 0xFF27, /* FULLWIDTH LATIN CAPITAL LETTER G */ \
+	  0xFF47, /* FULLWIDTH LATIN SMALL LETTER G */ }, \
+	{ 0xFF28, /* FULLWIDTH LATIN CAPITAL LETTER H */ \
+	  0xFF48, /* FULLWIDTH LATIN SMALL LETTER H */ }, \
+	{ 0xFF29, /* FULLWIDTH LATIN CAPITAL LETTER I */ \
+	  0xFF49, /* FULLWIDTH LATIN SMALL LETTER I */ }, \
+	{ 0xFF2A, /* FULLWIDTH LATIN CAPITAL LETTER J */ \
+	  0xFF4A, /* FULLWIDTH LATIN SMALL LETTER J */ }, \
+	{ 0xFF2B, /* FULLWIDTH LATIN CAPITAL LETTER K */ \
+	  0xFF4B, /* FULLWIDTH LATIN SMALL LETTER K */ }, \
+	{ 0xFF2C, /* FULLWIDTH LATIN CAPITAL LETTER L */ \
+	  0xFF4C, /* FULLWIDTH LATIN SMALL LETTER L */ }, \
+	{ 0xFF2D, /* FULLWIDTH LATIN CAPITAL LETTER M */ \
+	  0xFF4D, /* FULLWIDTH LATIN SMALL LETTER M */ }, \
+	{ 0xFF2E, /* FULLWIDTH LATIN CAPITAL LETTER N */ \
+	  0xFF4E, /* FULLWIDTH LATIN SMALL LETTER N */ }, \
+	{ 0xFF2F, /* FULLWIDTH LATIN CAPITAL LETTER O */ \
+	  0xFF4F, /* FULLWIDTH LATIN SMALL LETTER O */ }, \
+	{ 0xFF30, /* FULLWIDTH LATIN CAPITAL LETTER P */ \
+	  0xFF50, /* FULLWIDTH LATIN SMALL LETTER P */ }, \
+	{ 0xFF31, /* FULLWIDTH LATIN CAPITAL LETTER Q */ \
+	  0xFF51, /* FULLWIDTH LATIN SMALL LETTER Q */ }, \
+	{ 0xFF32, /* FULLWIDTH LATIN CAPITAL LETTER R */ \
+	  0xFF52, /* FULLWIDTH LATIN SMALL LETTER R */ }, \
+	{ 0xFF33, /* FULLWIDTH LATIN CAPITAL LETTER S */ \
+	  0xFF53, /* FULLWIDTH LATIN SMALL LETTER S */ }, \
+	{ 0xFF34, /* FULLWIDTH LATIN CAPITAL LETTER T */ \
+	  0xFF54, /* FULLWIDTH LATIN SMALL LETTER T */ }, \
+	{ 0xFF35, /* FULLWIDTH LATIN CAPITAL LETTER U */ \
+	  0xFF55, /* FULLWIDTH LATIN SMALL LETTER U */ }, \
+	{ 0xFF36, /* FULLWIDTH LATIN CAPITAL LETTER V */ \
+	  0xFF56, /* FULLWIDTH LATIN SMALL LETTER V */ }, \
+	{ 0xFF37, /* FULLWIDTH LATIN CAPITAL LETTER W */ \
+	  0xFF57, /* FULLWIDTH LATIN SMALL LETTER W */ }, \
+	{ 0xFF38, /* FULLWIDTH LATIN CAPITAL LETTER X */ \
+	  0xFF58, /* FULLWIDTH LATIN SMALL LETTER X */ }, \
+	{ 0xFF39, /* FULLWIDTH LATIN CAPITAL LETTER Y */ \
+	  0xFF59, /* FULLWIDTH LATIN SMALL LETTER Y */ }, \
+	{ 0xFF3A, /* FULLWIDTH LATIN CAPITAL LETTER Z */ \
+	  0xFF5A, /* FULLWIDTH LATIN SMALL LETTER Z */ }, \
+	{ 0x10CD, /* GEORGIAN CAPITAL LETTER AEN */ \
+	  0x2D2D, /* GEORGIAN SMALL LETTER AEN */ }, \
+	{ 0x10A0, /* GEORGIAN CAPITAL LETTER AN */ \
+	  0x2D00, /* GEORGIAN SMALL LETTER AN */ }, \
+	{ 0x10A1, /* GEORGIAN CAPITAL LETTER BAN */ \
+	  0x2D01, /* GEORGIAN SMALL LETTER BAN */ }, \
+	{ 0x10BA, /* GEORGIAN CAPITAL LETTER CAN */ \
+	  0x2D1A, /* GEORGIAN SMALL LETTER CAN */ }, \
+	{ 0x10BD, /* GEORGIAN CAPITAL LETTER CHAR */ \
+	  0x2D1D, /* GEORGIAN SMALL LETTER CHAR */ }, \
+	{ 0x10B9, /* GEORGIAN CAPITAL LETTER CHIN */ \
+	  0x2D19, /* GEORGIAN SMALL LETTER CHIN */ }, \
+	{ 0x10BC, /* GEORGIAN CAPITAL LETTER CIL */ \
+	  0x2D1C, /* GEORGIAN SMALL LETTER CIL */ }, \
+	{ 0x10A3, /* GEORGIAN CAPITAL LETTER DON */ \
+	  0x2D03, /* GEORGIAN SMALL LETTER DON */ }, \
+	{ 0x10A4, /* GEORGIAN CAPITAL LETTER EN */ \
+	  0x2D04, /* GEORGIAN SMALL LETTER EN */ }, \
+	{ 0x10A2, /* GEORGIAN CAPITAL LETTER GAN */ \
+	  0x2D02, /* GEORGIAN SMALL LETTER GAN */ }, \
+	{ 0x10B6, /* GEORGIAN CAPITAL LETTER GHAN */ \
+	  0x2D16, /* GEORGIAN SMALL LETTER GHAN */ }, \
+	{ 0x10C0, /* GEORGIAN CAPITAL LETTER HAE */ \
+	  0x2D20, /* GEORGIAN SMALL LETTER HAE */ }, \
+	{ 0x10C4, /* GEORGIAN CAPITAL LETTER HAR */ \
+	  0x2D24, /* GEORGIAN SMALL LETTER HAR */ }, \
+	{ 0x10C1, /* GEORGIAN CAPITAL LETTER HE */ \
+	  0x2D21, /* GEORGIAN SMALL LETTER HE */ }, \
+	{ 0x10C2, /* GEORGIAN CAPITAL LETTER HIE */ \
+	  0x2D22, /* GEORGIAN SMALL LETTER HIE */ }, \
+	{ 0x10C5, /* GEORGIAN CAPITAL LETTER HOE */ \
+	  0x2D25, /* GEORGIAN SMALL LETTER HOE */ }, \
+	{ 0x10A8, /* GEORGIAN CAPITAL LETTER IN */ \
+	  0x2D08, /* GEORGIAN SMALL LETTER IN */ }, \
+	{ 0x10BF, /* GEORGIAN CAPITAL LETTER JHAN */ \
+	  0x2D1F, /* GEORGIAN SMALL LETTER JHAN */ }, \
+	{ 0x10BB, /* GEORGIAN CAPITAL LETTER JIL */ \
+	  0x2D1B, /* GEORGIAN SMALL LETTER JIL */ }, \
+	{ 0x10A9, /* GEORGIAN CAPITAL LETTER KAN */ \
+	  0x2D09, /* GEORGIAN SMALL LETTER KAN */ }, \
+	{ 0x10B5, /* GEORGIAN CAPITAL LETTER KHAR */ \
+	  0x2D15, /* GEORGIAN SMALL LETTER KHAR */ }, \
+	{ 0x10AA, /* GEORGIAN CAPITAL LETTER LAS */ \
+	  0x2D0A, /* GEORGIAN SMALL LETTER LAS */ }, \
+	{ 0x10AB, /* GEORGIAN CAPITAL LETTER MAN */ \
+	  0x2D0B, /* GEORGIAN SMALL LETTER MAN */ }, \
+	{ 0x10AC, /* GEORGIAN CAPITAL LETTER NAR */ \
+	  0x2D0C, /* GEORGIAN SMALL LETTER NAR */ }, \
+	{ 0x10AD, /* GEORGIAN CAPITAL LETTER ON */ \
+	  0x2D0D, /* GEORGIAN SMALL LETTER ON */ }, \
+	{ 0x10AE, /* GEORGIAN CAPITAL LETTER PAR */ \
+	  0x2D0E, /* GEORGIAN SMALL LETTER PAR */ }, \
+	{ 0x10B4, /* GEORGIAN CAPITAL LETTER PHAR */ \
+	  0x2D14, /* GEORGIAN SMALL LETTER PHAR */ }, \
+	{ 0x10B7, /* GEORGIAN CAPITAL LETTER QAR */ \
+	  0x2D17, /* GEORGIAN SMALL LETTER QAR */ }, \
+	{ 0x10B0, /* GEORGIAN CAPITAL LETTER RAE */ \
+	  0x2D10, /* GEORGIAN SMALL LETTER RAE */ }, \
+	{ 0x10B1, /* GEORGIAN CAPITAL LETTER SAN */ \
+	  0x2D11, /* GEORGIAN SMALL LETTER SAN */ }, \
+	{ 0x10B8, /* GEORGIAN CAPITAL LETTER SHIN */ \
+	  0x2D18, /* GEORGIAN SMALL LETTER SHIN */ }, \
+	{ 0x10A7, /* GEORGIAN CAPITAL LETTER TAN */ \
+	  0x2D07, /* GEORGIAN SMALL LETTER TAN */ }, \
+	{ 0x10B2, /* GEORGIAN CAPITAL LETTER TAR */ \
+	  0x2D12, /* GEORGIAN SMALL LETTER TAR */ }, \
+	{ 0x10B3, /* GEORGIAN CAPITAL LETTER UN */ \
+	  0x2D13, /* GEORGIAN SMALL LETTER UN */ }, \
+	{ 0x10A5, /* GEORGIAN CAPITAL LETTER VIN */ \
+	  0x2D05, /* GEORGIAN SMALL LETTER VIN */ }, \
+	{ 0x10C3, /* GEORGIAN CAPITAL LETTER WE */ \
+	  0x2D23, /* GEORGIAN SMALL LETTER WE */ }, \
+	{ 0x10BE, /* GEORGIAN CAPITAL LETTER XAN */ \
+	  0x2D1E, /* GEORGIAN SMALL LETTER XAN */ }, \
+	{ 0x10C7, /* GEORGIAN CAPITAL LETTER YN */ \
+	  0x2D27, /* GEORGIAN SMALL LETTER YN */ }, \
+	{ 0x10A6, /* GEORGIAN CAPITAL LETTER ZEN */ \
+	  0x2D06, /* GEORGIAN SMALL LETTER ZEN */ }, \
+	{ 0x10AF, /* GEORGIAN CAPITAL LETTER ZHAR */ \
+	  0x2D0F, /* GEORGIAN SMALL LETTER ZHAR */ }, \
+	{ 0x2C00, /* GLAGOLITIC CAPITAL LETTER AZU */ \
+	  0x2C30, /* GLAGOLITIC SMALL LETTER AZU */ }, \
+	{ 0x2C28, /* GLAGOLITIC CAPITAL LETTER BIG YUS */ \
+	  0x2C58, /* GLAGOLITIC SMALL LETTER BIG YUS */ }, \
+	{ 0x2C01, /* GLAGOLITIC CAPITAL LETTER BUKY */ \
+	  0x2C31, /* GLAGOLITIC SMALL LETTER BUKY */ }, \
+	{ 0x2C1D, /* GLAGOLITIC CAPITAL LETTER CHRIVI */ \
+	  0x2C4D, /* GLAGOLITIC SMALL LETTER CHRIVI */ }, \
+	{ 0x2C0C, /* GLAGOLITIC CAPITAL LETTER DJERVI */ \
+	  0x2C3C, /* GLAGOLITIC SMALL LETTER DJERVI */ }, \
+	{ 0x2C04, /* GLAGOLITIC CAPITAL LETTER DOBRO */ \
+	  0x2C34, /* GLAGOLITIC SMALL LETTER DOBRO */ }, \
+	{ 0x2C07, /* GLAGOLITIC CAPITAL LETTER DZELO */ \
+	  0x2C37, /* GLAGOLITIC SMALL LETTER DZELO */ }, \
+	{ 0x2C2A, /* GLAGOLITIC CAPITAL LETTER FITA */ \
+	  0x2C5A, /* GLAGOLITIC SMALL LETTER FITA */ }, \
+	{ 0x2C17, /* GLAGOLITIC CAPITAL LETTER FRITU */ \
+	  0x2C47, /* GLAGOLITIC SMALL LETTER FRITU */ }, \
+	{ 0x2C03, /* GLAGOLITIC CAPITAL LETTER GLAGOLI */ \
+	  0x2C33, /* GLAGOLITIC SMALL LETTER GLAGOLI */ }, \
+	{ 0x2C18, /* GLAGOLITIC CAPITAL LETTER HERU */ \
+	  0x2C48, /* GLAGOLITIC SMALL LETTER HERU */ }, \
+	{ 0x2C0B, /* GLAGOLITIC CAPITAL LETTER I */ \
+	  0x2C3B, /* GLAGOLITIC SMALL LETTER I */ }, \
+	{ 0x2C0A, /* GLAGOLITIC CAPITAL LETTER INITIAL IZHE */ \
+	  0x2C3A, /* GLAGOLITIC SMALL LETTER INITIAL IZHE */ }, \
+	{ 0x2C29, /* GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS */ \
+	  0x2C59, /* GLAGOLITIC SMALL LETTER IOTATED BIG YUS */ }, \
+	{ 0x2C27, /* GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS */ \
+	  0x2C57, /* GLAGOLITIC SMALL LETTER IOTATED SMALL YUS */ }, \
+	{ 0x2C09, /* GLAGOLITIC CAPITAL LETTER IZHE */ \
+	  0x2C39, /* GLAGOLITIC SMALL LETTER IZHE */ }, \
+	{ 0x2C2B, /* GLAGOLITIC CAPITAL LETTER IZHITSA */ \
+	  0x2C5B, /* GLAGOLITIC SMALL LETTER IZHITSA */ }, \
+	{ 0x2C0D, /* GLAGOLITIC CAPITAL LETTER KAKO */ \
+	  0x2C3D, /* GLAGOLITIC SMALL LETTER KAKO */ }, \
+	{ 0x2C2E, /* GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE */ \
+	  0x2C5E, /* GLAGOLITIC SMALL LETTER LATINATE MYSLITE */ }, \
+	{ 0x2C0E, /* GLAGOLITIC CAPITAL LETTER LJUDIJE */ \
+	  0x2C3E, /* GLAGOLITIC SMALL LETTER LJUDIJE */ }, \
+	{ 0x2C0F, /* GLAGOLITIC CAPITAL LETTER MYSLITE */ \
+	  0x2C3F, /* GLAGOLITIC SMALL LETTER MYSLITE */ }, \
+	{ 0x2C10, /* GLAGOLITIC CAPITAL LETTER NASHI */ \
+	  0x2C40, /* GLAGOLITIC SMALL LETTER NASHI */ }, \
+	{ 0x2C11, /* GLAGOLITIC CAPITAL LETTER ONU */ \
+	  0x2C41, /* GLAGOLITIC SMALL LETTER ONU */ }, \
+	{ 0x2C19, /* GLAGOLITIC CAPITAL LETTER OTU */ \
+	  0x2C49, /* GLAGOLITIC SMALL LETTER OTU */ }, \
+	{ 0x2C1A, /* GLAGOLITIC CAPITAL LETTER PE */ \
+	  0x2C4A, /* GLAGOLITIC SMALL LETTER PE */ }, \
+	{ 0x2C12, /* GLAGOLITIC CAPITAL LETTER POKOJI */ \
+	  0x2C42, /* GLAGOLITIC SMALL LETTER POKOJI */ }, \
+	{ 0x2C13, /* GLAGOLITIC CAPITAL LETTER RITSI */ \
+	  0x2C43, /* GLAGOLITIC SMALL LETTER RITSI */ }, \
+	{ 0x2C1E, /* GLAGOLITIC CAPITAL LETTER SHA */ \
+	  0x2C4E, /* GLAGOLITIC SMALL LETTER SHA */ }, \
+	{ 0x2C1B, /* GLAGOLITIC CAPITAL LETTER SHTA */ \
+	  0x2C4B, /* GLAGOLITIC SMALL LETTER SHTA */ }, \
+	{ 0x2C2C, /* GLAGOLITIC CAPITAL LETTER SHTAPIC */ \
+	  0x2C5C, /* GLAGOLITIC SMALL LETTER SHTAPIC */ }, \
+	{ 0x2C14, /* GLAGOLITIC CAPITAL LETTER SLOVO */ \
+	  0x2C44, /* GLAGOLITIC SMALL LETTER SLOVO */ }, \
+	{ 0x2C24, /* GLAGOLITIC CAPITAL LETTER SMALL YUS */ \
+	  0x2C54, /* GLAGOLITIC SMALL LETTER SMALL YUS */ }, \
+	{ 0x2C25, /* GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL */ \
+	  0x2C55, /* GLAGOLITIC SMALL LETTER SMALL YUS WITH TAIL */ }, \
+	{ 0x2C22, /* GLAGOLITIC CAPITAL LETTER SPIDERY HA */ \
+	  0x2C52, /* GLAGOLITIC SMALL LETTER SPIDERY HA */ }, \
+	{ 0x2C2D, /* GLAGOLITIC CAPITAL LETTER TROKUTASTI A */ \
+	  0x2C5D, /* GLAGOLITIC SMALL LETTER TROKUTASTI A */ }, \
+	{ 0x2C1C, /* GLAGOLITIC CAPITAL LETTER TSI */ \
+	  0x2C4C, /* GLAGOLITIC SMALL LETTER TSI */ }, \
+	{ 0x2C15, /* GLAGOLITIC CAPITAL LETTER TVRIDO */ \
+	  0x2C45, /* GLAGOLITIC SMALL LETTER TVRIDO */ }, \
+	{ 0x2C16, /* GLAGOLITIC CAPITAL LETTER UKU */ \
+	  0x2C46, /* GLAGOLITIC SMALL LETTER UKU */ }, \
+	{ 0x2C02, /* GLAGOLITIC CAPITAL LETTER VEDE */ \
+	  0x2C32, /* GLAGOLITIC SMALL LETTER VEDE */ }, \
+	{ 0x2C21, /* GLAGOLITIC CAPITAL LETTER YATI */ \
+	  0x2C51, /* GLAGOLITIC SMALL LETTER YATI */ }, \
+	{ 0x2C20, /* GLAGOLITIC CAPITAL LETTER YERI */ \
+	  0x2C50, /* GLAGOLITIC SMALL LETTER YERI */ }, \
+	{ 0x2C1F, /* GLAGOLITIC CAPITAL LETTER YERU */ \
+	  0x2C4F, /* GLAGOLITIC SMALL LETTER YERU */ }, \
+	{ 0x2C05, /* GLAGOLITIC CAPITAL LETTER YESTU */ \
+	  0x2C35, /* GLAGOLITIC SMALL LETTER YESTU */ }, \
+	{ 0x2C26, /* GLAGOLITIC CAPITAL LETTER YO */ \
+	  0x2C56, /* GLAGOLITIC SMALL LETTER YO */ }, \
+	{ 0x2C23, /* GLAGOLITIC CAPITAL LETTER YU */ \
+	  0x2C53, /* GLAGOLITIC SMALL LETTER YU */ }, \
+	{ 0x2C08, /* GLAGOLITIC CAPITAL LETTER ZEMLJA */ \
+	  0x2C38, /* GLAGOLITIC SMALL LETTER ZEMLJA */ }, \
+	{ 0x2C06, /* GLAGOLITIC CAPITAL LETTER ZHIVETE */ \
+	  0x2C36, /* GLAGOLITIC SMALL LETTER ZHIVETE */ }, \
+	{ 0x0391, /* GREEK CAPITAL LETTER ALPHA */ \
+	  0x03B1, /* GREEK SMALL LETTER ALPHA */ }, \
+	{ 0x1F09, /* GREEK CAPITAL LETTER ALPHA WITH DASIA */ \
+	  0x1F01, /* GREEK SMALL LETTER ALPHA WITH DASIA */ }, \
+	{ 0x1F0D, /* GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA */ \
+	  0x1F05, /* GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA */ }, \
+	{ 0x1F0F, /* GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI */ \
+	  0x1F07, /* GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI */ }, \
+	{ 0x1F0B, /* GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA */ \
+	  0x1F03, /* GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA */ }, \
+	{ 0x1FB9, /* GREEK CAPITAL LETTER ALPHA WITH MACRON */ \
+	  0x1FB1, /* GREEK SMALL LETTER ALPHA WITH MACRON */ }, \
+	{ 0x1FBB, /* GREEK CAPITAL LETTER ALPHA WITH OXIA */ \
+	  0x1F71, /* GREEK SMALL LETTER ALPHA WITH OXIA */ }, \
+	{ 0x1F08, /* GREEK CAPITAL LETTER ALPHA WITH PSILI */ \
+	  0x1F00, /* GREEK SMALL LETTER ALPHA WITH PSILI */ }, \
+	{ 0x1F0C, /* GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA */ \
+	  0x1F04, /* GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA */ }, \
+	{ 0x1F0E, /* GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI */ \
+	  0x1F06, /* GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI */ }, \
+	{ 0x1F0A, /* GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA */ \
+	  0x1F02, /* GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA */ }, \
+	{ 0x0386, /* GREEK CAPITAL LETTER ALPHA WITH TONOS */ \
+	  0x03AC, /* GREEK SMALL LETTER ALPHA WITH TONOS */ }, \
+	{ 0x1FBA, /* GREEK CAPITAL LETTER ALPHA WITH VARIA */ \
+	  0x1F70, /* GREEK SMALL LETTER ALPHA WITH VARIA */ }, \
+	{ 0x1FB8, /* GREEK CAPITAL LETTER ALPHA WITH VRACHY */ \
+	  0x1FB0, /* GREEK SMALL LETTER ALPHA WITH VRACHY */ }, \
+	{ 0x0372, /* GREEK CAPITAL LETTER ARCHAIC SAMPI */ \
+	  0x0373, /* GREEK SMALL LETTER ARCHAIC SAMPI */ }, \
+	{ 0x0392, /* GREEK CAPITAL LETTER BETA */ \
+	  0x03B2, /* GREEK SMALL LETTER BETA */ }, \
+	{ 0x03A7, /* GREEK CAPITAL LETTER CHI */ \
+	  0x03C7, /* GREEK SMALL LETTER CHI */ }, \
+	{ 0x0394, /* GREEK CAPITAL LETTER DELTA */ \
+	  0x03B4, /* GREEK SMALL LETTER DELTA */ }, \
+	{ 0x0395, /* GREEK CAPITAL LETTER EPSILON */ \
+	  0x03B5, /* GREEK SMALL LETTER EPSILON */ }, \
+	{ 0x1F19, /* GREEK CAPITAL LETTER EPSILON WITH DASIA */ \
+	  0x1F11, /* GREEK SMALL LETTER EPSILON WITH DASIA */ }, \
+	{ 0x1F1D, /* GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA */ \
+	  0x1F15, /* GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA */ }, \
+	{ 0x1F1B, /* GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA */ \
+	  0x1F13, /* GREEK SMALL LETTER EPSILON WITH DASIA AND VARIA */ }, \
+	{ 0x1FC9, /* GREEK CAPITAL LETTER EPSILON WITH OXIA */ \
+	  0x1F73, /* GREEK SMALL LETTER EPSILON WITH OXIA */ }, \
+	{ 0x1F18, /* GREEK CAPITAL LETTER EPSILON WITH PSILI */ \
+	  0x1F10, /* GREEK SMALL LETTER EPSILON WITH PSILI */ }, \
+	{ 0x1F1C, /* GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA */ \
+	  0x1F14, /* GREEK SMALL LETTER EPSILON WITH PSILI AND OXIA */ }, \
+	{ 0x1F1A, /* GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA */ \
+	  0x1F12, /* GREEK SMALL LETTER EPSILON WITH PSILI AND VARIA */ }, \
+	{ 0x0388, /* GREEK CAPITAL LETTER EPSILON WITH TONOS */ \
+	  0x03AD, /* GREEK SMALL LETTER EPSILON WITH TONOS */ }, \
+	{ 0x1FC8, /* GREEK CAPITAL LETTER EPSILON WITH VARIA */ \
+	  0x1F72, /* GREEK SMALL LETTER EPSILON WITH VARIA */ }, \
+	{ 0x0397, /* GREEK CAPITAL LETTER ETA */ \
+	  0x03B7, /* GREEK SMALL LETTER ETA */ }, \
+	{ 0x1F29, /* GREEK CAPITAL LETTER ETA WITH DASIA */ \
+	  0x1F21, /* GREEK SMALL LETTER ETA WITH DASIA */ }, \
+	{ 0x1F2D, /* GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA */ \
+	  0x1F25, /* GREEK SMALL LETTER ETA WITH DASIA AND OXIA */ }, \
+	{ 0x1F2F, /* GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI */ \
+	  0x1F27, /* GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI */ }, \
+	{ 0x1F2B, /* GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA */ \
+	  0x1F23, /* GREEK SMALL LETTER ETA WITH DASIA AND VARIA */ }, \
+	{ 0x1FCB, /* GREEK CAPITAL LETTER ETA WITH OXIA */ \
+	  0x1F75, /* GREEK SMALL LETTER ETA WITH OXIA */ }, \
+	{ 0x1F28, /* GREEK CAPITAL LETTER ETA WITH PSILI */ \
+	  0x1F20, /* GREEK SMALL LETTER ETA WITH PSILI */ }, \
+	{ 0x1F2C, /* GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA */ \
+	  0x1F24, /* GREEK SMALL LETTER ETA WITH PSILI AND OXIA */ }, \
+	{ 0x1F2E, /* GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI */ \
+	  0x1F26, /* GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI */ }, \
+	{ 0x1F2A, /* GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA */ \
+	  0x1F22, /* GREEK SMALL LETTER ETA WITH PSILI AND VARIA */ }, \
+	{ 0x0389, /* GREEK CAPITAL LETTER ETA WITH TONOS */ \
+	  0x03AE, /* GREEK SMALL LETTER ETA WITH TONOS */ }, \
+	{ 0x1FCA, /* GREEK CAPITAL LETTER ETA WITH VARIA */ \
+	  0x1F74, /* GREEK SMALL LETTER ETA WITH VARIA */ }, \
+	{ 0x0393, /* GREEK CAPITAL LETTER GAMMA */ \
+	  0x03B3, /* GREEK SMALL LETTER GAMMA */ }, \
+	{ 0x0370, /* GREEK CAPITAL LETTER HETA */ \
+	  0x0371, /* GREEK SMALL LETTER HETA */ }, \
+	{ 0x0399, /* GREEK CAPITAL LETTER IOTA */ \
+	  0x03B9, /* GREEK SMALL LETTER IOTA */ }, \
+	{ 0x1F39, /* GREEK CAPITAL LETTER IOTA WITH DASIA */ \
+	  0x1F31, /* GREEK SMALL LETTER IOTA WITH DASIA */ }, \
+	{ 0x1F3D, /* GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA */ \
+	  0x1F35, /* GREEK SMALL LETTER IOTA WITH DASIA AND OXIA */ }, \
+	{ 0x1F3F, /* GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI */ \
+	  0x1F37, /* GREEK SMALL LETTER IOTA WITH DASIA AND PERISPOMENI */ }, \
+	{ 0x1F3B, /* GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA */ \
+	  0x1F33, /* GREEK SMALL LETTER IOTA WITH DASIA AND VARIA */ }, \
+	{ 0x03AA, /* GREEK CAPITAL LETTER IOTA WITH DIALYTIKA */ \
+	  0x03CA, /* GREEK SMALL LETTER IOTA WITH DIALYTIKA */ }, \
+	{ 0x1FD9, /* GREEK CAPITAL LETTER IOTA WITH MACRON */ \
+	  0x1FD1, /* GREEK SMALL LETTER IOTA WITH MACRON */ }, \
+	{ 0x1FDB, /* GREEK CAPITAL LETTER IOTA WITH OXIA */ \
+	  0x1F77, /* GREEK SMALL LETTER IOTA WITH OXIA */ }, \
+	{ 0x1F38, /* GREEK CAPITAL LETTER IOTA WITH PSILI */ \
+	  0x1F30, /* GREEK SMALL LETTER IOTA WITH PSILI */ }, \
+	{ 0x1F3C, /* GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA */ \
+	  0x1F34, /* GREEK SMALL LETTER IOTA WITH PSILI AND OXIA */ }, \
+	{ 0x1F3E, /* GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI */ \
+	  0x1F36, /* GREEK SMALL LETTER IOTA WITH PSILI AND PERISPOMENI */ }, \
+	{ 0x1F3A, /* GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA */ \
+	  0x1F32, /* GREEK SMALL LETTER IOTA WITH PSILI AND VARIA */ }, \
+	{ 0x038A, /* GREEK CAPITAL LETTER IOTA WITH TONOS */ \
+	  0x03AF, /* GREEK SMALL LETTER IOTA WITH TONOS */ }, \
+	{ 0x1FDA, /* GREEK CAPITAL LETTER IOTA WITH VARIA */ \
+	  0x1F76, /* GREEK SMALL LETTER IOTA WITH VARIA */ }, \
+	{ 0x1FD8, /* GREEK CAPITAL LETTER IOTA WITH VRACHY */ \
+	  0x1FD0, /* GREEK SMALL LETTER IOTA WITH VRACHY */ }, \
+	{ 0x039A, /* GREEK CAPITAL LETTER KAPPA */ \
+	  0x03BA, /* GREEK SMALL LETTER KAPPA */ }, \
+	{ 0x039B, /* GREEK CAPITAL LETTER LAMDA */ \
+	  0x03BB, /* GREEK SMALL LETTER LAMDA */ }, \
+	{ 0x039C, /* GREEK CAPITAL LETTER MU */ \
+	  0x03BC, /* GREEK SMALL LETTER MU */ }, \
+	{ 0x039D, /* GREEK CAPITAL LETTER NU */ \
+	  0x03BD, /* GREEK SMALL LETTER NU */ }, \
+	{ 0x03A9, /* GREEK CAPITAL LETTER OMEGA */ \
+	  0x03C9, /* GREEK SMALL LETTER OMEGA */ }, \
+	{ 0x1F69, /* GREEK CAPITAL LETTER OMEGA WITH DASIA */ \
+	  0x1F61, /* GREEK SMALL LETTER OMEGA WITH DASIA */ }, \
+	{ 0x1F6D, /* GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA */ \
+	  0x1F65, /* GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA */ }, \
+	{ 0x1F6F, /* GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI */ \
+	  0x1F67, /* GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI */ }, \
+	{ 0x1F6B, /* GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA */ \
+	  0x1F63, /* GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA */ }, \
+	{ 0x1FFB, /* GREEK CAPITAL LETTER OMEGA WITH OXIA */ \
+	  0x1F7D, /* GREEK SMALL LETTER OMEGA WITH OXIA */ }, \
+	{ 0x1F68, /* GREEK CAPITAL LETTER OMEGA WITH PSILI */ \
+	  0x1F60, /* GREEK SMALL LETTER OMEGA WITH PSILI */ }, \
+	{ 0x1F6C, /* GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA */ \
+	  0x1F64, /* GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA */ }, \
+	{ 0x1F6E, /* GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI */ \
+	  0x1F66, /* GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI */ }, \
+	{ 0x1F6A, /* GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA */ \
+	  0x1F62, /* GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA */ }, \
+	{ 0x038F, /* GREEK CAPITAL LETTER OMEGA WITH TONOS */ \
+	  0x03CE, /* GREEK SMALL LETTER OMEGA WITH TONOS */ }, \
+	{ 0x1FFA, /* GREEK CAPITAL LETTER OMEGA WITH VARIA */ \
+	  0x1F7C, /* GREEK SMALL LETTER OMEGA WITH VARIA */ }, \
+	{ 0x039F, /* GREEK CAPITAL LETTER OMICRON */ \
+	  0x03BF, /* GREEK SMALL LETTER OMICRON */ }, \
+	{ 0x1F49, /* GREEK CAPITAL LETTER OMICRON WITH DASIA */ \
+	  0x1F41, /* GREEK SMALL LETTER OMICRON WITH DASIA */ }, \
+	{ 0x1F4D, /* GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA */ \
+	  0x1F45, /* GREEK SMALL LETTER OMICRON WITH DASIA AND OXIA */ }, \
+	{ 0x1F4B, /* GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA */ \
+	  0x1F43, /* GREEK SMALL LETTER OMICRON WITH DASIA AND VARIA */ }, \
+	{ 0x1FF9, /* GREEK CAPITAL LETTER OMICRON WITH OXIA */ \
+	  0x1F79, /* GREEK SMALL LETTER OMICRON WITH OXIA */ }, \
+	{ 0x1F48, /* GREEK CAPITAL LETTER OMICRON WITH PSILI */ \
+	  0x1F40, /* GREEK SMALL LETTER OMICRON WITH PSILI */ }, \
+	{ 0x1F4C, /* GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA */ \
+	  0x1F44, /* GREEK SMALL LETTER OMICRON WITH PSILI AND OXIA */ }, \
+	{ 0x1F4A, /* GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA */ \
+	  0x1F42, /* GREEK SMALL LETTER OMICRON WITH PSILI AND VARIA */ }, \
+	{ 0x038C, /* GREEK CAPITAL LETTER OMICRON WITH TONOS */ \
+	  0x03CC, /* GREEK SMALL LETTER OMICRON WITH TONOS */ }, \
+	{ 0x1FF8, /* GREEK CAPITAL LETTER OMICRON WITH VARIA */ \
+	  0x1F78, /* GREEK SMALL LETTER OMICRON WITH VARIA */ }, \
+	{ 0x0376, /* GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA */ \
+	  0x0377, /* GREEK SMALL LETTER PAMPHYLIAN DIGAMMA */ }, \
+	{ 0x03A6, /* GREEK CAPITAL LETTER PHI */ \
+	  0x03C6, /* GREEK SMALL LETTER PHI */ }, \
+	{ 0x03A0, /* GREEK CAPITAL LETTER PI */ \
+	  0x03C0, /* GREEK SMALL LETTER PI */ }, \
+	{ 0x03A8, /* GREEK CAPITAL LETTER PSI */ \
+	  0x03C8, /* GREEK SMALL LETTER PSI */ }, \
+	{ 0x03A1, /* GREEK CAPITAL LETTER RHO */ \
+	  0x03C1, /* GREEK SMALL LETTER RHO */ }, \
+	{ 0x1FEC, /* GREEK CAPITAL LETTER RHO WITH DASIA */ \
+	  0x1FE5, /* GREEK SMALL LETTER RHO WITH DASIA */ }, \
+	{ 0x03FA, /* GREEK CAPITAL LETTER SAN */ \
+	  0x03FB, /* GREEK SMALL LETTER SAN */ }, \
+	{ 0x03F7, /* GREEK CAPITAL LETTER SHO */ \
+	  0x03F8, /* GREEK SMALL LETTER SHO */ }, \
+	{ 0x03A3, /* GREEK CAPITAL LETTER SIGMA */ \
+	  0x03C3, /* GREEK SMALL LETTER SIGMA */ }, \
+	{ 0x03A4, /* GREEK CAPITAL LETTER TAU */ \
+	  0x03C4, /* GREEK SMALL LETTER TAU */ }, \
+	{ 0x0398, /* GREEK CAPITAL LETTER THETA */ \
+	  0x03B8, /* GREEK SMALL LETTER THETA */ }, \
+	{ 0x03A5, /* GREEK CAPITAL LETTER UPSILON */ \
+	  0x03C5, /* GREEK SMALL LETTER UPSILON */ }, \
+	{ 0x1F59, /* GREEK CAPITAL LETTER UPSILON WITH DASIA */ \
+	  0x1F51, /* GREEK SMALL LETTER UPSILON WITH DASIA */ }, \
+	{ 0x1F5D, /* GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA */ \
+	  0x1F55, /* GREEK SMALL LETTER UPSILON WITH DASIA AND OXIA */ }, \
+	{ 0x1F5F, /* GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI */ \
+	  0x1F57, /* GREEK SMALL LETTER UPSILON WITH DASIA AND PERISPOMENI */ }, \
+	{ 0x1F5B, /* GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA */ \
+	  0x1F53, /* GREEK SMALL LETTER UPSILON WITH DASIA AND VARIA */ }, \
+	{ 0x03AB, /* GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA */ \
+	  0x03CB, /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA */ }, \
+	{ 0x1FE9, /* GREEK CAPITAL LETTER UPSILON WITH MACRON */ \
+	  0x1FE1, /* GREEK SMALL LETTER UPSILON WITH MACRON */ }, \
+	{ 0x1FEB, /* GREEK CAPITAL LETTER UPSILON WITH OXIA */ \
+	  0x1F7B, /* GREEK SMALL LETTER UPSILON WITH OXIA */ }, \
+	{ 0x038E, /* GREEK CAPITAL LETTER UPSILON WITH TONOS */ \
+	  0x03CD, /* GREEK SMALL LETTER UPSILON WITH TONOS */ }, \
+	{ 0x1FEA, /* GREEK CAPITAL LETTER UPSILON WITH VARIA */ \
+	  0x1F7A, /* GREEK SMALL LETTER UPSILON WITH VARIA */ }, \
+	{ 0x1FE8, /* GREEK CAPITAL LETTER UPSILON WITH VRACHY */ \
+	  0x1FE0, /* GREEK SMALL LETTER UPSILON WITH VRACHY */ }, \
+	{ 0x039E, /* GREEK CAPITAL LETTER XI */ \
+	  0x03BE, /* GREEK SMALL LETTER XI */ }, \
+	{ 0x0396, /* GREEK CAPITAL LETTER ZETA */ \
+	  0x03B6, /* GREEK SMALL LETTER ZETA */ }, \
+	{ 0x0041, /* LATIN CAPITAL LETTER A */ \
+	  0x0061, /* LATIN SMALL LETTER A */ }, \
+	{ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */ \
+	  0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */ }, \
+	{ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */ \
+	  0x0103, /* LATIN SMALL LETTER A WITH BREVE */ }, \
+	{ 0x1EAE, /* LATIN CAPITAL LETTER A WITH BREVE AND ACUTE */ \
+	  0x1EAF, /* LATIN SMALL LETTER A WITH BREVE AND ACUTE */ }, \
+	{ 0x1EB6, /* LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW */ \
+	  0x1EB7, /* LATIN SMALL LETTER A WITH BREVE AND DOT BELOW */ }, \
+	{ 0x1EB0, /* LATIN CAPITAL LETTER A WITH BREVE AND GRAVE */ \
+	  0x1EB1, /* LATIN SMALL LETTER A WITH BREVE AND GRAVE */ }, \
+	{ 0x1EB2, /* LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE */ \
+	  0x1EB3, /* LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE */ }, \
+	{ 0x1EB4, /* LATIN CAPITAL LETTER A WITH BREVE AND TILDE */ \
+	  0x1EB5, /* LATIN SMALL LETTER A WITH BREVE AND TILDE */ }, \
+	{ 0x01CD, /* LATIN CAPITAL LETTER A WITH CARON */ \
+	  0x01CE, /* LATIN SMALL LETTER A WITH CARON */ }, \
+	{ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */ \
+	  0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */ }, \
+	{ 0x1EA4, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE */ \
+	  0x1EA5, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE */ }, \
+	{ 0x1EAC, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW */ \
+	  0x1EAD, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW */ }, \
+	{ 0x1EA6, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE */ \
+	  0x1EA7, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE */ }, \
+	{ 0x1EA8, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */ \
+	  0x1EA9, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */ }, \
+	{ 0x1EAA, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE */ \
+	  0x1EAB, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE */ }, \
+	{ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */ \
+	  0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */ }, \
+	{ 0x01DE, /* LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON */ \
+	  0x01DF, /* LATIN SMALL LETTER A WITH DIAERESIS AND MACRON */ }, \
+	{ 0x0226, /* LATIN CAPITAL LETTER A WITH DOT ABOVE */ \
+	  0x0227, /* LATIN SMALL LETTER A WITH DOT ABOVE */ }, \
+	{ 0x01E0, /* LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON */ \
+	  0x01E1, /* LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON */ }, \
+	{ 0x1EA0, /* LATIN CAPITAL LETTER A WITH DOT BELOW */ \
+	  0x1EA1, /* LATIN SMALL LETTER A WITH DOT BELOW */ }, \
+	{ 0x0200, /* LATIN CAPITAL LETTER A WITH DOUBLE GRAVE */ \
+	  0x0201, /* LATIN SMALL LETTER A WITH DOUBLE GRAVE */ }, \
+	{ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */ \
+	  0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */ }, \
+	{ 0x1EA2, /* LATIN CAPITAL LETTER A WITH HOOK ABOVE */ \
+	  0x1EA3, /* LATIN SMALL LETTER A WITH HOOK ABOVE */ }, \
+	{ 0x0202, /* LATIN CAPITAL LETTER A WITH INVERTED BREVE */ \
+	  0x0203, /* LATIN SMALL LETTER A WITH INVERTED BREVE */ }, \
+	{ 0x0100, /* LATIN CAPITAL LETTER A WITH MACRON */ \
+	  0x0101, /* LATIN SMALL LETTER A WITH MACRON */ }, \
+	{ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */ \
+	  0x0105, /* LATIN SMALL LETTER A WITH OGONEK */ }, \
+	{ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */ \
+	  0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */ }, \
+	{ 0x01FA, /* LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE */ \
+	  0x01FB, /* LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE */ }, \
+	{ 0x1E00, /* LATIN CAPITAL LETTER A WITH RING BELOW */ \
+	  0x1E01, /* LATIN SMALL LETTER A WITH RING BELOW */ }, \
+	{ 0x023A, /* LATIN CAPITAL LETTER A WITH STROKE */ \
+	  0x2C65, /* LATIN SMALL LETTER A WITH STROKE */ }, \
+	{ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */ \
+	  0x00E3, /* LATIN SMALL LETTER A WITH TILDE */ }, \
+	{ 0xA732, /* LATIN CAPITAL LETTER AA */ \
+	  0xA733, /* LATIN SMALL LETTER AA */ }, \
+	{ 0x00C6, /* LATIN CAPITAL LETTER AE */ \
+	  0x00E6, /* LATIN SMALL LETTER AE */ }, \
+	{ 0x01FC, /* LATIN CAPITAL LETTER AE WITH ACUTE */ \
+	  0x01FD, /* LATIN SMALL LETTER AE WITH ACUTE */ }, \
+	{ 0x01E2, /* LATIN CAPITAL LETTER AE WITH MACRON */ \
+	  0x01E3, /* LATIN SMALL LETTER AE WITH MACRON */ }, \
+	{ 0x2C6D, /* LATIN CAPITAL LETTER ALPHA */ \
+	  0x0251, /* LATIN SMALL LETTER ALPHA */ }, \
+	{ 0xA734, /* LATIN CAPITAL LETTER AO */ \
+	  0xA735, /* LATIN SMALL LETTER AO */ }, \
+	{ 0xA736, /* LATIN CAPITAL LETTER AU */ \
+	  0xA737, /* LATIN SMALL LETTER AU */ }, \
+	{ 0xA738, /* LATIN CAPITAL LETTER AV */ \
+	  0xA739, /* LATIN SMALL LETTER AV */ }, \
+	{ 0xA73A, /* LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR */ \
+	  0xA73B, /* LATIN SMALL LETTER AV WITH HORIZONTAL BAR */ }, \
+	{ 0xA73C, /* LATIN CAPITAL LETTER AY */ \
+	  0xA73D, /* LATIN SMALL LETTER AY */ }, \
+	{ 0x0042, /* LATIN CAPITAL LETTER B */ \
+	  0x0062, /* LATIN SMALL LETTER B */ }, \
+	{ 0x1E02, /* LATIN CAPITAL LETTER B WITH DOT ABOVE */ \
+	  0x1E03, /* LATIN SMALL LETTER B WITH DOT ABOVE */ }, \
+	{ 0x1E04, /* LATIN CAPITAL LETTER B WITH DOT BELOW */ \
+	  0x1E05, /* LATIN SMALL LETTER B WITH DOT BELOW */ }, \
+	{ 0xA796, /* LATIN CAPITAL LETTER B WITH FLOURISH */ \
+	  0xA797, /* LATIN SMALL LETTER B WITH FLOURISH */ }, \
+	{ 0x0181, /* LATIN CAPITAL LETTER B WITH HOOK */ \
+	  0x0253, /* LATIN SMALL LETTER B WITH HOOK */ }, \
+	{ 0x1E06, /* LATIN CAPITAL LETTER B WITH LINE BELOW */ \
+	  0x1E07, /* LATIN SMALL LETTER B WITH LINE BELOW */ }, \
+	{ 0x0243, /* LATIN CAPITAL LETTER B WITH STROKE */ \
+	  0x0180, /* LATIN SMALL LETTER B WITH STROKE */ }, \
+	{ 0x0182, /* LATIN CAPITAL LETTER B WITH TOPBAR */ \
+	  0x0183, /* LATIN SMALL LETTER B WITH TOPBAR */ }, \
+	{ 0xA7B4, /* LATIN CAPITAL LETTER BETA */ \
+	  0xA7B5, /* LATIN SMALL LETTER BETA */ }, \
+	{ 0xA746, /* LATIN CAPITAL LETTER BROKEN L */ \
+	  0xA747, /* LATIN SMALL LETTER BROKEN L */ }, \
+	{ 0x0043, /* LATIN CAPITAL LETTER C */ \
+	  0x0063, /* LATIN SMALL LETTER C */ }, \
+	{ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */ \
+	  0x0107, /* LATIN SMALL LETTER C WITH ACUTE */ }, \
+	{ 0xA792, /* LATIN CAPITAL LETTER C WITH BAR */ \
+	  0xA793, /* LATIN SMALL LETTER C WITH BAR */ }, \
+	{ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */ \
+	  0x010D, /* LATIN SMALL LETTER C WITH CARON */ }, \
+	{ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */ \
+	  0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */ }, \
+	{ 0x1E08, /* LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ \
+	  0x1E09, /* LATIN SMALL LETTER C WITH CEDILLA AND ACUTE */ }, \
+	{ 0x0108, /* LATIN CAPITAL LETTER C WITH CIRCUMFLEX */ \
+	  0x0109, /* LATIN SMALL LETTER C WITH CIRCUMFLEX */ }, \
+	{ 0x010A, /* LATIN CAPITAL LETTER C WITH DOT ABOVE */ \
+	  0x010B, /* LATIN SMALL LETTER C WITH DOT ABOVE */ }, \
+	{ 0x0187, /* LATIN CAPITAL LETTER C WITH HOOK */ \
+	  0x0188, /* LATIN SMALL LETTER C WITH HOOK */ }, \
+	{ 0x023B, /* LATIN CAPITAL LETTER C WITH STROKE */ \
+	  0x023C, /* LATIN SMALL LETTER C WITH STROKE */ }, \
+	{ 0xA7B3, /* LATIN CAPITAL LETTER CHI */ \
+	  0xAB53, /* LATIN SMALL LETTER CHI */ }, \
+	{ 0xA76E, /* LATIN CAPITAL LETTER CON */ \
+	  0xA76F, /* LATIN SMALL LETTER CON */ }, \
+	{ 0xA72C, /* LATIN CAPITAL LETTER CUATRILLO */ \
+	  0xA72D, /* LATIN SMALL LETTER CUATRILLO */ }, \
+	{ 0xA72E, /* LATIN CAPITAL LETTER CUATRILLO WITH COMMA */ \
+	  0xA72F, /* LATIN SMALL LETTER CUATRILLO WITH COMMA */ }, \
+	{ 0x0044, /* LATIN CAPITAL LETTER D */ \
+	  0x0064, /* LATIN SMALL LETTER D */ }, \
+	{ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */ \
+	  0x010F, /* LATIN SMALL LETTER D WITH CARON */ }, \
+	{ 0x1E10, /* LATIN CAPITAL LETTER D WITH CEDILLA */ \
+	  0x1E11, /* LATIN SMALL LETTER D WITH CEDILLA */ }, \
+	{ 0x1E12, /* LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW */ \
+	  0x1E13, /* LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW */ }, \
+	{ 0x1E0A, /* LATIN CAPITAL LETTER D WITH DOT ABOVE */ \
+	  0x1E0B, /* LATIN SMALL LETTER D WITH DOT ABOVE */ }, \
+	{ 0x1E0C, /* LATIN CAPITAL LETTER D WITH DOT BELOW */ \
+	  0x1E0D, /* LATIN SMALL LETTER D WITH DOT BELOW */ }, \
+	{ 0x018A, /* LATIN CAPITAL LETTER D WITH HOOK */ \
+	  0x0257, /* LATIN SMALL LETTER D WITH HOOK */ }, \
+	{ 0x1E0E, /* LATIN CAPITAL LETTER D WITH LINE BELOW */ \
+	  0x1E0F, /* LATIN SMALL LETTER D WITH LINE BELOW */ }, \
+	{ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */ \
+	  0x0111, /* LATIN SMALL LETTER D WITH STROKE */ }, \
+	{ 0x018B, /* LATIN CAPITAL LETTER D WITH TOPBAR */ \
+	  0x018C, /* LATIN SMALL LETTER D WITH TOPBAR */ }, \
+	{ 0x01F1, /* LATIN CAPITAL LETTER DZ */ \
+	  0x01F3, /* LATIN SMALL LETTER DZ */ }, \
+	{ 0x01C4, /* LATIN CAPITAL LETTER DZ WITH CARON */ \
+	  0x01C6, /* LATIN SMALL LETTER DZ WITH CARON */ }, \
+	{ 0x0045, /* LATIN CAPITAL LETTER E */ \
+	  0x0065, /* LATIN SMALL LETTER E */ }, \
+	{ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */ \
+	  0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */ }, \
+	{ 0x0114, /* LATIN CAPITAL LETTER E WITH BREVE */ \
+	  0x0115, /* LATIN SMALL LETTER E WITH BREVE */ }, \
+	{ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */ \
+	  0x011B, /* LATIN SMALL LETTER E WITH CARON */ }, \
+	{ 0x0228, /* LATIN CAPITAL LETTER E WITH CEDILLA */ \
+	  0x0229, /* LATIN SMALL LETTER E WITH CEDILLA */ }, \
+	{ 0x1E1C, /* LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE */ \
+	  0x1E1D, /* LATIN SMALL LETTER E WITH CEDILLA AND BREVE */ }, \
+	{ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */ \
+	  0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */ }, \
+	{ 0x1EBE, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE */ \
+	  0x1EBF, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE */ }, \
+	{ 0x1EC6, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW */ \
+	  0x1EC7, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW */ }, \
+	{ 0x1EC0, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE */ \
+	  0x1EC1, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE */ }, \
+	{ 0x1EC2, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */ \
+	  0x1EC3, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */ }, \
+	{ 0x1EC4, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE */ \
+	  0x1EC5, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE */ }, \
+	{ 0x1E18, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW */ \
+	  0x1E19, /* LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW */ }, \
+	{ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */ \
+	  0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */ }, \
+	{ 0x0116, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */ \
+	  0x0117, /* LATIN SMALL LETTER E WITH DOT ABOVE */ }, \
+	{ 0x1EB8, /* LATIN CAPITAL LETTER E WITH DOT BELOW */ \
+	  0x1EB9, /* LATIN SMALL LETTER E WITH DOT BELOW */ }, \
+	{ 0x0204, /* LATIN CAPITAL LETTER E WITH DOUBLE GRAVE */ \
+	  0x0205, /* LATIN SMALL LETTER E WITH DOUBLE GRAVE */ }, \
+	{ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */ \
+	  0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */ }, \
+	{ 0x1EBA, /* LATIN CAPITAL LETTER E WITH HOOK ABOVE */ \
+	  0x1EBB, /* LATIN SMALL LETTER E WITH HOOK ABOVE */ }, \
+	{ 0x0206, /* LATIN CAPITAL LETTER E WITH INVERTED BREVE */ \
+	  0x0207, /* LATIN SMALL LETTER E WITH INVERTED BREVE */ }, \
+	{ 0x0112, /* LATIN CAPITAL LETTER E WITH MACRON */ \
+	  0x0113, /* LATIN SMALL LETTER E WITH MACRON */ }, \
+	{ 0x1E16, /* LATIN CAPITAL LETTER E WITH MACRON AND ACUTE */ \
+	  0x1E17, /* LATIN SMALL LETTER E WITH MACRON AND ACUTE */ }, \
+	{ 0x1E14, /* LATIN CAPITAL LETTER E WITH MACRON AND GRAVE */ \
+	  0x1E15, /* LATIN SMALL LETTER E WITH MACRON AND GRAVE */ }, \
+	{ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */ \
+	  0x0119, /* LATIN SMALL LETTER E WITH OGONEK */ }, \
+	{ 0x0246, /* LATIN CAPITAL LETTER E WITH STROKE */ \
+	  0x0247, /* LATIN SMALL LETTER E WITH STROKE */ }, \
+	{ 0x1EBC, /* LATIN CAPITAL LETTER E WITH TILDE */ \
+	  0x1EBD, /* LATIN SMALL LETTER E WITH TILDE */ }, \
+	{ 0x1E1A, /* LATIN CAPITAL LETTER E WITH TILDE BELOW */ \
+	  0x1E1B, /* LATIN SMALL LETTER E WITH TILDE BELOW */ }, \
+	{ 0xA724, /* LATIN CAPITAL LETTER EGYPTOLOGICAL AIN */ \
+	  0xA725, /* LATIN SMALL LETTER EGYPTOLOGICAL AIN */ }, \
+	{ 0xA722, /* LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF */ \
+	  0xA723, /* LATIN SMALL LETTER EGYPTOLOGICAL ALEF */ }, \
+	{ 0x014A, /* LATIN CAPITAL LETTER ENG */ \
+	  0x014B, /* LATIN SMALL LETTER ENG */ }, \
+	{ 0x01A9, /* LATIN CAPITAL LETTER ESH */ \
+	  0x0283, /* LATIN SMALL LETTER ESH */ }, \
+	{ 0xA76A, /* LATIN CAPITAL LETTER ET */ \
+	  0xA76B, /* LATIN SMALL LETTER ET */ }, \
+	{ 0x00D0, /* LATIN CAPITAL LETTER ETH */ \
+	  0x00F0, /* LATIN SMALL LETTER ETH */ }, \
+	{ 0x01B7, /* LATIN CAPITAL LETTER EZH */ \
+	  0x0292, /* LATIN SMALL LETTER EZH */ }, \
+	{ 0x01B8, /* LATIN CAPITAL LETTER EZH REVERSED */ \
+	  0x01B9, /* LATIN SMALL LETTER EZH REVERSED */ }, \
+	{ 0x01EE, /* LATIN CAPITAL LETTER EZH WITH CARON */ \
+	  0x01EF, /* LATIN SMALL LETTER EZH WITH CARON */ }, \
+	{ 0x0046, /* LATIN CAPITAL LETTER F */ \
+	  0x0066, /* LATIN SMALL LETTER F */ }, \
+	{ 0x1E1E, /* LATIN CAPITAL LETTER F WITH DOT ABOVE */ \
+	  0x1E1F, /* LATIN SMALL LETTER F WITH DOT ABOVE */ }, \
+	{ 0x0191, /* LATIN CAPITAL LETTER F WITH HOOK */ \
+	  0x0192, /* LATIN SMALL LETTER F WITH HOOK */ }, \
+	{ 0xA798, /* LATIN CAPITAL LETTER F WITH STROKE */ \
+	  0xA799, /* LATIN SMALL LETTER F WITH STROKE */ }, \
+	{ 0x0047, /* LATIN CAPITAL LETTER G */ \
+	  0x0067, /* LATIN SMALL LETTER G */ }, \
+	{ 0x01F4, /* LATIN CAPITAL LETTER G WITH ACUTE */ \
+	  0x01F5, /* LATIN SMALL LETTER G WITH ACUTE */ }, \
+	{ 0x011E, /* LATIN CAPITAL LETTER G WITH BREVE */ \
+	  0x011F, /* LATIN SMALL LETTER G WITH BREVE */ }, \
+	{ 0x01E6, /* LATIN CAPITAL LETTER G WITH CARON */ \
+	  0x01E7, /* LATIN SMALL LETTER G WITH CARON */ }, \
+	{ 0x0122, /* LATIN CAPITAL LETTER G WITH CEDILLA */ \
+	  0x0123, /* LATIN SMALL LETTER G WITH CEDILLA */ }, \
+	{ 0x011C, /* LATIN CAPITAL LETTER G WITH CIRCUMFLEX */ \
+	  0x011D, /* LATIN SMALL LETTER G WITH CIRCUMFLEX */ }, \
+	{ 0x0120, /* LATIN CAPITAL LETTER G WITH DOT ABOVE */ \
+	  0x0121, /* LATIN SMALL LETTER G WITH DOT ABOVE */ }, \
+	{ 0x0193, /* LATIN CAPITAL LETTER G WITH HOOK */ \
+	  0x0260, /* LATIN SMALL LETTER G WITH HOOK */ }, \
+	{ 0x1E20, /* LATIN CAPITAL LETTER G WITH MACRON */ \
+	  0x1E21, /* LATIN SMALL LETTER G WITH MACRON */ }, \
+	{ 0xA7A0, /* LATIN CAPITAL LETTER G WITH OBLIQUE STROKE */ \
+	  0xA7A1, /* LATIN SMALL LETTER G WITH OBLIQUE STROKE */ }, \
+	{ 0x01E4, /* LATIN CAPITAL LETTER G WITH STROKE */ \
+	  0x01E5, /* LATIN SMALL LETTER G WITH STROKE */ }, \
+	{ 0x0194, /* LATIN CAPITAL LETTER GAMMA */ \
+	  0x0263, /* LATIN SMALL LETTER GAMMA */ }, \
+	{ 0x0241, /* LATIN CAPITAL LETTER GLOTTAL STOP */ \
+	  0x0242, /* LATIN SMALL LETTER GLOTTAL STOP */ }, \
+	{ 0x0048, /* LATIN CAPITAL LETTER H */ \
+	  0x0068, /* LATIN SMALL LETTER H */ }, \
+	{ 0x1E2A, /* LATIN CAPITAL LETTER H WITH BREVE BELOW */ \
+	  0x1E2B, /* LATIN SMALL LETTER H WITH BREVE BELOW */ }, \
+	{ 0x021E, /* LATIN CAPITAL LETTER H WITH CARON */ \
+	  0x021F, /* LATIN SMALL LETTER H WITH CARON */ }, \
+	{ 0x1E28, /* LATIN CAPITAL LETTER H WITH CEDILLA */ \
+	  0x1E29, /* LATIN SMALL LETTER H WITH CEDILLA */ }, \
+	{ 0x0124, /* LATIN CAPITAL LETTER H WITH CIRCUMFLEX */ \
+	  0x0125, /* LATIN SMALL LETTER H WITH CIRCUMFLEX */ }, \
+	{ 0x2C67, /* LATIN CAPITAL LETTER H WITH DESCENDER */ \
+	  0x2C68, /* LATIN SMALL LETTER H WITH DESCENDER */ }, \
+	{ 0x1E26, /* LATIN CAPITAL LETTER H WITH DIAERESIS */ \
+	  0x1E27, /* LATIN SMALL LETTER H WITH DIAERESIS */ }, \
+	{ 0x1E22, /* LATIN CAPITAL LETTER H WITH DOT ABOVE */ \
+	  0x1E23, /* LATIN SMALL LETTER H WITH DOT ABOVE */ }, \
+	{ 0x1E24, /* LATIN CAPITAL LETTER H WITH DOT BELOW */ \
+	  0x1E25, /* LATIN SMALL LETTER H WITH DOT BELOW */ }, \
+	{ 0xA7AA, /* LATIN CAPITAL LETTER H WITH HOOK */ \
+	  0x0266, /* LATIN SMALL LETTER H WITH HOOK */ }, \
+	{ 0x0126, /* LATIN CAPITAL LETTER H WITH STROKE */ \
+	  0x0127, /* LATIN SMALL LETTER H WITH STROKE */ }, \
+	{ 0x2C75, /* LATIN CAPITAL LETTER HALF H */ \
+	  0x2C76, /* LATIN SMALL LETTER HALF H */ }, \
+	{ 0xA726, /* LATIN CAPITAL LETTER HENG */ \
+	  0xA727, /* LATIN SMALL LETTER HENG */ }, \
+	{ 0x0049, /* LATIN CAPITAL LETTER I */ \
+	  0x0069, /* LATIN SMALL LETTER I */ }, \
+	{ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */ \
+	  0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */ }, \
+	{ 0x012C, /* LATIN CAPITAL LETTER I WITH BREVE */ \
+	  0x012D, /* LATIN SMALL LETTER I WITH BREVE */ }, \
+	{ 0x01CF, /* LATIN CAPITAL LETTER I WITH CARON */ \
+	  0x01D0, /* LATIN SMALL LETTER I WITH CARON */ }, \
+	{ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */ \
+	  0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */ }, \
+	{ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */ \
+	  0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */ }, \
+	{ 0x1E2E, /* LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE */ \
+	  0x1E2F, /* LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE */ }, \
+	{ 0x1ECA, /* LATIN CAPITAL LETTER I WITH DOT BELOW */ \
+	  0x1ECB, /* LATIN SMALL LETTER I WITH DOT BELOW */ }, \
+	{ 0x0208, /* LATIN CAPITAL LETTER I WITH DOUBLE GRAVE */ \
+	  0x0209, /* LATIN SMALL LETTER I WITH DOUBLE GRAVE */ }, \
+	{ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */ \
+	  0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */ }, \
+	{ 0x1EC8, /* LATIN CAPITAL LETTER I WITH HOOK ABOVE */ \
+	  0x1EC9, /* LATIN SMALL LETTER I WITH HOOK ABOVE */ }, \
+	{ 0x020A, /* LATIN CAPITAL LETTER I WITH INVERTED BREVE */ \
+	  0x020B, /* LATIN SMALL LETTER I WITH INVERTED BREVE */ }, \
+	{ 0x012A, /* LATIN CAPITAL LETTER I WITH MACRON */ \
+	  0x012B, /* LATIN SMALL LETTER I WITH MACRON */ }, \
+	{ 0x012E, /* LATIN CAPITAL LETTER I WITH OGONEK */ \
+	  0x012F, /* LATIN SMALL LETTER I WITH OGONEK */ }, \
+	{ 0x0197, /* LATIN CAPITAL LETTER I WITH STROKE */ \
+	  0x0268, /* LATIN SMALL LETTER I WITH STROKE */ }, \
+	{ 0x0128, /* LATIN CAPITAL LETTER I WITH TILDE */ \
+	  0x0129, /* LATIN SMALL LETTER I WITH TILDE */ }, \
+	{ 0x1E2C, /* LATIN CAPITAL LETTER I WITH TILDE BELOW */ \
+	  0x1E2D, /* LATIN SMALL LETTER I WITH TILDE BELOW */ }, \
+	{ 0xA779, /* LATIN CAPITAL LETTER INSULAR D */ \
+	  0xA77A, /* LATIN SMALL LETTER INSULAR D */ }, \
+	{ 0xA77B, /* LATIN CAPITAL LETTER INSULAR F */ \
+	  0xA77C, /* LATIN SMALL LETTER INSULAR F */ }, \
+	{ 0xA77D, /* LATIN CAPITAL LETTER INSULAR G */ \
+	  0x1D79, /* LATIN SMALL LETTER INSULAR G */ }, \
+	{ 0xA782, /* LATIN CAPITAL LETTER INSULAR R */ \
+	  0xA783, /* LATIN SMALL LETTER INSULAR R */ }, \
+	{ 0xA784, /* LATIN CAPITAL LETTER INSULAR S */ \
+	  0xA785, /* LATIN SMALL LETTER INSULAR S */ }, \
+	{ 0xA786, /* LATIN CAPITAL LETTER INSULAR T */ \
+	  0xA787, /* LATIN SMALL LETTER INSULAR T */ }, \
+	{ 0x0196, /* LATIN CAPITAL LETTER IOTA */ \
+	  0x0269, /* LATIN SMALL LETTER IOTA */ }, \
+	{ 0xA76C, /* LATIN CAPITAL LETTER IS */ \
+	  0xA76D, /* LATIN SMALL LETTER IS */ }, \
+	{ 0x004A, /* LATIN CAPITAL LETTER J */ \
+	  0x006A, /* LATIN SMALL LETTER J */ }, \
+	{ 0x0134, /* LATIN CAPITAL LETTER J WITH CIRCUMFLEX */ \
+	  0x0135, /* LATIN SMALL LETTER J WITH CIRCUMFLEX */ }, \
+	{ 0xA7B2, /* LATIN CAPITAL LETTER J WITH CROSSED-TAIL */ \
+	  0x029D, /* LATIN SMALL LETTER J WITH CROSSED-TAIL */ }, \
+	{ 0x0248, /* LATIN CAPITAL LETTER J WITH STROKE */ \
+	  0x0249, /* LATIN SMALL LETTER J WITH STROKE */ }, \
+	{ 0x004B, /* LATIN CAPITAL LETTER K */ \
+	  0x006B, /* LATIN SMALL LETTER K */ }, \
+	{ 0x1E30, /* LATIN CAPITAL LETTER K WITH ACUTE */ \
+	  0x1E31, /* LATIN SMALL LETTER K WITH ACUTE */ }, \
+	{ 0x01E8, /* LATIN CAPITAL LETTER K WITH CARON */ \
+	  0x01E9, /* LATIN SMALL LETTER K WITH CARON */ }, \
+	{ 0x0136, /* LATIN CAPITAL LETTER K WITH CEDILLA */ \
+	  0x0137, /* LATIN SMALL LETTER K WITH CEDILLA */ }, \
+	{ 0x2C69, /* LATIN CAPITAL LETTER K WITH DESCENDER */ \
+	  0x2C6A, /* LATIN SMALL LETTER K WITH DESCENDER */ }, \
+	{ 0xA742, /* LATIN CAPITAL LETTER K WITH DIAGONAL STROKE */ \
+	  0xA743, /* LATIN SMALL LETTER K WITH DIAGONAL STROKE */ }, \
+	{ 0x1E32, /* LATIN CAPITAL LETTER K WITH DOT BELOW */ \
+	  0x1E33, /* LATIN SMALL LETTER K WITH DOT BELOW */ }, \
+	{ 0x0198, /* LATIN CAPITAL LETTER K WITH HOOK */ \
+	  0x0199, /* LATIN SMALL LETTER K WITH HOOK */ }, \
+	{ 0x1E34, /* LATIN CAPITAL LETTER K WITH LINE BELOW */ \
+	  0x1E35, /* LATIN SMALL LETTER K WITH LINE BELOW */ }, \
+	{ 0xA7A2, /* LATIN CAPITAL LETTER K WITH OBLIQUE STROKE */ \
+	  0xA7A3, /* LATIN SMALL LETTER K WITH OBLIQUE STROKE */ }, \
+	{ 0xA740, /* LATIN CAPITAL LETTER K WITH STROKE */ \
+	  0xA741, /* LATIN SMALL LETTER K WITH STROKE */ }, \
+	{ 0xA744, /* LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE */ \
+	  0xA745, /* LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE */ }, \
+	{ 0x004C, /* LATIN CAPITAL LETTER L */ \
+	  0x006C, /* LATIN SMALL LETTER L */ }, \
+	{ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */ \
+	  0x013A, /* LATIN SMALL LETTER L WITH ACUTE */ }, \
+	{ 0x023D, /* LATIN CAPITAL LETTER L WITH BAR */ \
+	  0x019A, /* LATIN SMALL LETTER L WITH BAR */ }, \
+	{ 0xA7AD, /* LATIN CAPITAL LETTER L WITH BELT */ \
+	  0x026C, /* LATIN SMALL LETTER L WITH BELT */ }, \
+	{ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */ \
+	  0x013E, /* LATIN SMALL LETTER L WITH CARON */ }, \
+	{ 0x013B, /* LATIN CAPITAL LETTER L WITH CEDILLA */ \
+	  0x013C, /* LATIN SMALL LETTER L WITH CEDILLA */ }, \
+	{ 0x1E3C, /* LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW */ \
+	  0x1E3D, /* LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW */ }, \
+	{ 0x1E36, /* LATIN CAPITAL LETTER L WITH DOT BELOW */ \
+	  0x1E37, /* LATIN SMALL LETTER L WITH DOT BELOW */ }, \
+	{ 0x1E38, /* LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON */ \
+	  0x1E39, /* LATIN SMALL LETTER L WITH DOT BELOW AND MACRON */ }, \
+	{ 0x2C60, /* LATIN CAPITAL LETTER L WITH DOUBLE BAR */ \
+	  0x2C61, /* LATIN SMALL LETTER L WITH DOUBLE BAR */ }, \
+	{ 0xA748, /* LATIN CAPITAL LETTER L WITH HIGH STROKE */ \
+	  0xA749, /* LATIN SMALL LETTER L WITH HIGH STROKE */ }, \
+	{ 0x1E3A, /* LATIN CAPITAL LETTER L WITH LINE BELOW */ \
+	  0x1E3B, /* LATIN SMALL LETTER L WITH LINE BELOW */ }, \
+	{ 0x013F, /* LATIN CAPITAL LETTER L WITH MIDDLE DOT */ \
+	  0x0140, /* LATIN SMALL LETTER L WITH MIDDLE DOT */ }, \
+	{ 0x2C62, /* LATIN CAPITAL LETTER L WITH MIDDLE TILDE */ \
+	  0x026B, /* LATIN SMALL LETTER L WITH MIDDLE TILDE */ }, \
+	{ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */ \
+	  0x0142, /* LATIN SMALL LETTER L WITH STROKE */ }, \
+	{ 0x01C7, /* LATIN CAPITAL LETTER LJ */ \
+	  0x01C9, /* LATIN SMALL LETTER LJ */ }, \
+	{ 0x004D, /* LATIN CAPITAL LETTER M */ \
+	  0x006D, /* LATIN SMALL LETTER M */ }, \
+	{ 0x1E3E, /* LATIN CAPITAL LETTER M WITH ACUTE */ \
+	  0x1E3F, /* LATIN SMALL LETTER M WITH ACUTE */ }, \
+	{ 0x1E40, /* LATIN CAPITAL LETTER M WITH DOT ABOVE */ \
+	  0x1E41, /* LATIN SMALL LETTER M WITH DOT ABOVE */ }, \
+	{ 0x1E42, /* LATIN CAPITAL LETTER M WITH DOT BELOW */ \
+	  0x1E43, /* LATIN SMALL LETTER M WITH DOT BELOW */ }, \
+	{ 0x2C6E, /* LATIN CAPITAL LETTER M WITH HOOK */ \
+	  0x0271, /* LATIN SMALL LETTER M WITH HOOK */ }, \
+	{ 0x1EFA, /* LATIN CAPITAL LETTER MIDDLE-WELSH LL */ \
+	  0x1EFB, /* LATIN SMALL LETTER MIDDLE-WELSH LL */ }, \
+	{ 0x1EFC, /* LATIN CAPITAL LETTER MIDDLE-WELSH V */ \
+	  0x1EFD, /* LATIN SMALL LETTER MIDDLE-WELSH V */ }, \
+	{ 0x004E, /* LATIN CAPITAL LETTER N */ \
+	  0x006E, /* LATIN SMALL LETTER N */ }, \
+	{ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */ \
+	  0x0144, /* LATIN SMALL LETTER N WITH ACUTE */ }, \
+	{ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */ \
+	  0x0148, /* LATIN SMALL LETTER N WITH CARON */ }, \
+	{ 0x0145, /* LATIN CAPITAL LETTER N WITH CEDILLA */ \
+	  0x0146, /* LATIN SMALL LETTER N WITH CEDILLA */ }, \
+	{ 0x1E4A, /* LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW */ \
+	  0x1E4B, /* LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW */ }, \
+	{ 0xA790, /* LATIN CAPITAL LETTER N WITH DESCENDER */ \
+	  0xA791, /* LATIN SMALL LETTER N WITH DESCENDER */ }, \
+	{ 0x1E44, /* LATIN CAPITAL LETTER N WITH DOT ABOVE */ \
+	  0x1E45, /* LATIN SMALL LETTER N WITH DOT ABOVE */ }, \
+	{ 0x1E46, /* LATIN CAPITAL LETTER N WITH DOT BELOW */ \
+	  0x1E47, /* LATIN SMALL LETTER N WITH DOT BELOW */ }, \
+	{ 0x01F8, /* LATIN CAPITAL LETTER N WITH GRAVE */ \
+	  0x01F9, /* LATIN SMALL LETTER N WITH GRAVE */ }, \
+	{ 0x019D, /* LATIN CAPITAL LETTER N WITH LEFT HOOK */ \
+	  0x0272, /* LATIN SMALL LETTER N WITH LEFT HOOK */ }, \
+	{ 0x1E48, /* LATIN CAPITAL LETTER N WITH LINE BELOW */ \
+	  0x1E49, /* LATIN SMALL LETTER N WITH LINE BELOW */ }, \
+	{ 0x0220, /* LATIN CAPITAL LETTER N WITH LONG RIGHT LEG */ \
+	  0x019E, /* LATIN SMALL LETTER N WITH LONG RIGHT LEG */ }, \
+	{ 0xA7A4, /* LATIN CAPITAL LETTER N WITH OBLIQUE STROKE */ \
+	  0xA7A5, /* LATIN SMALL LETTER N WITH OBLIQUE STROKE */ }, \
+	{ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */ \
+	  0x00F1, /* LATIN SMALL LETTER N WITH TILDE */ }, \
+	{ 0x01CA, /* LATIN CAPITAL LETTER NJ */ \
+	  0x01CC, /* LATIN SMALL LETTER NJ */ }, \
+	{ 0x004F, /* LATIN CAPITAL LETTER O */ \
+	  0x006F, /* LATIN SMALL LETTER O */ }, \
+	{ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */ \
+	  0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */ }, \
+	{ 0x014E, /* LATIN CAPITAL LETTER O WITH BREVE */ \
+	  0x014F, /* LATIN SMALL LETTER O WITH BREVE */ }, \
+	{ 0x01D1, /* LATIN CAPITAL LETTER O WITH CARON */ \
+	  0x01D2, /* LATIN SMALL LETTER O WITH CARON */ }, \
+	{ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */ \
+	  0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */ }, \
+	{ 0x1ED0, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE */ \
+	  0x1ED1, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE */ }, \
+	{ 0x1ED8, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW */ \
+	  0x1ED9, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW */ }, \
+	{ 0x1ED2, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE */ \
+	  0x1ED3, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE */ }, \
+	{ 0x1ED4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */ \
+	  0x1ED5, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */ }, \
+	{ 0x1ED6, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE */ \
+	  0x1ED7, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE */ }, \
+	{ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */ \
+	  0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */ }, \
+	{ 0x022A, /* LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON */ \
+	  0x022B, /* LATIN SMALL LETTER O WITH DIAERESIS AND MACRON */ }, \
+	{ 0x022E, /* LATIN CAPITAL LETTER O WITH DOT ABOVE */ \
+	  0x022F, /* LATIN SMALL LETTER O WITH DOT ABOVE */ }, \
+	{ 0x0230, /* LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON */ \
+	  0x0231, /* LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON */ }, \
+	{ 0x1ECC, /* LATIN CAPITAL LETTER O WITH DOT BELOW */ \
+	  0x1ECD, /* LATIN SMALL LETTER O WITH DOT BELOW */ }, \
+	{ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */ \
+	  0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */ }, \
+	{ 0x020C, /* LATIN CAPITAL LETTER O WITH DOUBLE GRAVE */ \
+	  0x020D, /* LATIN SMALL LETTER O WITH DOUBLE GRAVE */ }, \
+	{ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */ \
+	  0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */ }, \
+	{ 0x1ECE, /* LATIN CAPITAL LETTER O WITH HOOK ABOVE */ \
+	  0x1ECF, /* LATIN SMALL LETTER O WITH HOOK ABOVE */ }, \
+	{ 0x01A0, /* LATIN CAPITAL LETTER O WITH HORN */ \
+	  0x01A1, /* LATIN SMALL LETTER O WITH HORN */ }, \
+	{ 0x1EDA, /* LATIN CAPITAL LETTER O WITH HORN AND ACUTE */ \
+	  0x1EDB, /* LATIN SMALL LETTER O WITH HORN AND ACUTE */ }, \
+	{ 0x1EE2, /* LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW */ \
+	  0x1EE3, /* LATIN SMALL LETTER O WITH HORN AND DOT BELOW */ }, \
+	{ 0x1EDC, /* LATIN CAPITAL LETTER O WITH HORN AND GRAVE */ \
+	  0x1EDD, /* LATIN SMALL LETTER O WITH HORN AND GRAVE */ }, \
+	{ 0x1EDE, /* LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE */ \
+	  0x1EDF, /* LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE */ }, \
+	{ 0x1EE0, /* LATIN CAPITAL LETTER O WITH HORN AND TILDE */ \
+	  0x1EE1, /* LATIN SMALL LETTER O WITH HORN AND TILDE */ }, \
+	{ 0x020E, /* LATIN CAPITAL LETTER O WITH INVERTED BREVE */ \
+	  0x020F, /* LATIN SMALL LETTER O WITH INVERTED BREVE */ }, \
+	{ 0xA74A, /* LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY */ \
+	  0xA74B, /* LATIN SMALL LETTER O WITH LONG STROKE OVERLAY */ }, \
+	{ 0xA74C, /* LATIN CAPITAL LETTER O WITH LOOP */ \
+	  0xA74D, /* LATIN SMALL LETTER O WITH LOOP */ }, \
+	{ 0x014C, /* LATIN CAPITAL LETTER O WITH MACRON */ \
+	  0x014D, /* LATIN SMALL LETTER O WITH MACRON */ }, \
+	{ 0x1E52, /* LATIN CAPITAL LETTER O WITH MACRON AND ACUTE */ \
+	  0x1E53, /* LATIN SMALL LETTER O WITH MACRON AND ACUTE */ }, \
+	{ 0x1E50, /* LATIN CAPITAL LETTER O WITH MACRON AND GRAVE */ \
+	  0x1E51, /* LATIN SMALL LETTER O WITH MACRON AND GRAVE */ }, \
+	{ 0x01EA, /* LATIN CAPITAL LETTER O WITH OGONEK */ \
+	  0x01EB, /* LATIN SMALL LETTER O WITH OGONEK */ }, \
+	{ 0x01EC, /* LATIN CAPITAL LETTER O WITH OGONEK AND MACRON */ \
+	  0x01ED, /* LATIN SMALL LETTER O WITH OGONEK AND MACRON */ }, \
+	{ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */ \
+	  0x00F8, /* LATIN SMALL LETTER O WITH STROKE */ }, \
+	{ 0x01FE, /* LATIN CAPITAL LETTER O WITH STROKE AND ACUTE */ \
+	  0x01FF, /* LATIN SMALL LETTER O WITH STROKE AND ACUTE */ }, \
+	{ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */ \
+	  0x00F5, /* LATIN SMALL LETTER O WITH TILDE */ }, \
+	{ 0x1E4C, /* LATIN CAPITAL LETTER O WITH TILDE AND ACUTE */ \
+	  0x1E4D, /* LATIN SMALL LETTER O WITH TILDE AND ACUTE */ }, \
+	{ 0x1E4E, /* LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS */ \
+	  0x1E4F, /* LATIN SMALL LETTER O WITH TILDE AND DIAERESIS */ }, \
+	{ 0x022C, /* LATIN CAPITAL LETTER O WITH TILDE AND MACRON */ \
+	  0x022D, /* LATIN SMALL LETTER O WITH TILDE AND MACRON */ }, \
+	{ 0x01A2, /* LATIN CAPITAL LETTER OI */ \
+	  0x01A3, /* LATIN SMALL LETTER OI */ }, \
+	{ 0xA7B6, /* LATIN CAPITAL LETTER OMEGA */ \
+	  0xA7B7, /* LATIN SMALL LETTER OMEGA */ }, \
+	{ 0xA74E, /* LATIN CAPITAL LETTER OO */ \
+	  0xA74F, /* LATIN SMALL LETTER OO */ }, \
+	{ 0x0190, /* LATIN CAPITAL LETTER OPEN E */ \
+	  0x025B, /* LATIN SMALL LETTER OPEN E */ }, \
+	{ 0x0186, /* LATIN CAPITAL LETTER OPEN O */ \
+	  0x0254, /* LATIN SMALL LETTER OPEN O */ }, \
+	{ 0x0222, /* LATIN CAPITAL LETTER OU */ \
+	  0x0223, /* LATIN SMALL LETTER OU */ }, \
+	{ 0x0050, /* LATIN CAPITAL LETTER P */ \
+	  0x0070, /* LATIN SMALL LETTER P */ }, \
+	{ 0x1E54, /* LATIN CAPITAL LETTER P WITH ACUTE */ \
+	  0x1E55, /* LATIN SMALL LETTER P WITH ACUTE */ }, \
+	{ 0x1E56, /* LATIN CAPITAL LETTER P WITH DOT ABOVE */ \
+	  0x1E57, /* LATIN SMALL LETTER P WITH DOT ABOVE */ }, \
+	{ 0xA752, /* LATIN CAPITAL LETTER P WITH FLOURISH */ \
+	  0xA753, /* LATIN SMALL LETTER P WITH FLOURISH */ }, \
+	{ 0x01A4, /* LATIN CAPITAL LETTER P WITH HOOK */ \
+	  0x01A5, /* LATIN SMALL LETTER P WITH HOOK */ }, \
+	{ 0xA754, /* LATIN CAPITAL LETTER P WITH SQUIRREL TAIL */ \
+	  0xA755, /* LATIN SMALL LETTER P WITH SQUIRREL TAIL */ }, \
+	{ 0x2C63, /* LATIN CAPITAL LETTER P WITH STROKE */ \
+	  0x1D7D, /* LATIN SMALL LETTER P WITH STROKE */ }, \
+	{ 0xA750, /* LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER */ \
+	  0xA751, /* LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER */ }, \
+	{ 0x0051, /* LATIN CAPITAL LETTER Q */ \
+	  0x0071, /* LATIN SMALL LETTER Q */ }, \
+	{ 0xA758, /* LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE */ \
+	  0xA759, /* LATIN SMALL LETTER Q WITH DIAGONAL STROKE */ }, \
+	{ 0xA756, /* LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER */ \
+	  0xA757, /* LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER */ }, \
+	{ 0x0052, /* LATIN CAPITAL LETTER R */ \
+	  0x0072, /* LATIN SMALL LETTER R */ }, \
+	{ 0xA75A, /* LATIN CAPITAL LETTER R ROTUNDA */ \
+	  0xA75B, /* LATIN SMALL LETTER R ROTUNDA */ }, \
+	{ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */ \
+	  0x0155, /* LATIN SMALL LETTER R WITH ACUTE */ }, \
+	{ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */ \
+	  0x0159, /* LATIN SMALL LETTER R WITH CARON */ }, \
+	{ 0x0156, /* LATIN CAPITAL LETTER R WITH CEDILLA */ \
+	  0x0157, /* LATIN SMALL LETTER R WITH CEDILLA */ }, \
+	{ 0x1E58, /* LATIN CAPITAL LETTER R WITH DOT ABOVE */ \
+	  0x1E59, /* LATIN SMALL LETTER R WITH DOT ABOVE */ }, \
+	{ 0x1E5A, /* LATIN CAPITAL LETTER R WITH DOT BELOW */ \
+	  0x1E5B, /* LATIN SMALL LETTER R WITH DOT BELOW */ }, \
+	{ 0x1E5C, /* LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON */ \
+	  0x1E5D, /* LATIN SMALL LETTER R WITH DOT BELOW AND MACRON */ }, \
+	{ 0x0210, /* LATIN CAPITAL LETTER R WITH DOUBLE GRAVE */ \
+	  0x0211, /* LATIN SMALL LETTER R WITH DOUBLE GRAVE */ }, \
+	{ 0x0212, /* LATIN CAPITAL LETTER R WITH INVERTED BREVE */ \
+	  0x0213, /* LATIN SMALL LETTER R WITH INVERTED BREVE */ }, \
+	{ 0x1E5E, /* LATIN CAPITAL LETTER R WITH LINE BELOW */ \
+	  0x1E5F, /* LATIN SMALL LETTER R WITH LINE BELOW */ }, \
+	{ 0xA7A6, /* LATIN CAPITAL LETTER R WITH OBLIQUE STROKE */ \
+	  0xA7A7, /* LATIN SMALL LETTER R WITH OBLIQUE STROKE */ }, \
+	{ 0x024C, /* LATIN CAPITAL LETTER R WITH STROKE */ \
+	  0x024D, /* LATIN SMALL LETTER R WITH STROKE */ }, \
+	{ 0x2C64, /* LATIN CAPITAL LETTER R WITH TAIL */ \
+	  0x027D, /* LATIN SMALL LETTER R WITH TAIL */ }, \
+	{ 0xA73E, /* LATIN CAPITAL LETTER REVERSED C WITH DOT */ \
+	  0xA73F, /* LATIN SMALL LETTER REVERSED C WITH DOT */ }, \
+	{ 0x018E, /* LATIN CAPITAL LETTER REVERSED E */ \
+	  0x0258, /* LATIN SMALL LETTER REVERSED E */ }, \
+	{ 0xA7AB, /* LATIN CAPITAL LETTER REVERSED OPEN E */ \
+	  0x025C, /* LATIN SMALL LETTER REVERSED OPEN E */ }, \
+	{ 0xA75C, /* LATIN CAPITAL LETTER RUM ROTUNDA */ \
+	  0xA75D, /* LATIN SMALL LETTER RUM ROTUNDA */ }, \
+	{ 0x0053, /* LATIN CAPITAL LETTER S */ \
+	  0x0073, /* LATIN SMALL LETTER S */ }, \
+	{ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */ \
+	  0x015B, /* LATIN SMALL LETTER S WITH ACUTE */ }, \
+	{ 0x1E64, /* LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE */ \
+	  0x1E65, /* LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE */ }, \
+	{ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */ \
+	  0x0161, /* LATIN SMALL LETTER S WITH CARON */ }, \
+	{ 0x1E66, /* LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE */ \
+	  0x1E67, /* LATIN SMALL LETTER S WITH CARON AND DOT ABOVE */ }, \
+	{ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */ \
+	  0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */ }, \
+	{ 0x015C, /* LATIN CAPITAL LETTER S WITH CIRCUMFLEX */ \
+	  0x015D, /* LATIN SMALL LETTER S WITH CIRCUMFLEX */ }, \
+	{ 0x0218, /* LATIN CAPITAL LETTER S WITH COMMA BELOW */ \
+	  0x0219, /* LATIN SMALL LETTER S WITH COMMA BELOW */ }, \
+	{ 0x1E60, /* LATIN CAPITAL LETTER S WITH DOT ABOVE */ \
+	  0x1E61, /* LATIN SMALL LETTER S WITH DOT ABOVE */ }, \
+	{ 0x1E62, /* LATIN CAPITAL LETTER S WITH DOT BELOW */ \
+	  0x1E63, /* LATIN SMALL LETTER S WITH DOT BELOW */ }, \
+	{ 0x1E68, /* LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE */ \
+	  0x1E69, /* LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE */ }, \
+	{ 0xA7A8, /* LATIN CAPITAL LETTER S WITH OBLIQUE STROKE */ \
+	  0xA7A9, /* LATIN SMALL LETTER S WITH OBLIQUE STROKE */ }, \
+	{ 0x2C7E, /* LATIN CAPITAL LETTER S WITH SWASH TAIL */ \
+	  0x023F, /* LATIN SMALL LETTER S WITH SWASH TAIL */ }, \
+	{ 0xA78B, /* LATIN CAPITAL LETTER SALTILLO */ \
+	  0xA78C, /* LATIN SMALL LETTER SALTILLO */ }, \
+	{ 0x018F, /* LATIN CAPITAL LETTER SCHWA */ \
+	  0x0259, /* LATIN SMALL LETTER SCHWA */ }, \
+	{ 0xA7AC, /* LATIN CAPITAL LETTER SCRIPT G */ \
+	  0x0261, /* LATIN SMALL LETTER SCRIPT G */ }, \
+	{ 0x1E9E, /* LATIN CAPITAL LETTER SHARP S */ \
+	  0x00DF, /* LATIN SMALL LETTER SHARP S */ }, \
+	{ 0x0054, /* LATIN CAPITAL LETTER T */ \
+	  0x0074, /* LATIN SMALL LETTER T */ }, \
+	{ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */ \
+	  0x0165, /* LATIN SMALL LETTER T WITH CARON */ }, \
+	{ 0x0162, /* LATIN CAPITAL LETTER T WITH CEDILLA */ \
+	  0x0163, /* LATIN SMALL LETTER T WITH CEDILLA */ }, \
+	{ 0x1E70, /* LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW */ \
+	  0x1E71, /* LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW */ }, \
+	{ 0x021A, /* LATIN CAPITAL LETTER T WITH COMMA BELOW */ \
+	  0x021B, /* LATIN SMALL LETTER T WITH COMMA BELOW */ }, \
+	{ 0x023E, /* LATIN CAPITAL LETTER T WITH DIAGONAL STROKE */ \
+	  0x2C66, /* LATIN SMALL LETTER T WITH DIAGONAL STROKE */ }, \
+	{ 0x1E6A, /* LATIN CAPITAL LETTER T WITH DOT ABOVE */ \
+	  0x1E6B, /* LATIN SMALL LETTER T WITH DOT ABOVE */ }, \
+	{ 0x1E6C, /* LATIN CAPITAL LETTER T WITH DOT BELOW */ \
+	  0x1E6D, /* LATIN SMALL LETTER T WITH DOT BELOW */ }, \
+	{ 0x01AC, /* LATIN CAPITAL LETTER T WITH HOOK */ \
+	  0x01AD, /* LATIN SMALL LETTER T WITH HOOK */ }, \
+	{ 0x1E6E, /* LATIN CAPITAL LETTER T WITH LINE BELOW */ \
+	  0x1E6F, /* LATIN SMALL LETTER T WITH LINE BELOW */ }, \
+	{ 0x01AE, /* LATIN CAPITAL LETTER T WITH RETROFLEX HOOK */ \
+	  0x0288, /* LATIN SMALL LETTER T WITH RETROFLEX HOOK */ }, \
+	{ 0x0166, /* LATIN CAPITAL LETTER T WITH STROKE */ \
+	  0x0167, /* LATIN SMALL LETTER T WITH STROKE */ }, \
+	{ 0x00DE, /* LATIN CAPITAL LETTER THORN */ \
+	  0x00FE, /* LATIN SMALL LETTER THORN */ }, \
+	{ 0xA764, /* LATIN CAPITAL LETTER THORN WITH STROKE */ \
+	  0xA765, /* LATIN SMALL LETTER THORN WITH STROKE */ }, \
+	{ 0xA766, /* LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER */ \
+	  0xA767, /* LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER */ }, \
+	{ 0x01BC, /* LATIN CAPITAL LETTER TONE FIVE */ \
+	  0x01BD, /* LATIN SMALL LETTER TONE FIVE */ }, \
+	{ 0x0184, /* LATIN CAPITAL LETTER TONE SIX */ \
+	  0x0185, /* LATIN SMALL LETTER TONE SIX */ }, \
+	{ 0x01A7, /* LATIN CAPITAL LETTER TONE TWO */ \
+	  0x01A8, /* LATIN SMALL LETTER TONE TWO */ }, \
+	{ 0xA72A, /* LATIN CAPITAL LETTER TRESILLO */ \
+	  0xA72B, /* LATIN SMALL LETTER TRESILLO */ }, \
+	{ 0x2C6F, /* LATIN CAPITAL LETTER TURNED A */ \
+	  0x0250, /* LATIN SMALL LETTER TURNED A */ }, \
+	{ 0x2C70, /* LATIN CAPITAL LETTER TURNED ALPHA */ \
+	  0x0252, /* LATIN SMALL LETTER TURNED ALPHA */ }, \
+	{ 0xA78D, /* LATIN CAPITAL LETTER TURNED H */ \
+	  0x0265, /* LATIN SMALL LETTER TURNED H */ }, \
+	{ 0xA77E, /* LATIN CAPITAL LETTER TURNED INSULAR G */ \
+	  0xA77F, /* LATIN SMALL LETTER TURNED INSULAR G */ }, \
+	{ 0xA7B0, /* LATIN CAPITAL LETTER TURNED K */ \
+	  0x029E, /* LATIN SMALL LETTER TURNED K */ }, \
+	{ 0xA780, /* LATIN CAPITAL LETTER TURNED L */ \
+	  0xA781, /* LATIN SMALL LETTER TURNED L */ }, \
+	{ 0x019C, /* LATIN CAPITAL LETTER TURNED M */ \
+	  0x026F, /* LATIN SMALL LETTER TURNED M */ }, \
+	{ 0xA7B1, /* LATIN CAPITAL LETTER TURNED T */ \
+	  0x0287, /* LATIN SMALL LETTER TURNED T */ }, \
+	{ 0x0245, /* LATIN CAPITAL LETTER TURNED V */ \
+	  0x028C, /* LATIN SMALL LETTER TURNED V */ }, \
+	{ 0xA728, /* LATIN CAPITAL LETTER TZ */ \
+	  0xA729, /* LATIN SMALL LETTER TZ */ }, \
+	{ 0x0055, /* LATIN CAPITAL LETTER U */ \
+	  0x0075, /* LATIN SMALL LETTER U */ }, \
+	{ 0x0244, /* LATIN CAPITAL LETTER U BAR */ \
+	  0x0289, /* LATIN SMALL LETTER U BAR */ }, \
+	{ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */ \
+	  0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */ }, \
+	{ 0x016C, /* LATIN CAPITAL LETTER U WITH BREVE */ \
+	  0x016D, /* LATIN SMALL LETTER U WITH BREVE */ }, \
+	{ 0x01D3, /* LATIN CAPITAL LETTER U WITH CARON */ \
+	  0x01D4, /* LATIN SMALL LETTER U WITH CARON */ }, \
+	{ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */ \
+	  0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */ }, \
+	{ 0x1E76, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW */ \
+	  0x1E77, /* LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW */ }, \
+	{ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */ \
+	  0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */ }, \
+	{ 0x01D7, /* LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE */ \
+	  0x01D8, /* LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE */ }, \
+	{ 0x01D9, /* LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON */ \
+	  0x01DA, /* LATIN SMALL LETTER U WITH DIAERESIS AND CARON */ }, \
+	{ 0x01DB, /* LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE */ \
+	  0x01DC, /* LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE */ }, \
+	{ 0x01D5, /* LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON */ \
+	  0x01D6, /* LATIN SMALL LETTER U WITH DIAERESIS AND MACRON */ }, \
+	{ 0x1E72, /* LATIN CAPITAL LETTER U WITH DIAERESIS BELOW */ \
+	  0x1E73, /* LATIN SMALL LETTER U WITH DIAERESIS BELOW */ }, \
+	{ 0x1EE4, /* LATIN CAPITAL LETTER U WITH DOT BELOW */ \
+	  0x1EE5, /* LATIN SMALL LETTER U WITH DOT BELOW */ }, \
+	{ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */ \
+	  0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */ }, \
+	{ 0x0214, /* LATIN CAPITAL LETTER U WITH DOUBLE GRAVE */ \
+	  0x0215, /* LATIN SMALL LETTER U WITH DOUBLE GRAVE */ }, \
+	{ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */ \
+	  0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */ }, \
+	{ 0x1EE6, /* LATIN CAPITAL LETTER U WITH HOOK ABOVE */ \
+	  0x1EE7, /* LATIN SMALL LETTER U WITH HOOK ABOVE */ }, \
+	{ 0x01AF, /* LATIN CAPITAL LETTER U WITH HORN */ \
+	  0x01B0, /* LATIN SMALL LETTER U WITH HORN */ }, \
+	{ 0x1EE8, /* LATIN CAPITAL LETTER U WITH HORN AND ACUTE */ \
+	  0x1EE9, /* LATIN SMALL LETTER U WITH HORN AND ACUTE */ }, \
+	{ 0x1EF0, /* LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW */ \
+	  0x1EF1, /* LATIN SMALL LETTER U WITH HORN AND DOT BELOW */ }, \
+	{ 0x1EEA, /* LATIN CAPITAL LETTER U WITH HORN AND GRAVE */ \
+	  0x1EEB, /* LATIN SMALL LETTER U WITH HORN AND GRAVE */ }, \
+	{ 0x1EEC, /* LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE */ \
+	  0x1EED, /* LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE */ }, \
+	{ 0x1EEE, /* LATIN CAPITAL LETTER U WITH HORN AND TILDE */ \
+	  0x1EEF, /* LATIN SMALL LETTER U WITH HORN AND TILDE */ }, \
+	{ 0x0216, /* LATIN CAPITAL LETTER U WITH INVERTED BREVE */ \
+	  0x0217, /* LATIN SMALL LETTER U WITH INVERTED BREVE */ }, \
+	{ 0x016A, /* LATIN CAPITAL LETTER U WITH MACRON */ \
+	  0x016B, /* LATIN SMALL LETTER U WITH MACRON */ }, \
+	{ 0x1E7A, /* LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS */ \
+	  0x1E7B, /* LATIN SMALL LETTER U WITH MACRON AND DIAERESIS */ }, \
+	{ 0x0172, /* LATIN CAPITAL LETTER U WITH OGONEK */ \
+	  0x0173, /* LATIN SMALL LETTER U WITH OGONEK */ }, \
+	{ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */ \
+	  0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */ }, \
+	{ 0xA7B8, /* LATIN CAPITAL LETTER U WITH STROKE */ \
+	  0xA7B9, /* LATIN SMALL LETTER U WITH STROKE */ }, \
+	{ 0x0168, /* LATIN CAPITAL LETTER U WITH TILDE */ \
+	  0x0169, /* LATIN SMALL LETTER U WITH TILDE */ }, \
+	{ 0x1E78, /* LATIN CAPITAL LETTER U WITH TILDE AND ACUTE */ \
+	  0x1E79, /* LATIN SMALL LETTER U WITH TILDE AND ACUTE */ }, \
+	{ 0x1E74, /* LATIN CAPITAL LETTER U WITH TILDE BELOW */ \
+	  0x1E75, /* LATIN SMALL LETTER U WITH TILDE BELOW */ }, \
+	{ 0x01B1, /* LATIN CAPITAL LETTER UPSILON */ \
+	  0x028A, /* LATIN SMALL LETTER UPSILON */ }, \
+	{ 0x0056, /* LATIN CAPITAL LETTER V */ \
+	  0x0076, /* LATIN SMALL LETTER V */ }, \
+	{ 0xA75E, /* LATIN CAPITAL LETTER V WITH DIAGONAL STROKE */ \
+	  0xA75F, /* LATIN SMALL LETTER V WITH DIAGONAL STROKE */ }, \
+	{ 0x1E7E, /* LATIN CAPITAL LETTER V WITH DOT BELOW */ \
+	  0x1E7F, /* LATIN SMALL LETTER V WITH DOT BELOW */ }, \
+	{ 0x01B2, /* LATIN CAPITAL LETTER V WITH HOOK */ \
+	  0x028B, /* LATIN SMALL LETTER V WITH HOOK */ }, \
+	{ 0x1E7C, /* LATIN CAPITAL LETTER V WITH TILDE */ \
+	  0x1E7D, /* LATIN SMALL LETTER V WITH TILDE */ }, \
+	{ 0xA768, /* LATIN CAPITAL LETTER VEND */ \
+	  0xA769, /* LATIN SMALL LETTER VEND */ }, \
+	{ 0xA762, /* LATIN CAPITAL LETTER VISIGOTHIC Z */ \
+	  0xA763, /* LATIN SMALL LETTER VISIGOTHIC Z */ }, \
+	{ 0xA79A, /* LATIN CAPITAL LETTER VOLAPUK AE */ \
+	  0xA79B, /* LATIN SMALL LETTER VOLAPUK AE */ }, \
+	{ 0xA79C, /* LATIN CAPITAL LETTER VOLAPUK OE */ \
+	  0xA79D, /* LATIN SMALL LETTER VOLAPUK OE */ }, \
+	{ 0xA79E, /* LATIN CAPITAL LETTER VOLAPUK UE */ \
+	  0xA79F, /* LATIN SMALL LETTER VOLAPUK UE */ }, \
+	{ 0xA760, /* LATIN CAPITAL LETTER VY */ \
+	  0xA761, /* LATIN SMALL LETTER VY */ }, \
+	{ 0x0057, /* LATIN CAPITAL LETTER W */ \
+	  0x0077, /* LATIN SMALL LETTER W */ }, \
+	{ 0x1E82, /* LATIN CAPITAL LETTER W WITH ACUTE */ \
+	  0x1E83, /* LATIN SMALL LETTER W WITH ACUTE */ }, \
+	{ 0x0174, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */ \
+	  0x0175, /* LATIN SMALL LETTER W WITH CIRCUMFLEX */ }, \
+	{ 0x1E84, /* LATIN CAPITAL LETTER W WITH DIAERESIS */ \
+	  0x1E85, /* LATIN SMALL LETTER W WITH DIAERESIS */ }, \
+	{ 0x1E86, /* LATIN CAPITAL LETTER W WITH DOT ABOVE */ \
+	  0x1E87, /* LATIN SMALL LETTER W WITH DOT ABOVE */ }, \
+	{ 0x1E88, /* LATIN CAPITAL LETTER W WITH DOT BELOW */ \
+	  0x1E89, /* LATIN SMALL LETTER W WITH DOT BELOW */ }, \
+	{ 0x1E80, /* LATIN CAPITAL LETTER W WITH GRAVE */ \
+	  0x1E81, /* LATIN SMALL LETTER W WITH GRAVE */ }, \
+	{ 0x2C72, /* LATIN CAPITAL LETTER W WITH HOOK */ \
+	  0x2C73, /* LATIN SMALL LETTER W WITH HOOK */ }, \
+	{ 0x0058, /* LATIN CAPITAL LETTER X */ \
+	  0x0078, /* LATIN SMALL LETTER X */ }, \
+	{ 0x1E8C, /* LATIN CAPITAL LETTER X WITH DIAERESIS */ \
+	  0x1E8D, /* LATIN SMALL LETTER X WITH DIAERESIS */ }, \
+	{ 0x1E8A, /* LATIN CAPITAL LETTER X WITH DOT ABOVE */ \
+	  0x1E8B, /* LATIN SMALL LETTER X WITH DOT ABOVE */ }, \
+	{ 0x0059, /* LATIN CAPITAL LETTER Y */ \
+	  0x0079, /* LATIN SMALL LETTER Y */ }, \
+	{ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */ \
+	  0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */ }, \
+	{ 0x0176, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */ \
+	  0x0177, /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */ }, \
+	{ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */ \
+	  0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */ }, \
+	{ 0x1E8E, /* LATIN CAPITAL LETTER Y WITH DOT ABOVE */ \
+	  0x1E8F, /* LATIN SMALL LETTER Y WITH DOT ABOVE */ }, \
+	{ 0x1EF4, /* LATIN CAPITAL LETTER Y WITH DOT BELOW */ \
+	  0x1EF5, /* LATIN SMALL LETTER Y WITH DOT BELOW */ }, \
+	{ 0x1EF2, /* LATIN CAPITAL LETTER Y WITH GRAVE */ \
+	  0x1EF3, /* LATIN SMALL LETTER Y WITH GRAVE */ }, \
+	{ 0x01B3, /* LATIN CAPITAL LETTER Y WITH HOOK */ \
+	  0x01B4, /* LATIN SMALL LETTER Y WITH HOOK */ }, \
+	{ 0x1EF6, /* LATIN CAPITAL LETTER Y WITH HOOK ABOVE */ \
+	  0x1EF7, /* LATIN SMALL LETTER Y WITH HOOK ABOVE */ }, \
+	{ 0x1EFE, /* LATIN CAPITAL LETTER Y WITH LOOP */ \
+	  0x1EFF, /* LATIN SMALL LETTER Y WITH LOOP */ }, \
+	{ 0x0232, /* LATIN CAPITAL LETTER Y WITH MACRON */ \
+	  0x0233, /* LATIN SMALL LETTER Y WITH MACRON */ }, \
+	{ 0x024E, /* LATIN CAPITAL LETTER Y WITH STROKE */ \
+	  0x024F, /* LATIN SMALL LETTER Y WITH STROKE */ }, \
+	{ 0x1EF8, /* LATIN CAPITAL LETTER Y WITH TILDE */ \
+	  0x1EF9, /* LATIN SMALL LETTER Y WITH TILDE */ }, \
+	{ 0x021C, /* LATIN CAPITAL LETTER YOGH */ \
+	  0x021D, /* LATIN SMALL LETTER YOGH */ }, \
+	{ 0x005A, /* LATIN CAPITAL LETTER Z */ \
+	  0x007A, /* LATIN SMALL LETTER Z */ }, \
+	{ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */ \
+	  0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */ }, \
+	{ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */ \
+	  0x017E, /* LATIN SMALL LETTER Z WITH CARON */ }, \
+	{ 0x1E90, /* LATIN CAPITAL LETTER Z WITH CIRCUMFLEX */ \
+	  0x1E91, /* LATIN SMALL LETTER Z WITH CIRCUMFLEX */ }, \
+	{ 0x2C6B, /* LATIN CAPITAL LETTER Z WITH DESCENDER */ \
+	  0x2C6C, /* LATIN SMALL LETTER Z WITH DESCENDER */ }, \
+	{ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */ \
+	  0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */ }, \
+	{ 0x1E92, /* LATIN CAPITAL LETTER Z WITH DOT BELOW */ \
+	  0x1E93, /* LATIN SMALL LETTER Z WITH DOT BELOW */ }, \
+	{ 0x0224, /* LATIN CAPITAL LETTER Z WITH HOOK */ \
+	  0x0225, /* LATIN SMALL LETTER Z WITH HOOK */ }, \
+	{ 0x1E94, /* LATIN CAPITAL LETTER Z WITH LINE BELOW */ \
+	  0x1E95, /* LATIN SMALL LETTER Z WITH LINE BELOW */ }, \
+	{ 0x01B5, /* LATIN CAPITAL LETTER Z WITH STROKE */ \
+	  0x01B6, /* LATIN SMALL LETTER Z WITH STROKE */ }, \
+	{ 0x2C7F, /* LATIN CAPITAL LETTER Z WITH SWASH TAIL */ \
+	  0x0240, /* LATIN SMALL LETTER Z WITH SWASH TAIL */ }, \
+	{ 0x0000, /* END OF LIST CAPITAL LETTERS */ \
+	  0x0000, /* END OF LIST SMALL LETTERS */ }, \
+}
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 12/15] lib: charset: upper/lower case conversion
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (10 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 11/15] efi_loader: capitalization table Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 13/15] test: tests for utf_to_lower() utf_to_upper() Heinrich Schuchardt
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Provide functions for upper and lower case conversion.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 include/charset.h | 16 ++++++++++++++++
 lib/charset.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/include/charset.h b/include/charset.h
index 1c0976dde6..6fdfaa9579 100644
--- a/include/charset.h
+++ b/include/charset.h
@@ -142,6 +142,22 @@ int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count);
  */
 #define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX)
 
+/**
+ * utf_to_lower() - convert a Unicode letter to lower case
+ *
+ * @code:		letter to convert
+ * Return:		lower case letter or unchanged letter
+ */
+s32 utf_to_lower(const s32 code);
+
+/**
+ * utf_to_upper() - convert a Unicode letter to upper case
+ *
+ * @code:		letter to convert
+ * Return:		upper case letter or unchanged letter
+ */
+s32 utf_to_upper(const s32 code);
+
 /**
  * u16_strlen - count non-zero words
  *
diff --git a/lib/charset.c b/lib/charset.c
index d2c723be3c..6531b56e85 100644
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -6,8 +6,12 @@
  */
 
 #include <charset.h>
+#include <capitalization.h>
 #include <malloc.h>
 
+static struct capitalization_table capitalization_table[] =
+	UNICODE_CAPITALIZATION_TABLE;
+
 s32 utf8_get(const char **src)
 {
 	s32 code = 0;
@@ -248,6 +252,43 @@ int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
 	return 0;
 }
 
+s32 utf_to_lower(const s32 code)
+{
+	struct capitalization_table *entry;
+
+	/* Values up to 0x7f (including negative ones) bypass the table */
+	if (code <= 0x7f) {
+		if (code < 'A' || code > 'Z')
+			return code;
+		return code + 0x20;
+	}
+
+	/* The table ends with an entry whose upper case code is zero */
+	for (entry = capitalization_table; entry->upper; ++entry) {
+		if (entry->upper == code)
+			return entry->lower;
+	}
+	/* No table entry: the code is returned unchanged */
+	return code;
+}
+
+s32 utf_to_upper(const s32 code)
+{
+	struct capitalization_table *entry;
+
+	/* Values up to 0x7f (including negative ones) bypass the table */
+	if (code <= 0x7f) {
+		if (code < 'a' || code > 'z')
+			return code;
+		return code - 0x20;
+	}
+
+	/* The table ends with an entry whose lower case code is zero */
+	for (entry = capitalization_table; entry->lower; ++entry) {
+		if (entry->lower == code)
+			return entry->upper;
+	}
+	/* No table entry: the code is returned unchanged */
+	return code;
+}
 
 size_t u16_strlen(const u16 *in)
 {
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 13/15] test: tests for utf_to_lower() utf_to_upper().
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (11 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 12/15] lib: charset: upper/lower case conversion Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
  2018-08-11 15:28 ` [U-Boot] [PATCH 15/15] efi_selftest: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
  14 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Provide unit tests for utf_to_lower() and utf_to_upper().

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 test/unicode_ut.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/test/unicode_ut.c b/test/unicode_ut.c
index 8e8c4d189e..ae14fb0eff 100644
--- a/test/unicode_ut.c
+++ b/test/unicode_ut.c
@@ -483,6 +483,42 @@ int ut_utf16_utf8_strncpy(void)
 	return 0;
 }
 
+/**
+ * ut_utf_to_lower() - unit test for utf_to_lower()
+ *
+ * Checks the borders of the ASCII capital letter range, an ASCII letter that
+ * is already lower case, and one letter outside of the ASCII range.
+ *
+ * Return:	0 if all checks pass, 1 otherwise
+ */
+int ut_utf_to_lower(void)
+{
+	/* '@' directly precedes 'A' and must not be changed */
+	if (utf_to_lower('@') != '@')
+		return 1;
+	if (utf_to_lower('A') != 'a')
+		return 1;
+	if (utf_to_lower('Z') != 'z')
+		return 1;
+	/* '[' directly follows 'Z' and must not be changed */
+	if (utf_to_lower('[') != '[')
+		return 1;
+	if (utf_to_lower('m') != 'm')
+		return 1;
+	/* Cyrillic letter I */
+	if (utf_to_lower(0x0418) != 0x0438)
+		return 1;
+	return 0;
+}
+
+/**
+ * ut_utf_to_upper() - unit test for utf_to_upper()
+ *
+ * Checks the borders of the ASCII small letter range, an ASCII letter that
+ * is already upper case, and one letter outside of the ASCII range.
+ *
+ * Return:	0 if all checks pass, 1 otherwise
+ */
+int ut_utf_to_upper(void)
+{
+	if (utf_to_upper('M') != 'M')
+		return 1;
+	/* '`' directly precedes 'a' and must not be changed */
+	if (utf_to_upper('`') != '`')
+		return 1;
+	if (utf_to_upper('a') != 'A')
+		return 1;
+	if (utf_to_upper('z') != 'Z')
+		return 1;
+	/* '{' directly follows 'z' and must not be changed */
+	if (utf_to_upper('{') != '{')
+		return 1;
+	/* Cyrillic letter I */
+	if (utf_to_upper(0x0438) != 0x0418)
+		return 1;
+	return 0;
+}
+
 int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 {
 	int ret = 0;
@@ -500,6 +536,8 @@ int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[])
 	ret |= ut_utf16_utf8_strnlen();
 	ret |= ut_utf16_utf8_strcpy();
 	ret |= ut_utf16_utf8_strncpy();
+	ret |= ut_utf_to_lower();
+	ret |= ut_utf_to_upper();
 
 	printf("Test %s\n", ret ? "failed" : "passed");
 
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (12 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 13/15] test: tests for utf_to_lower() utf_to_upper() Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  2018-08-26 18:31   ` Alexander Graf
  2018-08-11 15:28 ` [U-Boot] [PATCH 15/15] efi_selftest: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
  14 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

The patch implements the EFI_UNICODE_COLLATION_PROTOCOL.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 MAINTAINERS                            |   2 +
 include/cp1250.h                       |  40 +++++
 include/cp437.h                        |  40 +++++
 include/efi_api.h                      |  21 +++
 include/efi_loader.h                   |   5 +
 lib/efi_loader/Makefile                |  18 +-
 lib/efi_loader/efi_boottime.c          |   6 +
 lib/efi_loader/efi_unicode_collation.c | 218 +++++++++++++++++++++++++
 8 files changed, 346 insertions(+), 4 deletions(-)
 create mode 100644 include/cp1250.h
 create mode 100644 include/cp437.h
 create mode 100644 lib/efi_loader/efi_unicode_collation.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a543309f2..33d5d545bd 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -369,6 +369,8 @@ F:	doc/README.uefi
 F:	doc/README.iscsi
 F:	Documentation/efi.rst
 F:	include/capitalization.h
+F:	include/cp1250.h
+F:	include/cp437.h
 F:	include/efi*
 F:	include/pe.h
 F:	include/asm-generic/pe.h
diff --git a/include/cp1250.h b/include/cp1250.h
new file mode 100644
index 0000000000..adacf8a958
--- /dev/null
+++ b/include/cp1250.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+/*
+ * Constant CP1250 contains the Unicode code points for characters 0x80 - 0xff
+ * of the code page 1250.
+ */
+#define CP1250 { \
+	0x20ac, 0x0000, 0x201a, 0x0000, \
+	0x201e, 0x2026, 0x2020, 0x2021, \
+	0x0000, 0x2030, 0x0160, 0x2039, \
+	0x015a, 0x0164, 0x017d, 0x0179, \
+	0x0000, 0x2018, 0x2019, 0x201c, \
+	0x201d, 0x2022, 0x2013, 0x2014, \
+	0x0000, 0x2122, 0x0161, 0x203a, \
+	0x015b, 0x0165, 0x017e, 0x017a, \
+	0x00a0, 0x02c7, 0x02d8, 0x0141, \
+	0x00a4, 0x0104, 0x00a6, 0x00a7, \
+	0x00a8, 0x00a9, 0x015e, 0x00ab, \
+	0x00ac, 0x00ad, 0x00ae, 0x017b, \
+	0x00b0, 0x00b1, 0x02db, 0x0142, \
+	0x00b4, 0x00b5, 0x00b6, 0x00b7, \
+	0x00b8, 0x0105, 0x015f, 0x00bb, \
+	0x013d, 0x02dd, 0x013e, 0x017c, \
+	0x0154, 0x00c1, 0x00c2, 0x0102, \
+	0x00c4, 0x0139, 0x0106, 0x00c7, \
+	0x010c, 0x00c9, 0x0118, 0x00cb, \
+	0x011a, 0x00cd, 0x00ce, 0x010e, \
+	0x0110, 0x0143, 0x0147, 0x00d3, \
+	0x00d4, 0x0150, 0x00d6, 0x00d7, \
+	0x0158, 0x016e, 0x00da, 0x0170, \
+	0x00dc, 0x00dd, 0x0162, 0x00df, \
+	0x0155, 0x00e1, 0x00e2, 0x0103, \
+	0x00e4, 0x013a, 0x0107, 0x00e7, \
+	0x010d, 0x00e9, 0x0119, 0x00eb, \
+	0x011b, 0x00ed, 0x00ee, 0x010f, \
+	0x0111, 0x0144, 0x0148, 0x00f3, \
+	0x00f4, 0x0151, 0x00f6, 0x00f7, \
+	0x0159, 0x016f, 0x00fa, 0x0171, \
+	0x00fc, 0x00fd, 0x0163, 0x02d9, \
+}
diff --git a/include/cp437.h b/include/cp437.h
new file mode 100644
index 0000000000..0b2b97132e
--- /dev/null
+++ b/include/cp437.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+/*
+ * Constant CP437 contains the Unicode code points for characters 0x80 - 0xff
+ * of the code page 437.
+ */
+#define CP437 { \
+	0x00c7, 0x00fc, 0x00e9, 0x00e2, \
+	0x00e4, 0x00e0, 0x00e5, 0x00e7, \
+	0x00ea, 0x00eb, 0x00e8, 0x00ef, \
+	0x00ee, 0x00ec, 0x00c4, 0x00c5, \
+	0x00c9, 0x00e6, 0x00c6, 0x00f4, \
+	0x00f6, 0x00f2, 0x00fb, 0x00f9, \
+	0x00ff, 0x00d6, 0x00dc, 0x00a2, \
+	0x00a3, 0x00a5, 0x20a7, 0x0192, \
+	0x00e1, 0x00ed, 0x00f3, 0x00fa, \
+	0x00f1, 0x00d1, 0x00aa, 0x00ba, \
+	0x00bf, 0x2310, 0x00ac, 0x00bd, \
+	0x00bc, 0x00a1, 0x00ab, 0x00bb, \
+	0x2591, 0x2592, 0x2593, 0x2502, \
+	0x2524, 0x2561, 0x2562, 0x2556, \
+	0x2555, 0x2563, 0x2551, 0x2557, \
+	0x255d, 0x255c, 0x255b, 0x2510, \
+	0x2514, 0x2534, 0x252c, 0x251c, \
+	0x2500, 0x253c, 0x255e, 0x255f, \
+	0x255a, 0x2554, 0x2569, 0x2566, \
+	0x2560, 0x2550, 0x256c, 0x2567, \
+	0x2568, 0x2564, 0x2565, 0x2559, \
+	0x2558, 0x2552, 0x2553, 0x256b, \
+	0x256a, 0x2518, 0x250c, 0x2588, \
+	0x2584, 0x258c, 0x2590, 0x2580, \
+	0x03b1, 0x00df, 0x0393, 0x03c0, \
+	0x03a3, 0x03c3, 0x00b5, 0x03c4, \
+	0x03a6, 0x0398, 0x03a9, 0x03b4, \
+	0x221e, 0x03c6, 0x03b5, 0x2229, \
+	0x2261, 0x00b1, 0x2265, 0x2264, \
+	0x2320, 0x2321, 0x00f7, 0x2248, \
+	0x00b0, 0x2219, 0x00b7, 0x221a, \
+	0x207f, 0x00b2, 0x25a0, 0x00a0, \
+}
diff --git a/include/efi_api.h b/include/efi_api.h
index ebf2a3bc18..1efc448184 100644
--- a/include/efi_api.h
+++ b/include/efi_api.h
@@ -31,6 +31,7 @@ enum efi_timer_delay {
 	EFI_TIMER_RELATIVE = 2
 };
 
+#define efi_intn_t ssize_t
 #define efi_uintn_t size_t
 typedef uint16_t *efi_string_t;
 
@@ -958,4 +959,24 @@ struct efi_driver_binding_protocol {
 	efi_handle_t driver_binding_handle;
 };
 
+#define EFI_UNICODE_COLLATION_PROTOCOL2_GUID \
+	EFI_GUID(0xa4c751fc, 0x23ae, 0x4c3e, \
+		 0x92, 0xe9, 0x49, 0x64, 0xcf, 0x63, 0xf3, 0x49)
+/**
+ * struct efi_unicode_collation_protocol - Unicode collation protocol
+ *
+ * @stri_coll:		compare two strings ignoring case
+ * @metai_match:	match a string against a pattern
+ * @str_lwr:		convert a string to lower case in place
+ * @str_upr:		convert a string to upper case in place
+ * @fat_to_str:		convert a FAT (OEM code page) string to UTF-16
+ * @str_to_fat:		convert a UTF-16 string to a FAT (OEM code page)
+ *			string
+ * @supported_languages:	languages supported by the protocol instance
+ */
+struct efi_unicode_collation_protocol {
+	efi_intn_t (EFIAPI *stri_coll)(
+		struct efi_unicode_collation_protocol *this, u16 *s1, u16 *s2);
+	bool (EFIAPI *metai_match)(struct efi_unicode_collation_protocol *this,
+				   const u16 *string, const u16 *pattern);
+	void (EFIAPI *str_lwr)(struct efi_unicode_collation_protocol
+			       *this, u16 *string);
+	void (EFIAPI *str_upr)(struct efi_unicode_collation_protocol *this,
+			       u16 *string);
+	void (EFIAPI *fat_to_str)(struct efi_unicode_collation_protocol *this,
+				  efi_uintn_t fat_size, char *fat, u16 *string);
+	bool (EFIAPI *str_to_fat)(struct efi_unicode_collation_protocol *this,
+				  const u16 *string, efi_uintn_t fat_size,
+				  char *fat);
+	char *supported_languages;
+};
+
 #endif
diff --git a/include/efi_loader.h b/include/efi_loader.h
index f162adfff7..c5ae7c3c36 100644
--- a/include/efi_loader.h
+++ b/include/efi_loader.h
@@ -98,6 +98,9 @@ extern const struct efi_device_path_to_text_protocol efi_device_path_to_text;
 /* implementation of the EFI_DEVICE_PATH_UTILITIES_PROTOCOL */
 extern const struct efi_device_path_utilities_protocol
 					efi_device_path_utilities;
+/* Implementation of the EFI_UNICODE_COLLATION_PROTOCOL */
+extern const struct efi_unicode_collation_protocol
+					efi_unicode_collation_protocol;
 
 uint16_t *efi_dp_str(struct efi_device_path *dp);
 
@@ -127,6 +130,8 @@ extern const efi_guid_t efi_file_info_guid;
 /* GUID for file system information */
 extern const efi_guid_t efi_file_system_info_guid;
 extern const efi_guid_t efi_guid_device_path_utilities_protocol;
+/* GUID of the Unicode collation protocol */
+extern const efi_guid_t efi_guid_unicode_collation_protocol;
 
 extern unsigned int __efi_runtime_start, __efi_runtime_stop;
 extern unsigned int __efi_runtime_rel_start, __efi_runtime_rel_stop;
diff --git a/lib/efi_loader/Makefile b/lib/efi_loader/Makefile
index 1ffbf52a89..0769df20f7 100644
--- a/lib/efi_loader/Makefile
+++ b/lib/efi_loader/Makefile
@@ -17,10 +17,20 @@ always += helloworld.efi
 endif
 
 obj-$(CONFIG_CMD_BOOTEFI_HELLO) += helloworld_efi.o
-obj-y += efi_image_loader.o efi_boottime.o efi_runtime.o efi_console.o
-obj-y += efi_memory.o efi_device_path_to_text.o efi_device_path.o
-obj-y += efi_device_path_utilities.o efi_file.o efi_variable.o efi_bootmgr.o
-obj-y += efi_watchdog.o
+obj-y += \
+efi_bootmgr.o \
+efi_boottime.o \
+efi_console.o \
+efi_device_path.o \
+efi_device_path_to_text.o \
+efi_device_path_utilities.o \
+efi_file.o \
+efi_image_loader.o \
+efi_memory.o \
+efi_runtime.o \
+efi_unicode_collation.o \
+efi_variable.o \
+efi_watchdog.o
 obj-$(CONFIG_LCD) += efi_gop.o
 obj-$(CONFIG_DM_VIDEO) += efi_gop.o
 obj-$(CONFIG_PARTITIONS) += efi_disk.o
diff --git a/lib/efi_loader/efi_boottime.c b/lib/efi_loader/efi_boottime.c
index 618e8a8d8c..8a0701e828 100644
--- a/lib/efi_loader/efi_boottime.c
+++ b/lib/efi_loader/efi_boottime.c
@@ -1525,6 +1525,12 @@ efi_status_t efi_setup_loaded_image(
 	if (ret != EFI_SUCCESS)
 		goto failure;
 
+	ret = efi_add_protocol(obj->handle,
+			       &efi_guid_unicode_collation_protocol,
+			       (void *)&efi_unicode_collation_protocol);
+	if (ret != EFI_SUCCESS)
+		goto failure;
+
 	return ret;
 failure:
 	printf("ERROR: Failure to install protocols for loaded image\n");
diff --git a/lib/efi_loader/efi_unicode_collation.c b/lib/efi_loader/efi_unicode_collation.c
new file mode 100644
index 0000000000..9a16b6a8a0
--- /dev/null
+++ b/lib/efi_loader/efi_unicode_collation.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * EFI Unicode collation protocol
+ *
+ * Copyright (c) 2018 Heinrich Schuchardt <xypron.glpk@gmx.de>
+ */
+
+#include <common.h>
+#include <charset.h>
+#include <cp1250.h>
+#include <cp437.h>
+#include <efi_loader.h>
+
+/* Characters that may not be used in file names */
+static const char illegal[] = "<>:\"/\\|?*";
+
+/*
+ * EDK2 assumes codepage 1250 when creating FAT 8.3 file names.
+ * Linux defaults to codepage 437 for FAT 8.3 file names.
+ */
+#if CONFIG_FAT_DEFAULT_CODEPAGE == 1250
+/* Unicode code points for code page 1250 characters 0x80 - 0xff */
+static const u16 codepage[] = CP1250;
+#else
+/* Unicode code points for code page 437 characters 0x80 - 0xff */
+static const u16 codepage[] = CP437;
+#endif
+
+const efi_guid_t efi_guid_unicode_collation_protocol =
+	EFI_UNICODE_COLLATION_PROTOCOL2_GUID;
+
+/**
+ * efi_stri_coll() - compare two UTF-16 strings ignoring case
+ *
+ * Both strings are compared word by word after conversion to lower case.
+ *
+ * @this:	unicode collation protocol instance
+ * @s1:		first string
+ * @s2:		second string
+ * Return:	0 if the strings are equal, -1 if s1 < s2, 1 if s1 > s2
+ */
+static efi_intn_t EFIAPI efi_stri_coll(
+		struct efi_unicode_collation_protocol *this, u16 *s1, u16 *s2)
+{
+	efi_intn_t result = 0;
+
+	EFI_ENTRY("%p, %ls, %ls", this, s1, s2);
+	/* Continue until both strings have reached their terminator */
+	while (*s1 || *s2) {
+		s32 lhs = utf_to_lower(*s1++);
+		s32 rhs = utf_to_lower(*s2++);
+
+		if (lhs != rhs) {
+			result = lhs < rhs ? -1 : 1;
+			break;
+		}
+	}
+	EFI_EXIT(EFI_SUCCESS);
+	return result;
+}
+
+/**
+ * metai_match() - match a string against a pattern ignoring case
+ *
+ * Supported pattern elements:
+ *
+ * - an asterisk matches zero or more characters
+ * - a question mark matches any single character
+ * - [abc] matches any single character of the set
+ * - [a-z] matches any single character of the range
+ *
+ * All other pattern characters have to match literally. Characters are
+ * compared after conversion to lower case. The function calls itself to
+ * resolve asterisks.
+ *
+ * @s:		string to match
+ * @p:		pattern
+ * Return:	true if the complete string matches the complete pattern
+ */
+static bool metai_match(const u16 *s, const u16 *p)
+{
+	s32 c, first;
+
+	for (; *p; ++s, ++p) {
+		if (*p == '*') {
+			/*
+			 * Match 0 or more characters: try the rest of the
+			 * pattern at every remaining position, including the
+			 * end of the string.
+			 */
+			++p;
+			for (;; ++s) {
+				if (metai_match(s, p))
+					return true;
+				if (!*s)
+					return false;
+			}
+		}
+		/* Every other pattern element consumes one character */
+		if (!*s)
+			return false;
+		c = utf_to_lower(*s);
+		switch (*p) {
+		case '?':
+			/* Match any one character */
+			break;
+		case '[':
+			/* Match any character in the set */
+			++p;
+			/* Reject an empty or unterminated set */
+			if (!*p || *p == ']')
+				return false;
+			first = utf_to_lower(*p);
+			++p;
+			if (*p == '-') {
+				/* Range */
+				++p;
+				if (!*p || c < first || c > utf_to_lower(*p))
+					return false;
+				++p;
+				if (*p != ']')
+					return false;
+			} else {
+				/* Set */
+				bool hit = c == first;
+
+				for (; *p && *p != ']'; ++p) {
+					if (utf_to_lower(*p) == c)
+						hit = true;
+				}
+				if (!hit || *p != ']')
+					return false;
+			}
+			break;
+		default:
+			/* Match one character */
+			if (utf_to_lower(*p) != c)
+				return false;
+		}
+	}
+	/* Pattern exhausted: the string must be exhausted too */
+	return !*s;
+}
+
+/**
+ * efi_metai_match() - protocol entry point for pattern matching
+ *
+ * Wraps metai_match() into the EFI_ENTRY() / EFI_EXIT() bracket.
+ *
+ * @this:	unicode collation protocol instance
+ * @string:	string to match
+ * @pattern:	pattern
+ * Return:	result of metai_match()
+ */
+static bool EFIAPI efi_metai_match(struct efi_unicode_collation_protocol *this,
+				   const u16 *string, const u16 *pattern)
+{
+	bool matched;
+
+	EFI_ENTRY("%p, %ls, %ls", this, string, pattern);
+	matched = metai_match(string, pattern);
+	EFI_EXIT(EFI_SUCCESS);
+	return matched;
+}
+
+/**
+ * efi_str_lwr() - convert a UTF-16 string to lower case in place
+ *
+ * @this:	unicode collation protocol instance
+ * @string:	NUL terminated string, overwritten with the result
+ */
+static void EFIAPI efi_str_lwr(struct efi_unicode_collation_protocol *this,
+			       u16 *string)
+{
+	EFI_ENTRY("%p, %ls", this, string);
+	while (*string) {
+		*string = utf_to_lower(*string);
+		++string;
+	}
+	EFI_EXIT(EFI_SUCCESS);
+}
+
+/**
+ * efi_str_upr() - convert a UTF-16 string to upper case in place
+ *
+ * @this:	unicode collation protocol instance
+ * @string:	NUL terminated string, overwritten with the result
+ */
+static void EFIAPI efi_str_upr(struct efi_unicode_collation_protocol *this,
+			       u16 *string)
+{
+	EFI_ENTRY("%p, %ls", this, string);
+	while (*string) {
+		*string = utf_to_upper(*string);
+		++string;
+	}
+	EFI_EXIT(EFI_SUCCESS);
+}
+
+/**
+ * efi_fat_to_str() - convert a string in the OEM code page to UTF-16
+ *
+ * Bytes below 0x80 are copied unchanged. Bytes 0x80 - 0xff are translated via
+ * the code page table. Conversion stops after @fat_size bytes or at the first
+ * character that converts to NUL. The result is always NUL terminated, so
+ * @string needs room for @fat_size + 1 u16 values.
+ *
+ * @this:	unicode collation protocol instance
+ * @fat_size:	maximum number of bytes of @fat to convert
+ * @fat:	string in the OEM code page
+ * @string:	buffer receiving the UTF-16 string
+ */
+static void EFIAPI efi_fat_to_str(struct efi_unicode_collation_protocol *this,
+				  efi_uintn_t fat_size, char *fat, u16 *string)
+{
+	efi_uintn_t i;
+	u16 c;
+
+	EFI_ENTRY("%p, %zu, %s, %p", this, fat_size, fat, string);
+	for (i = 0; i < fat_size; ++i) {
+		c = (unsigned char)fat[i];
+		/* The table starts at character 0x80: index by character */
+		if (c >= 0x80)
+			c = codepage[c - 0x80];
+		string[i] = c;
+		if (!c)
+			break;
+	}
+	string[i] = 0;
+	EFI_EXIT(EFI_SUCCESS);
+}
+
+/**
+ * efi_str_to_fat() - convert a UTF-16 string to the OEM code page
+ *
+ * Periods and spaces are dropped. The remaining characters are converted to
+ * upper case. Characters without a code page equivalent, control characters
+ * and characters that are illegal in file names are replaced by '_'.
+ * At most @fat_size bytes are written; the terminating NUL is only written
+ * if it fits.
+ *
+ * @this:	unicode collation protocol instance
+ * @string:	NUL terminated UTF-16 string
+ * @fat_size:	size of the buffer @fat in bytes
+ * @fat:	buffer receiving the converted string
+ * Return:	true if at least one character was replaced by '_'
+ */
+static bool EFIAPI efi_str_to_fat(struct efi_unicode_collation_protocol *this,
+				  const u16 *string, efi_uintn_t fat_size,
+				  char *fat)
+{
+	efi_uintn_t pos = 0;
+	bool replaced = false;
+	s32 c;
+
+	EFI_ENTRY("%p, %ls, %zu, %p", this, string, fat_size, fat);
+	while (pos < fat_size) {
+		c = utf16_get(&string);
+		/* Ignore period and space */
+		if (c == '.' || c == ' ')
+			continue;
+		c = utf_to_upper(c);
+		if (c >= 0x80) {
+			int idx = 0;
+
+			/* Look for codepage translation */
+			while (idx < 0x80 && codepage[idx] != c)
+				++idx;
+			if (idx < 0x80) {
+				c = idx + 0x80;
+			} else {
+				c = '_';
+				replaced = true;
+			}
+		} else if (c && (c < 0x20 || strchr(illegal, c))) {
+			c = '_';
+			replaced = true;
+		}
+
+		fat[pos] = c;
+		/* Stop once the terminating NUL has been stored */
+		if (!c)
+			break;
+		++pos;
+	}
+	EFI_EXIT(EFI_SUCCESS);
+	return replaced;
+}
+
+/* Protocol instance wiring up the handlers implemented in this file */
+const struct efi_unicode_collation_protocol efi_unicode_collation_protocol = {
+	.stri_coll = efi_stri_coll,
+	.metai_match = efi_metai_match,
+	.str_lwr = efi_str_lwr,
+	.str_upr = efi_str_upr,
+	.fat_to_str = efi_fat_to_str,
+	.str_to_fat = efi_str_to_fat,
+	.supported_languages = "en-US",
+};
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 15/15] efi_selftest: EFI_UNICODE_COLLATION_PROTOCOL
  2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
                   ` (13 preceding siblings ...)
  2018-08-11 15:28 ` [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
@ 2018-08-11 15:28 ` Heinrich Schuchardt
  14 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-11 15:28 UTC (permalink / raw
  To: u-boot

Provide a unit test for the EFI_UNICODE_COLLATION_PROTOCOL.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
---
 lib/efi_selftest/Makefile                     |   1 +
 .../efi_selftest_unicode_collation.c          | 257 ++++++++++++++++++
 2 files changed, 258 insertions(+)
 create mode 100644 lib/efi_selftest/efi_selftest_unicode_collation.c

diff --git a/lib/efi_selftest/Makefile b/lib/efi_selftest/Makefile
index 590f90b16d..7ff879742f 100644
--- a/lib/efi_selftest/Makefile
+++ b/lib/efi_selftest/Makefile
@@ -30,6 +30,7 @@ efi_selftest_snp.o \
 efi_selftest_textinput.o \
 efi_selftest_textoutput.o \
 efi_selftest_tpl.o \
+efi_selftest_unicode_collation.o \
 efi_selftest_util.o \
 efi_selftest_variables.o \
 efi_selftest_watchdog.o
diff --git a/lib/efi_selftest/efi_selftest_unicode_collation.c b/lib/efi_selftest/efi_selftest_unicode_collation.c
new file mode 100644
index 0000000000..ed35b76e86
--- /dev/null
+++ b/lib/efi_selftest/efi_selftest_unicode_collation.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * efi_selftest_unicode_collation
+ *
+ * Copyright (c) 2018 Heinrich Schuchardt <xypron.glpk@gmx.de>
+ *
+ * Test unicode collation protocol.
+ */
+
+#include <efi_selftest.h>
+
+static const efi_guid_t unicode_collation_protocol_guid =
+	EFI_UNICODE_COLLATION_PROTOCOL2_GUID;
+
+static struct efi_boot_services *boottime;
+
+static struct efi_unicode_collation_protocol *unicode_collation_protocol;
+
+/**
+ * setup() - setup unit test.
+ *
+ * @handle:	handle of the loaded image
+ * @systable:	system table
+ * ReturnValue:	EFI_ST_SUCCESS for success
+ */
+static int setup(const efi_handle_t handle,
+		 const struct efi_system_table *systable)
+{
+	efi_status_t ret;
+
+	boottime = systable->boottime;
+
+	ret = boottime->locate_protocol(&unicode_collation_protocol_guid, NULL,
+					(void **)&unicode_collation_protocol);
+	if (ret != EFI_SUCCESS) {
+		unicode_collation_protocol = NULL;
+		efi_st_error("Unicode collation protocol is not available.\n");
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_stri_coll(void)
+{
+	efi_intn_t ret;
+	u16 c1[] = L"first";
+	u16 c2[] = L"FIRST";
+	u16 c3[] = L"second";
+
+	ret = unicode_collation_protocol->stri_coll(unicode_collation_protocol,
+						    c1, c2);
+	if (ret) {
+		efi_st_error("stri_coll(\"%ps\", \"%ps\") = %zu\n", c1, c2, ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->stri_coll(unicode_collation_protocol,
+						    c1, c3);
+	if (ret >= 0) {
+		efi_st_error("stri_coll(\"%ps\", \"%ps\") = %zu\n", c1, c3, ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->stri_coll(unicode_collation_protocol,
+						    c3, c1);
+	if (ret <= 0) {
+		efi_st_error("stri_coll(\"%ps\", \"%ps\") = %zu\n", c3, c1, ret);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_metai_match(void)
+{
+	bool ret;
+	const u16 c[] = L"Das U-Boot";
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"*");
+	if (!ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da[rstu] U-Boot");
+	if (!ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da[q-v] U-Boot");
+	if (!ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da? U-Boot");
+	if (!ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"D*Bo*t");
+	if (!ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da[xyz] U-Boot");
+	if (ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da[a-d] U-Boot");
+	if (ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"Da?? U-Boot");
+	if (ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	ret = unicode_collation_protocol->metai_match(
+		unicode_collation_protocol, c, L"D*Bo*tt");
+	if (ret) {
+		efi_st_error("metai_match returned %u\n", ret);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_str_lwr(void)
+{
+	u16 c[] = L"U-Boot";
+
+	unicode_collation_protocol->str_lwr(unicode_collation_protocol, c);
+	if (efi_st_strcmp_16_8(c, "u-boot")) {
+		efi_st_error("str_lwr returned \"%ps\"\n", c);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_str_upr(void)
+{
+	u16 c[] = L"U-Boot";
+
+	unicode_collation_protocol->str_upr(unicode_collation_protocol, c);
+	if (efi_st_strcmp_16_8(c, "U-BOOT")) {
+		efi_st_error("str_lwr returned \"%ps\"\n", c);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_fat_to_str(void)
+{
+	u16 str[16];
+
+	boottime->set_mem(str, sizeof(str), 0);
+	unicode_collation_protocol->fat_to_str(unicode_collation_protocol, 6,
+					       "U-BOOT", str);
+	if (efi_st_strcmp_16_8(str, "U-BOOT")) {
+		efi_st_error("fat_to_str returned \"%ps\"\n", str);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+static int test_str_to_fat(void)
+{
+	char fat[16];
+	bool ret;
+
+	boottime->set_mem(fat, sizeof(fat), 0);
+	ret = unicode_collation_protocol->str_to_fat(unicode_collation_protocol,
+						     L"U -Boo.t", 6, fat);
+	if (ret || efi_st_strcmp_16_8(L"U-BOOT", fat)) {
+		efi_st_error("str_to_fat returned %u, \"%s\"\n", ret, fat);
+		return EFI_ST_FAILURE;
+	}
+
+	boottime->set_mem(fat, 16, 0);
+	ret = unicode_collation_protocol->str_to_fat(unicode_collation_protocol,
+						     L"U\\Boot", 6, fat);
+	if (!ret || efi_st_strcmp_16_8(L"U_BOOT", fat)) {
+		efi_st_error("str_to_fat returned %u, \"%s\"\n", ret, fat);
+		return EFI_ST_FAILURE;
+	}
+
+	return EFI_ST_SUCCESS;
+}
+
+/**
+ * execute() - Execute unit test.
+ *
+ * ReturnValue:	EFI_ST_SUCCESS for success
+ */
+static int execute(void)
+{
+	int ret;
+
+	if (!unicode_collation_protocol) {
+		efi_st_printf("Unicode collation protocol missing\n");
+		return EFI_ST_FAILURE;
+	}
+
+	ret = test_stri_coll();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	ret = test_metai_match();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	ret = test_str_lwr();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	ret = test_str_upr();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	ret = test_fat_to_str();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	ret = test_str_to_fat();
+	if (ret != EFI_ST_SUCCESS)
+		return ret;
+
+	return EFI_ST_SUCCESS;
+}
+
+EFI_UNIT_TEST(unicoll) = {
+	.name = "unicode collation",
+	.phase = EFI_EXECUTE_BEFORE_BOOTTIME_EXIT,
+	.execute = execute,
+	.setup = setup,
+};
-- 
2.18.0

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 01/15] lib: build charset.o only if needed
  2018-08-11 15:28 ` [U-Boot] [PATCH 01/15] lib: build charset.o only if needed Heinrich Schuchardt
@ 2018-08-26 17:45   ` Alexander Graf
  2018-08-26 18:06     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 17:45 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> charset.o is only needed for the EFI subsystem
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  lib/Makefile   |  5 ++++-
>  lib/vsprintf.c | 12 ++++++++----
>  2 files changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/Makefile b/lib/Makefile
> index 5f583aed37..2fd32798a0 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -19,7 +19,10 @@ obj-$(CONFIG_ARCH_AT91) += at91/
>  obj-$(CONFIG_OPTEE) += optee/
>  
>  obj-$(CONFIG_AES) += aes.o
> -obj-y += charset.o
> +
> +ifndef API_BUILD
> +obj-$(CONFIG_EFI_LOADER) += charset.o
> +endif
>  obj-$(CONFIG_USB_TTY) += circbuf.o
>  obj-y += crc7.o
>  obj-y += crc8.o
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 914fbd30cb..6100357858 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -274,6 +274,8 @@ static char *string(char *buf, char *end, char *s, int field_width,
>  	return buf;
>  }
>  
> +#if defined(CONFIG_EFI_LOADER) && \
> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)

I believe you want #if CONFIG_IS_ENABLED(EFI_LOADER) here, right? We
probably should move to that at other places where we explicitly check
for SPL_BUILD too.

>  static char *string16(char *buf, char *end, u16 *s, int field_width,
>  		int precision, int flags)
>  {
> @@ -294,8 +296,6 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
>  	return buf;
>  }
>  
> -#if defined(CONFIG_EFI_LOADER) && \
> -	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
>  static char *device_path_string(char *buf, char *end, void *dp, int field_width,
>  				int precision, int flags)
>  {
> @@ -612,10 +612,14 @@ repeat:
>  			continue;
>  
>  		case 's':
> -			if (qualifier == 'l' && !IS_ENABLED(CONFIG_SPL_BUILD)) {
> +#if defined(CONFIG_EFI_LOADER) && \
> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)

This #if deserves a comment.


Alex

> +			if (qualifier == 'l') {
>  				str = string16(str, end, va_arg(args, u16 *),
>  					       field_width, precision, flags);
> -			} else {
> +			} else
> +#endif
> +			{
>  				str = string(str, end, va_arg(args, char *),
>  					     field_width, precision, flags);
>  			}
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-11 15:28 ` [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen Heinrich Schuchardt
@ 2018-08-26 17:52   ` Alexander Graf
  2018-08-26 18:21     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 17:52 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The function names utf16_strlen() and utf16_strnlen() are misnomers.
> The functions do not count utf-16 characters but non-zero words.
> So let's rename them to u16_strlen and u16_strnlen().
> 
> In utf16_dup() avoid assignment in if clause.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  include/charset.h             | 28 +++++++++++-----------------
>  lib/charset.c                 | 10 +++++++---
>  lib/efi_loader/efi_bootmgr.c  |  2 +-
>  lib/efi_loader/efi_console.c  |  2 +-
>  lib/efi_loader/efi_file.c     |  2 +-
>  lib/efi_loader/efi_variable.c |  2 +-
>  lib/vsprintf.c                |  2 +-
>  7 files changed, 23 insertions(+), 25 deletions(-)
> 
> diff --git a/include/charset.h b/include/charset.h
> index 11832cbd12..2307559890 100644
> --- a/include/charset.h
> +++ b/include/charset.h
> @@ -13,29 +13,23 @@
>  #define MAX_UTF8_PER_UTF16 3
>  
>  /**
> - * utf16_strlen() - Get the length of an utf16 string
> + * u16_strlen - count non-zero words

This really just implements wcslen() now, right?

>   *
> - * Returns the number of 16 bit characters in an utf16 string, not
> - * including the terminating NULL character.
> - *
> - * @in     the string to measure
> - * @return the string length
> + * @in:			utf-16 string

Is "in" really a utf-16 string? Probably rather a null-terminated string
of words.

> + * ReturnValue:		number of non-zero words.
> + *			This is not the number of utf-16 letters!
>   */
> -size_t utf16_strlen(const uint16_t *in);
> +size_t u16_strlen(const u16 *in);
>  
>  /**
> - * utf16_strnlen() - Get the length of a fixed-size utf16 string.
> - *
> - * Returns the number of 16 bit characters in an utf16 string,
> - * not including the terminating NULL character, but at most
> - * 'count' number of characters.  In doing this, utf16_strnlen()
> - * looks at only the first 'count' characters.
> + * u16_strlen - count non-zero words

This really just implements wcsnlen() now, right?

>   *
> - * @in     the string to measure
> - * @count  the maximum number of characters to count
> - * @return the string length, up to a maximum of 'count'
> + * @in:			utf-16 string

Same comment here.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode
  2018-08-11 15:28 ` [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode Heinrich Schuchardt
@ 2018-08-26 17:59   ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 17:59 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> utf8_get() - get next UTF-8 code point from buffer
> utf8_put() - write UTF-8 code point to buffer
> utf8_utf16_strnlen() - length of a utf-8 string after conversion to utf-16
> utf8_utf16_strncpy() - copy a utf-8 string to utf-16
> utf16_get() - get next UTF-16 code point from buffer
> utf16_put() - write UTF-16 code point to buffer
> utf16_utf8_strnlen() - length of a utf-16 string after conversion to utf-8
> utf16_utf8_strncpy() - copy a utf-16 string to utf-8
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>

Reviewed-by: Alexander Graf <agraf@suse.de>

I can't promise that all the conversion logic is correct though ;).


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions
  2018-08-11 15:28 ` [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions Heinrich Schuchardt
@ 2018-08-26 18:02   ` Alexander Graf
  2018-08-26 18:27     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:02 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> Provide unit tests for Unicode functions.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  MAINTAINERS           |   1 +
>  include/test/suites.h |   3 +-
>  test/Kconfig          |   8 +
>  test/Makefile         |   1 +
>  test/cmd_ut.c         |  14 +-
>  test/unicode_ut.c     | 470 ++++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 493 insertions(+), 4 deletions(-)
>  create mode 100644 test/unicode_ut.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 51a1472cf8..a324139471 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -374,6 +374,7 @@ F:	include/asm-generic/pe.h
>  F:	lib/charset.c
>  F:	lib/efi*/
>  F:	test/py/tests/test_efi*
> +F:	test/unicode_ut.c
>  F:	cmd/bootefi.c
>  F:	tools/file2include.c
>  
> diff --git a/include/test/suites.h b/include/test/suites.h
> index b5019a7cd2..8e4eac60eb 100644
> --- a/include/test/suites.h
> +++ b/include/test/suites.h
> @@ -23,10 +23,11 @@ struct unit_test;
>  int cmd_ut_category(const char *name, struct unit_test *tests, int n_ents,
>  		    int argc, char * const argv[]);
>  
> +int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
>  int do_ut_dm(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>  int do_ut_env(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>  int do_ut_overlay(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>  int do_ut_time(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
> -int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
> +int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>  
>  #endif /* __TEST_SUITES_H__ */
> diff --git a/test/Kconfig b/test/Kconfig
> index 3643761bc6..de16d179d0 100644
> --- a/test/Kconfig
> +++ b/test/Kconfig
> @@ -15,6 +15,14 @@ config UT_TIME
>  	  problems. But if you are having problems with udelay() and the like,
>  	  this is a good place to start.
>  
> +config UT_UNICODE

You probably want to move the hunk that actually builds charset.o with
CONFIG_UT_UNICODE set to this patch as well ;).

> +	bool "Unit tests for Unicode functions"
> +	depends on UNIT_TEST
> +	default y
> +	help
> +	  Enables the 'ut unicode' command which tests that the functions for
> +	  manipulating Unicode strings work correctly.
> +
>  source "test/dm/Kconfig"
>  source "test/env/Kconfig"
>  source "test/overlay/Kconfig"
> diff --git a/test/Makefile b/test/Makefile
> index 1092011fdb..a5f52fd5ad 100644
> --- a/test/Makefile
> +++ b/test/Makefile
> @@ -8,4 +8,5 @@ obj-$(CONFIG_SANDBOX) += command_ut.o
>  obj-$(CONFIG_SANDBOX) += compression.o
>  obj-$(CONFIG_SANDBOX) += print_ut.o
>  obj-$(CONFIG_UT_TIME) += time_ut.o
> +obj-$(CONFIG_UT_UNICODE) += unicode_ut.o
>  obj-$(CONFIG_$(SPL_)LOG) += log/
> diff --git a/test/cmd_ut.c b/test/cmd_ut.c
> index 934a5a931b..d6a2593850 100644
> --- a/test/cmd_ut.c
> +++ b/test/cmd_ut.c
> @@ -49,6 +49,10 @@ static cmd_tbl_t cmd_ut_sub[] = {
>  #ifdef CONFIG_UT_TIME
>  	U_BOOT_CMD_MKENT(time, CONFIG_SYS_MAXARGS, 1, do_ut_time, "", ""),
>  #endif
> +#if defined(CONFIG_UT_UNICODE) && \
> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)

CONFIG_IS_ENABLED()

Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings
  2018-08-11 15:28 ` [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings Heinrich Schuchardt
@ 2018-08-26 18:05   ` Alexander Graf
  2018-08-26 18:34     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:05 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The width and precision of the printf() function refer to the number of
> characters not to the number of bytes printed.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  lib/vsprintf.c | 22 +++++++++++++---------
>  1 file changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index a07128ad96..b7eb9d5f5e 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -280,18 +280,22 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
>  		int precision, int flags)
>  {
>  	u16 *str = s ? s : L"<NULL>";
> -	int utf16_len = u16_strnlen(str, precision);
> -	u8 utf8[utf16_len * MAX_UTF8_PER_UTF16];
> -	int utf8_len, i;
> -
> -	utf8_len = utf16_to_utf8(utf8, str, utf16_len) - utf8;
> +	ssize_t i, len = utf16_strnlen(str, precision);
>  
>  	if (!(flags & LEFT))
> -		while (utf8_len < field_width--)
> +		for (; len < field_width; --field_width)
>  			ADDCH(buf, ' ');
> -	for (i = 0; i < utf8_len; ++i)
> -		ADDCH(buf, utf8[i]);
> -	while (utf8_len < field_width--)
> +	for (i = 0; i < len; ++i) {
> +		s32 code = utf16_get((const u16 **)&str);
> +
> +		if (code < 0) {
> +			code = '?';
> +			if (*str)
> +				++str;
> +		}
> +		utf8_put(code, &buf);

Can you introduce or reuse a strcpy() helper in charset.c for this? That
way the compiler has the chance to inline utf16_get() and utf8_put() and
make the function fast.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 01/15] lib: build charset.o only if needed
  2018-08-26 17:45   ` Alexander Graf
@ 2018-08-26 18:06     ` Heinrich Schuchardt
  0 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:06 UTC (permalink / raw
  To: u-boot

On 08/26/2018 07:45 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> charset.o is only needed for the EFI subsystem
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  lib/Makefile   |  5 ++++-
>>  lib/vsprintf.c | 12 ++++++++----
>>  2 files changed, 12 insertions(+), 5 deletions(-)
>>
>> diff --git a/lib/Makefile b/lib/Makefile
>> index 5f583aed37..2fd32798a0 100644
>> --- a/lib/Makefile
>> +++ b/lib/Makefile
>> @@ -19,7 +19,10 @@ obj-$(CONFIG_ARCH_AT91) += at91/
>>  obj-$(CONFIG_OPTEE) += optee/
>>  
>>  obj-$(CONFIG_AES) += aes.o
>> -obj-y += charset.o
>> +
>> +ifndef API_BUILD
>> +obj-$(CONFIG_EFI_LOADER) += charset.o
>> +endif
>>  obj-$(CONFIG_USB_TTY) += circbuf.o
>>  obj-y += crc7.o
>>  obj-y += crc8.o
>> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
>> index 914fbd30cb..6100357858 100644
>> --- a/lib/vsprintf.c
>> +++ b/lib/vsprintf.c
>> @@ -274,6 +274,8 @@ static char *string(char *buf, char *end, char *s, int field_width,
>>  	return buf;
>>  }
>>  
>> +#if defined(CONFIG_EFI_LOADER) && \
>> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
> 
> I believe you want #if CONFIG_IS_ENABLED(EFI_LOADER) here, right? We
> probably should move to that at other places where we explicitly check
> for SPL_BUILD too.

Thanks for reviewing

Yes we can use that shorthand notation here. We still have to check
API_BUILD.

> 
>>  static char *string16(char *buf, char *end, u16 *s, int field_width,
>>  		int precision, int flags)
>>  {
>> @@ -294,8 +296,6 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
>>  	return buf;
>>  }
>>  
>> -#if defined(CONFIG_EFI_LOADER) && \
>> -	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
>>  static char *device_path_string(char *buf, char *end, void *dp, int field_width,
>>  				int precision, int flags)
>>  {
>> @@ -612,10 +612,14 @@ repeat:
>>  			continue;
>>  
>>  		case 's':
>> -			if (qualifier == 'l' && !IS_ENABLED(CONFIG_SPL_BUILD)) {
>> +#if defined(CONFIG_EFI_LOADER) && \
>> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
> 
> This #if deserves a comment.

ok

Best regards

Heinrich

> 
> 
> Alex
> 
>> +			if (qualifier == 'l') {
>>  				str = string16(str, end, va_arg(args, u16 *),
>>  					       field_width, precision, flags);
>> -			} else {
>> +			} else
>> +#endif
>> +			{
>>  				str = string(str, end, va_arg(args, char *),
>>  					     field_width, precision, flags);
>>  			}
>>
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 06/15] test: test printing Unicode
  2018-08-11 15:28 ` [U-Boot] [PATCH 06/15] test: test printing Unicode Heinrich Schuchardt
@ 2018-08-26 18:06   ` Alexander Graf
  2018-08-26 18:36     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:06 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> Test printing of Unicode strings
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  test/unicode_ut.c | 37 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 37 insertions(+)
> 
> diff --git a/test/unicode_ut.c b/test/unicode_ut.c
> index 29316606c4..8e8c4d189e 100644
> --- a/test/unicode_ut.c
> +++ b/test/unicode_ut.c
> @@ -34,6 +34,42 @@ static const char d3[] = {0xe6, 0xbd, 0x9c, 0xe6, 0xb0, 0xb4, 0xe8, 0x89,
>  static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
>  			  0xf0, 0x90, 0x92, 0x87, 0x00};
>  
> +static int ut_string16(void)
> +{
> +#if defined(CONFIG_EFI_LOADER) && \
> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
> +	char buf[20];
> +
> +	memset(buf, 0xff, sizeof(buf));
> +	sprintf(buf, "%8.6ls", c2);
> +	if (buf[1] != ' ')
> +		return -1;
> +	if (strncmp(&buf[2], d2, 7))
> +		return -1;
> +	if (buf[9])
> +		return -1;
> +
> +	memset(buf, 0xff, sizeof(buf));
> +	sprintf(buf, "%8.6ls", c4);
> +	if (buf[4] != ' ')
> +		return -1;
> +	if (strncmp(&buf[5], d4, 12))
> +		return -1;
> +	if (buf[17])
> +		return -1;
> +
> +	memset(buf, 0xff, sizeof(buf));
> +	sprintf(buf, "%-8.2ls", c4);
> +	if (strncmp(buf, d4, 8))
> +		return -1;
> +	if (buf[8] != ' ')
> +		return -1;
> +	if (buf[14])
> +		return -1;
> +#endif

Does this check the illegal character '?' case as well?


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length
  2018-08-11 15:28 ` [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length Heinrich Schuchardt
@ 2018-08-26 18:13   ` Alexander Graf
  2018-08-26 18:40     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:13 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The EFI spec does not provide a length limit for variables.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  lib/efi_loader/efi_variable.c | 52 ++++++++++++++++++++---------------
>  1 file changed, 30 insertions(+), 22 deletions(-)
> 
> diff --git a/lib/efi_loader/efi_variable.c b/lib/efi_loader/efi_variable.c
> index 770c67abb9..495738884b 100644
> --- a/lib/efi_loader/efi_variable.c
> +++ b/lib/efi_loader/efi_variable.c
> @@ -44,10 +44,7 @@
>   * converted to utf16?
>   */
>  
> -#define MAX_VAR_NAME 31
> -#define MAX_NATIVE_VAR_NAME \
> -	(strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_") + \
> -		(MAX_VAR_NAME * MAX_UTF8_PER_UTF16))
> +#define PREFIX_LEN (strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_"))
>  
>  static int hex(int ch)
>  {
> @@ -101,18 +98,20 @@ static char *mem2hex(char *hexstr, const u8 *mem, int count)
>  	return hexstr;
>  }
>  
> -static efi_status_t efi_to_native(char *native, u16 *variable_name,
> +static efi_status_t efi_to_native(char **native, const u16 *variable_name,
>  				  efi_guid_t *vendor)
>  {
>  	size_t len;
> +	char *pos;
>  
> -	len = u16_strlen((u16 *)variable_name);
> -	if (len >= MAX_VAR_NAME)
> -		return EFI_DEVICE_ERROR;
> +	len = PREFIX_LEN + utf16_utf8_strlen(variable_name) + 1;
> +	*native = malloc(len);
> +	if (!*native)
> +		return EFI_OUT_OF_RESOURCES;
>  
> -	native += sprintf(native, "efi_%pUl_", vendor);
> -	native  = (char *)utf16_to_utf8((u8 *)native, (u16 *)variable_name, len);
> -	*native = '\0';
> +	pos = *native;
> +	pos += sprintf(pos, "efi_%pUl_", vendor);
> +	utf16_utf8_strcpy(&pos, variable_name);
>  
>  	return EFI_SUCCESS;
>  }
> @@ -168,7 +167,7 @@ efi_status_t EFIAPI efi_get_variable(u16 *variable_name, efi_guid_t *vendor,
>  				     u32 *attributes, efi_uintn_t *data_size,
>  				     void *data)
>  {
> -	char native_name[MAX_NATIVE_VAR_NAME + 1];
> +	char *native_name;

I think you want to predefine this to = NULL to make sure that an error
path doesn't give you uninitialized values on free().


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer
  2018-08-11 15:28 ` [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer Heinrich Schuchardt
@ 2018-08-26 18:16   ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:16 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The length of a string printed to the console by the
> EFI_SIMPLE_TEXT_OUTPUT_PROTOCOL is not limited by the UEFI spec.
> Hence we should not allocate a buffer for it on the stack.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>

Reviewed-by: Alexander Graf <agraf@suse.de>


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options
  2018-08-11 15:28 ` [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options Heinrich Schuchardt
@ 2018-08-26 18:17   ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:17 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The number of bytes in an utf-8 string is an upper limit for the number of
> words in the equivalent utf-16 string. In so far the incumbent coding works
> correctly. For non-ASCII characters the utf-16 string is shorter. With the
> patch only the necessary buffer size is allocated for the load options.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>

Reviewed-by: Alexander Graf <agraf@suse.de>


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-26 17:52   ` Alexander Graf
@ 2018-08-26 18:21     ` Heinrich Schuchardt
  2018-08-26 18:33       ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:21 UTC (permalink / raw
  To: u-boot

On 08/26/2018 07:52 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> The function names utf16_strlen() and utf16_strnlen() are misnomers.
>> The functions do not count utf-16 characters but non-zero words.
>> So let's rename them to u16_strlen and u16_strnlen().
>>
>> In utf16_dup() avoid assignment in if clause.
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  include/charset.h             | 28 +++++++++++-----------------
>>  lib/charset.c                 | 10 +++++++---
>>  lib/efi_loader/efi_bootmgr.c  |  2 +-
>>  lib/efi_loader/efi_console.c  |  2 +-
>>  lib/efi_loader/efi_file.c     |  2 +-
>>  lib/efi_loader/efi_variable.c |  2 +-
>>  lib/vsprintf.c                |  2 +-
>>  7 files changed, 23 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/charset.h b/include/charset.h
>> index 11832cbd12..2307559890 100644
>> --- a/include/charset.h
>> +++ b/include/charset.h
>> @@ -13,29 +13,23 @@
>>  #define MAX_UTF8_PER_UTF16 3
>>  
>>  /**
>> - * utf16_strlen() - Get the length of an utf16 string
>> + * u16_strlen - count non-zero words
> 
> This really just implements wcslen() now, right?
> 
>>   *
>> - * Returns the number of 16 bit characters in an utf16 string, not
>> - * including the terminating NULL character.
>> - *
>> - * @in     the string to measure
>> - * @return the string length
>> + * @in:			utf-16 string
> 
> Is "in" really a utf-16 string? Probably rather a null-terminated string
> of words.

I will update the comment.

> 
>> + * ReturnValue:		number of non-zero words.
>> + *			This is not the number of utf-16 letters!
>>   */
>> -size_t utf16_strlen(const uint16_t *in);
>> +size_t u16_strlen(const u16 *in);
>>  
>>  /**
>> - * utf16_strnlen() - Get the length of a fixed-size utf16 string.
>> - *
>> - * Returns the number of 16 bit characters in an utf16 string,
>> - * not including the terminating NULL character, but at most
>> - * 'count' number of characters.  In doing this, utf16_strnlen()
>> - * looks at only the first 'count' characters.
>> + * u16_strlen - count non-zero words
> 
> This really just implements wcsnlen() now, right?

Currently we have set wchar size to 16bit using a compiler flag. In my
opinion this was not necessary. In C11 we could have used the u"text"
notation for utf-16 string constants instead of L"text".

This function really is for u16[] and not for wchar_t[].

I would hesitate to call this function wcsnlen() as the working of
wcsnlen() depends on said compiler setting.

> 
>>   *
>> - * @in     the string to measure
>> - * @count  the maximum number of characters to count
>> - * @return the string length, up to a maximum of 'count'
>> + * @in:			utf-16 string
>

Yes u16 string.

Thanks for reviewing.

Heinrich


> Same comment here.
> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-11 15:28 ` [U-Boot] [PATCH 11/15] efi_loader: capitalization table Heinrich Schuchardt
@ 2018-08-26 18:22   ` Alexander Graf
  2018-08-26 19:00     ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:22 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> This patch provides a define to initialize a table that maps lower to
> capital letters for Unicode code point 0x0000 - 0xffff.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
> ---
>  MAINTAINERS              |    1 +
>  include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>  2 files changed, 1910 insertions(+)
>  create mode 100644 include/capitalization.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a324139471..0a543309f2 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -368,6 +368,7 @@ F:	doc/DocBook/efi.tmpl
>  F:	doc/README.uefi
>  F:	doc/README.iscsi
>  F:	Documentation/efi.rst
> +F:	include/capitalization.h
>  F:	include/efi*
>  F:	include/pe.h
>  F:	include/asm-generic/pe.h
> diff --git a/include/capitalization.h b/include/capitalization.h
> new file mode 100644
> index 0000000000..50d5108f98
> --- /dev/null
> +++ b/include/capitalization.h
> @@ -0,0 +1,1909 @@
> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
> +/*
> + * Correspondence table for small and capital Unicode letters in the range of
> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
> + */
> +
> +struct capitalization_table {
> +	u16 upper;
> +	u16 lower;
> +};
> +
> +#define UNICODE_CAPITALIZATION_TABLE { \

Ugh, that is a *lot* of data. How much does the binary size grow with
the table compiled in?

Is there any slightly more sophisticated pattern in the table maybe that
we could just express as code? Would that turn out smaller maybe?


Alex

> +	{ 0x0531, /* ARMENIAN CAPITAL LETTER AYB */ \
> +	  0x0561, /* ARMENIAN SMALL LETTER AYB */ }, \
> +	{ 0x0532, /* ARMENIAN CAPITAL LETTER BEN */ \
> +	  0x0562, /* ARMENIAN SMALL LETTER BEN */ }, \
> +	{ 0x053E, /* ARMENIAN CAPITAL LETTER CA */ \
> +	  0x056E, /* ARMENIAN SMALL LETTER CA */ }, \
> +	{ 0x0549, /* ARMENIAN CAPITAL LETTER CHA */ \

[...]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions
  2018-08-26 18:02   ` Alexander Graf
@ 2018-08-26 18:27     ` Heinrich Schuchardt
  0 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:27 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:02 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> Provide unit tests for Unicode functions.
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  MAINTAINERS           |   1 +
>>  include/test/suites.h |   3 +-
>>  test/Kconfig          |   8 +
>>  test/Makefile         |   1 +
>>  test/cmd_ut.c         |  14 +-
>>  test/unicode_ut.c     | 470 ++++++++++++++++++++++++++++++++++++++++++
>>  6 files changed, 493 insertions(+), 4 deletions(-)
>>  create mode 100644 test/unicode_ut.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 51a1472cf8..a324139471 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -374,6 +374,7 @@ F:	include/asm-generic/pe.h
>>  F:	lib/charset.c
>>  F:	lib/efi*/
>>  F:	test/py/tests/test_efi*
>> +F:	test/unicode_ut.c
>>  F:	cmd/bootefi.c
>>  F:	tools/file2include.c
>>  
>> diff --git a/include/test/suites.h b/include/test/suites.h
>> index b5019a7cd2..8e4eac60eb 100644
>> --- a/include/test/suites.h
>> +++ b/include/test/suites.h
>> @@ -23,10 +23,11 @@ struct unit_test;
>>  int cmd_ut_category(const char *name, struct unit_test *tests, int n_ents,
>>  		    int argc, char * const argv[]);
>>  
>> +int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
>>  int do_ut_dm(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>>  int do_ut_env(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>>  int do_ut_overlay(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>>  int do_ut_time(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>> -int do_ut_compression(cmd_tbl_t *cmdtp, int flag, int argc, char *const argv[]);
>> +int do_ut_unicode(cmd_tbl_t *cmdtp, int flag, int argc, char * const argv[]);
>>  
>>  #endif /* __TEST_SUITES_H__ */
>> diff --git a/test/Kconfig b/test/Kconfig
>> index 3643761bc6..de16d179d0 100644
>> --- a/test/Kconfig
>> +++ b/test/Kconfig
>> @@ -15,6 +15,14 @@ config UT_TIME
>>  	  problems. But if you are having problems with udelay() and the like,
>>  	  this is a good place to start.
>>  
>> +config UT_UNICODE
> 
> You probably want to move the hunk that actually builds charset.o with
> CONFIG_UT_UNICODE set to this patch as well ;).

Yes there is a bit in patch "lib: charset: utility functions for
Unicode" that could move to this patch.

> 
>> +	bool "Unit tests for Unicode functions"
>> +	depends on UNIT_TEST
>> +	default y
>> +	help
>> +	  Enables the 'ut unicode' command which tests that the functions for
>> +	  manipulating Unicode strings work correctly.
>> +
>>  source "test/dm/Kconfig"
>>  source "test/env/Kconfig"
>>  source "test/overlay/Kconfig"
>> diff --git a/test/Makefile b/test/Makefile
>> index 1092011fdb..a5f52fd5ad 100644
>> --- a/test/Makefile
>> +++ b/test/Makefile
>> @@ -8,4 +8,5 @@ obj-$(CONFIG_SANDBOX) += command_ut.o
>>  obj-$(CONFIG_SANDBOX) += compression.o
>>  obj-$(CONFIG_SANDBOX) += print_ut.o
>>  obj-$(CONFIG_UT_TIME) += time_ut.o
>> +obj-$(CONFIG_UT_UNICODE) += unicode_ut.o
>>  obj-$(CONFIG_$(SPL_)LOG) += log/
>> diff --git a/test/cmd_ut.c b/test/cmd_ut.c
>> index 934a5a931b..d6a2593850 100644
>> --- a/test/cmd_ut.c
>> +++ b/test/cmd_ut.c
>> @@ -49,6 +49,10 @@ static cmd_tbl_t cmd_ut_sub[] = {
>>  #ifdef CONFIG_UT_TIME
>>  	U_BOOT_CMD_MKENT(time, CONFIG_SYS_MAXARGS, 1, do_ut_time, "", ""),
>>  #endif
>> +#if defined(CONFIG_UT_UNICODE) && \
>> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
> 
> CONFIG_IS_ENABLED()

ok

Heinrich.

> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL
  2018-08-11 15:28 ` [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
@ 2018-08-26 18:31   ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:31 UTC (permalink / raw
  To: u-boot



On 11.08.18 17:28, Heinrich Schuchardt wrote:
> The patch implements the EFI_UNICODE_COLLATION_PROTOCOL.
> 
> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>

Can you please add documentation to each function?


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-26 18:21     ` Heinrich Schuchardt
@ 2018-08-26 18:33       ` Alexander Graf
  2018-08-26 19:36         ` Heinrich Schuchardt
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 18:33 UTC (permalink / raw
  To: u-boot



On 26.08.18 20:21, Heinrich Schuchardt wrote:
> On 08/26/2018 07:52 PM, Alexander Graf wrote:
>>
>>
>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>> The function names utf16_strlen() and utf16_strnlen() are misnomers.
>>> The functions do not count utf-16 characters but non-zero words.
>>> So let's rename them to u16_strlen and u16_strnlen().
>>>
>>> In utf16_dup() avoid assignment in if clause.
>>>
>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>> ---
>>>  include/charset.h             | 28 +++++++++++-----------------
>>>  lib/charset.c                 | 10 +++++++---
>>>  lib/efi_loader/efi_bootmgr.c  |  2 +-
>>>  lib/efi_loader/efi_console.c  |  2 +-
>>>  lib/efi_loader/efi_file.c     |  2 +-
>>>  lib/efi_loader/efi_variable.c |  2 +-
>>>  lib/vsprintf.c                |  2 +-
>>>  7 files changed, 23 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/include/charset.h b/include/charset.h
>>> index 11832cbd12..2307559890 100644
>>> --- a/include/charset.h
>>> +++ b/include/charset.h
>>> @@ -13,29 +13,23 @@
>>>  #define MAX_UTF8_PER_UTF16 3
>>>  
>>>  /**
>>> - * utf16_strlen() - Get the length of an utf16 string
>>> + * u16_strlen - count non-zero words
>>
>> This really just implements wcslen() now, right?
>>
>>>   *
>>> - * Returns the number of 16 bit characters in an utf16 string, not
>>> - * including the terminating NULL character.
>>> - *
>>> - * @in     the string to measure
>>> - * @return the string length
>>> + * @in:			utf-16 string
>>
>> Is "in" really a utf-16 string? Probably rather a null-terminated string
>> of words.
> 
> I will update the comment.
> 
>>
>>> + * ReturnValue:		number of non-zero words.
>>> + *			This is not the number of utf-16 letters!
>>>   */
>>> -size_t utf16_strlen(const uint16_t *in);
>>> +size_t u16_strlen(const u16 *in);
>>>  
>>>  /**
>>> - * utf16_strnlen() - Get the length of a fixed-size utf16 string.
>>> - *
>>> - * Returns the number of 16 bit characters in an utf16 string,
>>> - * not including the terminating NULL character, but at most
>>> - * 'count' number of characters.  In doing this, utf16_strnlen()
>>> - * looks at only the first 'count' characters.
>>> + * u16_strlen - count non-zero words
>>
>> This really just implements wcsnlen() now, right?
> 
> Currently we have set wchar size to 16bit using a compiler flag. In my
> opinion this was not necessary. In C11 we could have use the u"text"
> notation for utf-8 string constants instead of L"text".

I thought the idea was to get utf-16 string constants?

> 
> This function really is for u16[] and not for wchar_t[].
> 
> I would hesitate to call this function wcsnlen() as the working of
> wcsnlen() depends on said compiler setting.

*shrug* either way works for me. By calling them their official names we
could've potentially given gcc the chance to optimize/inline them better.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings
  2018-08-26 18:05   ` Alexander Graf
@ 2018-08-26 18:34     ` Heinrich Schuchardt
  2018-08-26 22:01       ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:34 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:05 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> The width and precision of the printf() function refer to the number of
>> characters not to the number of bytes printed.
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  lib/vsprintf.c | 22 +++++++++++++---------
>>  1 file changed, 13 insertions(+), 9 deletions(-)
>>
>> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
>> index a07128ad96..b7eb9d5f5e 100644
>> --- a/lib/vsprintf.c
>> +++ b/lib/vsprintf.c
>> @@ -280,18 +280,22 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
>>  		int precision, int flags)
>>  {
>>  	u16 *str = s ? s : L"<NULL>";
>> -	int utf16_len = u16_strnlen(str, precision);
>> -	u8 utf8[utf16_len * MAX_UTF8_PER_UTF16];
>> -	int utf8_len, i;
>> -
>> -	utf8_len = utf16_to_utf8(utf8, str, utf16_len) - utf8;
>> +	ssize_t i, len = utf16_strnlen(str, precision);
>>  
>>  	if (!(flags & LEFT))
>> -		while (utf8_len < field_width--)
>> +		for (; len < field_width; --field_width)
>>  			ADDCH(buf, ' ');
>> -	for (i = 0; i < utf8_len; ++i)
>> -		ADDCH(buf, utf8[i]);
>> -	while (utf8_len < field_width--)
>> +	for (i = 0; i < len; ++i) {
>> +		s32 code = utf16_get((const u16 **)&str);
>> +
>> +		if (code < 0) {
>> +			code = '?';
>> +			if (*str)
>> +				++str;
>> +		}
>> +		utf8_put(code, &buf);
> 
> Can you introduce or reuse a strcpy() helper in charset.c for this? That
> way the compiler has the chance to inline utf16_get() and utf8_put() and
> make the function fast.

strcpy() works on bytes not on multi-byte utf-8 characters. So it is
unclear to me how I should make use of strcpy() here.

Of course we could define utf8_put() and utf8_get() as inline functions.
But that would increase code size. Is this what you would prefer? I
would guess that the serial interface is always the slowest part of text
output anyway.

Regards

Heinrich

> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 06/15] test: test printing Unicode
  2018-08-26 18:06   ` Alexander Graf
@ 2018-08-26 18:36     ` Heinrich Schuchardt
  0 siblings, 0 replies; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:36 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:06 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> Test printing of Unicode strings
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  test/unicode_ut.c | 37 +++++++++++++++++++++++++++++++++++++
>>  1 file changed, 37 insertions(+)
>>
>> diff --git a/test/unicode_ut.c b/test/unicode_ut.c
>> index 29316606c4..8e8c4d189e 100644
>> --- a/test/unicode_ut.c
>> +++ b/test/unicode_ut.c
>> @@ -34,6 +34,42 @@ static const char d3[] = {0xe6, 0xbd, 0x9c, 0xe6, 0xb0, 0xb4, 0xe8, 0x89,
>>  static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
>>  			  0xf0, 0x90, 0x92, 0x87, 0x00};
>>  
>> +static int ut_string16(void)
>> +{
>> +#if defined(CONFIG_EFI_LOADER) && \
>> +	!defined(CONFIG_SPL_BUILD) && !defined(API_BUILD)
>> +	char buf[20];
>> +
>> +	memset(buf, 0xff, sizeof(buf));
>> +	sprintf(buf, "%8.6ls", c2);
>> +	if (buf[1] != ' ')
>> +		return -1;
>> +	if (strncmp(&buf[2], d2, 7))
>> +		return -1;
>> +	if (buf[9])
>> +		return -1;
>> +
>> +	memset(buf, 0xff, sizeof(buf));
>> +	sprintf(buf, "%8.6ls", c4);
>> +	if (buf[4] != ' ')
>> +		return -1;
>> +	if (strncmp(&buf[5], d4, 12))
>> +		return -1;
>> +	if (buf[17])
>> +		return -1;
>> +
>> +	memset(buf, 0xff, sizeof(buf));
>> +	sprintf(buf, "%-8.2ls", c4);
>> +	if (strncmp(buf, d4, 8))
>> +		return -1;
>> +	if (buf[8] != ' ')
>> +		return -1;
>> +	if (buf[14])
>> +		return -1;
>> +#endif
> 
> Does this check the illegal character '?' case as well?

No. I can add this.

Best regards

Heinrich

> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length
  2018-08-26 18:13   ` Alexander Graf
@ 2018-08-26 18:40     ` Heinrich Schuchardt
  2018-08-26 22:04       ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 18:40 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:13 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> The EFI spec does not provide a length limit for variables.
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  lib/efi_loader/efi_variable.c | 52 ++++++++++++++++++++---------------
>>  1 file changed, 30 insertions(+), 22 deletions(-)
>>
>> diff --git a/lib/efi_loader/efi_variable.c b/lib/efi_loader/efi_variable.c
>> index 770c67abb9..495738884b 100644
>> --- a/lib/efi_loader/efi_variable.c
>> +++ b/lib/efi_loader/efi_variable.c
>> @@ -44,10 +44,7 @@
>>   * converted to utf16?
>>   */
>>  
>> -#define MAX_VAR_NAME 31
>> -#define MAX_NATIVE_VAR_NAME \
>> -	(strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_") + \
>> -		(MAX_VAR_NAME * MAX_UTF8_PER_UTF16))
>> +#define PREFIX_LEN (strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_"))
>>  
>>  static int hex(int ch)
>>  {
>> @@ -101,18 +98,20 @@ static char *mem2hex(char *hexstr, const u8 *mem, int count)
>>  	return hexstr;
>>  }
>>  
>> -static efi_status_t efi_to_native(char *native, u16 *variable_name,
>> +static efi_status_t efi_to_native(char **native, const u16 *variable_name,
>>  				  efi_guid_t *vendor)
>>  {
>>  	size_t len;
>> +	char *pos;
>>  
>> -	len = u16_strlen((u16 *)variable_name);
>> -	if (len >= MAX_VAR_NAME)
>> -		return EFI_DEVICE_ERROR;
>> +	len = PREFIX_LEN + utf16_utf8_strlen(variable_name) + 1;
>> +	*native = malloc(len);
>> +	if (!*native)
>> +		return EFI_OUT_OF_RESOURCES;
>>  
>> -	native += sprintf(native, "efi_%pUl_", vendor);
>> -	native  = (char *)utf16_to_utf8((u8 *)native, (u16 *)variable_name, len);
>> -	*native = '\0';
>> +	pos = *native;
>> +	pos += sprintf(pos, "efi_%pUl_", vendor);
>> +	utf16_utf8_strcpy(&pos, variable_name);
>>  
>>  	return EFI_SUCCESS;
>>  }
>> @@ -168,7 +167,7 @@ efi_status_t EFIAPI efi_get_variable(u16 *variable_name, efi_guid_t *vendor,
>>  				     u32 *attributes, efi_uintn_t *data_size,
>>  				     void *data)
>>  {
>> -	char native_name[MAX_NATIVE_VAR_NAME + 1];
>> +	char *native_name;
> 
> I think you want to predefine this to = NULL to make sure that an error
> path doesn't give you uninitialized values on free().

efi_to_native() returns EFI_OUT_OF_RESOURCES if the pointer cannot be
assigned and the return value is checked. So how should I reach
free(native_name) in this case?

Best regards

Heinrich

> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-26 18:22   ` Alexander Graf
@ 2018-08-26 19:00     ` Heinrich Schuchardt
  2018-08-26 22:06       ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 19:00 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:22 PM, Alexander Graf wrote:
> 
> 
> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>> This patch provides a define to initialize a table that maps lower to
>> capital letters for Unicode code point 0x0000 - 0xffff.
>>
>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>> ---
>>  MAINTAINERS              |    1 +
>>  include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 1910 insertions(+)
>>  create mode 100644 include/capitalization.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index a324139471..0a543309f2 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -368,6 +368,7 @@ F:	doc/DocBook/efi.tmpl
>>  F:	doc/README.uefi
>>  F:	doc/README.iscsi
>>  F:	Documentation/efi.rst
>> +F:	include/capitalization.h
>>  F:	include/efi*
>>  F:	include/pe.h
>>  F:	include/asm-generic/pe.h
>> diff --git a/include/capitalization.h b/include/capitalization.h
>> new file mode 100644
>> index 0000000000..50d5108f98
>> --- /dev/null
>> +++ b/include/capitalization.h
>> @@ -0,0 +1,1909 @@
>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>> +/*
>> + * Correspondence table for small and capital Unicode letters in the range of
>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>> + */
>> +
>> +struct capitalization_table {
>> +	u16 upper;
>> +	u16 lower;
>> +};
>> +
>> +#define UNICODE_CAPITALIZATION_TABLE { \
> 
> Ugh, that is a *lot* of data. How much does the binary size grow with
> the table compiled in?
> 
> Is there any slightly more sophisticated pattern in the table maybe that
> we could just express as code? Would that turn out smaller maybe?

This is 3792 bytes of data. Unicode capitalization is quite random in
arranging lower and upper letters.

We could resort to zlib or gzip. But these libraries are not built by
default.

Most urgently we will need the capitalization table for generating and
checking short FAT filenames, so we could create a configuration switch
that would reduce this table to codepage 437 or codepage 1250 letters
depending on the chosen native character set.

In EDK2 I only found code for codepage 1250.

Best regards

Heinrich

> 
> 
> Alex
> 
>> +	{ 0x0531, /* ARMENIAN CAPITAL LETTER AYB */ \
>> +	  0x0561, /* ARMENIAN SMALL LETTER AYB */ }, \
>> +	{ 0x0532, /* ARMENIAN CAPITAL LETTER BEN */ \
>> +	  0x0562, /* ARMENIAN SMALL LETTER BEN */ }, \
>> +	{ 0x053E, /* ARMENIAN CAPITAL LETTER CA */ \
>> +	  0x056E, /* ARMENIAN SMALL LETTER CA */ }, \
>> +	{ 0x0549, /* ARMENIAN CAPITAL LETTER CHA */ \
> 
> [...]
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-26 18:33       ` Alexander Graf
@ 2018-08-26 19:36         ` Heinrich Schuchardt
  2018-08-26 21:57           ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Heinrich Schuchardt @ 2018-08-26 19:36 UTC (permalink / raw
  To: u-boot

On 08/26/2018 08:33 PM, Alexander Graf wrote:
> 
> 
> On 26.08.18 20:21, Heinrich Schuchardt wrote:
>> On 08/26/2018 07:52 PM, Alexander Graf wrote:
>>>
>>>
>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>> The function names utf16_strlen() and utf16_strnlen() are misnomers.
>>>> The functions do not count utf-16 characters but non-zero words.
>>>> So let's rename them to u16_strlen and u16_strnlen().
>>>>
>>>> In utf16_dup() avoid assignment in if clause.
>>>>
>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>> ---
>>>>  include/charset.h             | 28 +++++++++++-----------------
>>>>  lib/charset.c                 | 10 +++++++---
>>>>  lib/efi_loader/efi_bootmgr.c  |  2 +-
>>>>  lib/efi_loader/efi_console.c  |  2 +-
>>>>  lib/efi_loader/efi_file.c     |  2 +-
>>>>  lib/efi_loader/efi_variable.c |  2 +-
>>>>  lib/vsprintf.c                |  2 +-
>>>>  7 files changed, 23 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/include/charset.h b/include/charset.h
>>>> index 11832cbd12..2307559890 100644
>>>> --- a/include/charset.h
>>>> +++ b/include/charset.h
>>>> @@ -13,29 +13,23 @@
>>>>  #define MAX_UTF8_PER_UTF16 3
>>>>  
>>>>  /**
>>>> - * utf16_strlen() - Get the length of an utf16 string
>>>> + * u16_strlen - count non-zero words
>>>
>>> This really just implements wcslen() now, right?
>>>
>>>>   *
>>>> - * Returns the number of 16 bit characters in an utf16 string, not
>>>> - * including the terminating NULL character.
>>>> - *
>>>> - * @in     the string to measure
>>>> - * @return the string length
>>>> + * @in:			utf-16 string
>>>
>>> Is "in" really a utf-16 string? Probably rather a null-terminated string
>>> of words.
>>
>> I will update the comment.
>>
>>>
>>>> + * ReturnValue:		number of non-zero words.
>>>> + *			This is not the number of utf-16 letters!
>>>>   */
>>>> -size_t utf16_strlen(const uint16_t *in);
>>>> +size_t u16_strlen(const u16 *in);
>>>>  
>>>>  /**
>>>> - * utf16_strnlen() - Get the length of a fixed-size utf16 string.
>>>> - *
>>>> - * Returns the number of 16 bit characters in an utf16 string,
>>>> - * not including the terminating NULL character, but at most
>>>> - * 'count' number of characters.  In doing this, utf16_strnlen()
>>>> - * looks at only the first 'count' characters.
>>>> + * u16_strlen - count non-zero words
>>>
>>> This really just implements wcsnlen() now, right?
>>
>> Currently we have set wchar size to 16bit using a compiler flag. In my
>> opinion this was not necessary. In C11 we could have use the u"text"
>> notation for utf-8 string constants instead of L"text".
> 
> I thought the idea was to get utf-16 string constants?

Yes, but the C11 way is not using a compiler flag but using the right
prefix (which was not available with C99). So once we do not have to
support outdated gcc versions anymore we could get rid of the compiler flag.

> 
>>
>> This function really is for u16[] and not for wchar_t[].
>>
>> I would hesitate to call this function wcsnlen() as the working of
>> wcsnlen() depends on said compiler setting.
> 
> *shrug* either way works for me. By calling them their official names we
> could've potentially given gcc the chance to optimize/inline them better.

Do you have any hint that gcc really provides better compilation results
based on the function name?

From what I see in the gcc code only the address sanitizer is aware that
wcsnlen() is accessing a memory range.

Best regards

Heinrich

> 
> 
> Alex
> 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen
  2018-08-26 19:36         ` Heinrich Schuchardt
@ 2018-08-26 21:57           ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 21:57 UTC (permalink / raw
  To: u-boot



On 26.08.18 21:36, Heinrich Schuchardt wrote:
> On 08/26/2018 08:33 PM, Alexander Graf wrote:
>>
>>
>> On 26.08.18 20:21, Heinrich Schuchardt wrote:
>>> On 08/26/2018 07:52 PM, Alexander Graf wrote:
>>>>
>>>>
>>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>>> The function names utf16_strlen() and utf16_strnlen() are misnomers.
>>>>> The functions do not count utf-16 characters but non-zero words.
>>>>> So let's rename them to u16_strlen and u16_strnlen().
>>>>>
>>>>> In utf16_dup() avoid assignment in if clause.
>>>>>
>>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>>> ---
>>>>>  include/charset.h             | 28 +++++++++++-----------------
>>>>>  lib/charset.c                 | 10 +++++++---
>>>>>  lib/efi_loader/efi_bootmgr.c  |  2 +-
>>>>>  lib/efi_loader/efi_console.c  |  2 +-
>>>>>  lib/efi_loader/efi_file.c     |  2 +-
>>>>>  lib/efi_loader/efi_variable.c |  2 +-
>>>>>  lib/vsprintf.c                |  2 +-
>>>>>  7 files changed, 23 insertions(+), 25 deletions(-)
>>>>>
>>>>> diff --git a/include/charset.h b/include/charset.h
>>>>> index 11832cbd12..2307559890 100644
>>>>> --- a/include/charset.h
>>>>> +++ b/include/charset.h
>>>>> @@ -13,29 +13,23 @@
>>>>>  #define MAX_UTF8_PER_UTF16 3
>>>>>  
>>>>>  /**
>>>>> - * utf16_strlen() - Get the length of an utf16 string
>>>>> + * u16_strlen - count non-zero words
>>>>
>>>> This really just implements wcslen() now, right?
>>>>
>>>>>   *
>>>>> - * Returns the number of 16 bit characters in an utf16 string, not
>>>>> - * including the terminating NULL character.
>>>>> - *
>>>>> - * @in     the string to measure
>>>>> - * @return the string length
>>>>> + * @in:			utf-16 string
>>>>
>>>> Is "in" really a utf-16 string? Probably rather a null-terminated string
>>>> of words.
>>>
>>> I will update the comment.
>>>
>>>>
>>>>> + * ReturnValue:		number of non-zero words.
>>>>> + *			This is not the number of utf-16 letters!
>>>>>   */
>>>>> -size_t utf16_strlen(const uint16_t *in);
>>>>> +size_t u16_strlen(const u16 *in);
>>>>>  
>>>>>  /**
>>>>> - * utf16_strnlen() - Get the length of a fixed-size utf16 string.
>>>>> - *
>>>>> - * Returns the number of 16 bit characters in an utf16 string,
>>>>> - * not including the terminating NULL character, but at most
>>>>> - * 'count' number of characters.  In doing this, utf16_strnlen()
>>>>> - * looks at only the first 'count' characters.
>>>>> + * u16_strlen - count non-zero words
>>>>
>>>> This really just implements wcsnlen() now, right?
>>>
>>> Currently we have set wchar size to 16bit using a compiler flag. In my
>>> opinion this was not necessary. In C11 we could have use the u"text"
>>> notation for utf-8 string constants instead of L"text".
>>
>> I thought the idea was to get utf-16 string constants?
> 
> Yes, but the C11 way is not using a compiler flag but using the right
> prefix (which was not available with C99). So once we do not have to
> support outdated gcc versions anymore we could get rid of the compiler flag.

I think we are in agreement, I just wanted to point out that u"text"
creates a utf-16 string rather than a utf-8 one :).

Unfortunately I don't know when we can bump the compiler requirement,
but it can't be forever and we should keep it in mind, I agree.

> 
>>
>>>
>>> This function really is for u16[] and not for wchar_t[].
>>>
>>> I would hesitate to call this function wcsnlen() as the working of
>>> wcsnlen() depends on said compiler setting.
>>
>> *shrug* either way works for me. By calling them their official names we
>> could've potentially given gcc the chance to optimize/inline them better.
> 
> Do have any hint that gcc really provides better compilation results
> based on the function name?

I seem to recall that gcc at least allowed for strcpy() and friends to
get inlined. The point was really more about using function names that
do exactly what people are used to.

I don't have a terribly strong feeling towards it though, as strlen() is
definitely much more widely used than wcslen() and thus much more well known.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings
  2018-08-26 18:34     ` Heinrich Schuchardt
@ 2018-08-26 22:01       ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 22:01 UTC (permalink / raw
  To: u-boot



On 26.08.18 20:34, Heinrich Schuchardt wrote:
> On 08/26/2018 08:05 PM, Alexander Graf wrote:
>>
>>
>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>> The width and precision of the printf() function refer to the number of
>>> characters not to the number of bytes printed.
>>>
>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>> ---
>>>  lib/vsprintf.c | 22 +++++++++++++---------
>>>  1 file changed, 13 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
>>> index a07128ad96..b7eb9d5f5e 100644
>>> --- a/lib/vsprintf.c
>>> +++ b/lib/vsprintf.c
>>> @@ -280,18 +280,22 @@ static char *string16(char *buf, char *end, u16 *s, int field_width,
>>>  		int precision, int flags)
>>>  {
>>>  	u16 *str = s ? s : L"<NULL>";
>>> -	int utf16_len = u16_strnlen(str, precision);
>>> -	u8 utf8[utf16_len * MAX_UTF8_PER_UTF16];
>>> -	int utf8_len, i;
>>> -
>>> -	utf8_len = utf16_to_utf8(utf8, str, utf16_len) - utf8;
>>> +	ssize_t i, len = utf16_strnlen(str, precision);
>>>  
>>>  	if (!(flags & LEFT))
>>> -		while (utf8_len < field_width--)
>>> +		for (; len < field_width; --field_width)
>>>  			ADDCH(buf, ' ');
>>> -	for (i = 0; i < utf8_len; ++i)
>>> -		ADDCH(buf, utf8[i]);
>>> -	while (utf8_len < field_width--)
>>> +	for (i = 0; i < len; ++i) {
>>> +		s32 code = utf16_get((const u16 **)&str);
>>> +
>>> +		if (code < 0) {
>>> +			code = '?';
>>> +			if (*str)
>>> +				++str;
>>> +		}
>>> +		utf8_put(code, &buf);
>>
>> Can you introduce or reuse a strcpy() helper in charset.c for this? That
>> way the compiler has the chance to inline utf16_get() and utf8_put() and
>> make the function fast.
> 
> strcpy() works on bytes not on multi-byte utf-8 characters. So it is
> unclear to me how I should make use of strcpy() here.

What I was trying to imply is that what you're doing here is very
similar to utf8_utf16_strncpy(). Maybe we can reuse the same function or
at least something very similar.

> Of cause we could define utf8_put() and utf8_get() as inline function.
> But that would increase code size. Is this what you would prefer? I
> would guess that the serial interface is always the slowest part of text
> output anyway.

Real serial output is definitely orders of magnitude slower, I agree.
But if we can make the code easier to read along the way I'm all for it ;).

I think what it boils down to is that I'd prefer if we keep
utf{8,16}_{get,put}() as local to charset.c as we can and instead put
slightly higher level wrappers around them, like you did for pretty much
everything else.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length
  2018-08-26 18:40     ` Heinrich Schuchardt
@ 2018-08-26 22:04       ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 22:04 UTC (permalink / raw
  To: u-boot



On 26.08.18 20:40, Heinrich Schuchardt wrote:
> On 08/26/2018 08:13 PM, Alexander Graf wrote:
>>
>>
>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>> The EFI spec does not provide a length limit for variables.
>>>
>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>> ---
>>>  lib/efi_loader/efi_variable.c | 52 ++++++++++++++++++++---------------
>>>  1 file changed, 30 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/lib/efi_loader/efi_variable.c b/lib/efi_loader/efi_variable.c
>>> index 770c67abb9..495738884b 100644
>>> --- a/lib/efi_loader/efi_variable.c
>>> +++ b/lib/efi_loader/efi_variable.c
>>> @@ -44,10 +44,7 @@
>>>   * converted to utf16?
>>>   */
>>>  
>>> -#define MAX_VAR_NAME 31
>>> -#define MAX_NATIVE_VAR_NAME \
>>> -	(strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_") + \
>>> -		(MAX_VAR_NAME * MAX_UTF8_PER_UTF16))
>>> +#define PREFIX_LEN (strlen("efi_xxxxxxxx-xxxx-xxxx-xxxxxxxxxxxxxxxx_"))
>>>  
>>>  static int hex(int ch)
>>>  {
>>> @@ -101,18 +98,20 @@ static char *mem2hex(char *hexstr, const u8 *mem, int count)
>>>  	return hexstr;
>>>  }
>>>  
>>> -static efi_status_t efi_to_native(char *native, u16 *variable_name,
>>> +static efi_status_t efi_to_native(char **native, const u16 *variable_name,
>>>  				  efi_guid_t *vendor)
>>>  {
>>>  	size_t len;
>>> +	char *pos;
>>>  
>>> -	len = u16_strlen((u16 *)variable_name);
>>> -	if (len >= MAX_VAR_NAME)
>>> -		return EFI_DEVICE_ERROR;
>>> +	len = PREFIX_LEN + utf16_utf8_strlen(variable_name) + 1;
>>> +	*native = malloc(len);
>>> +	if (!*native)
>>> +		return EFI_OUT_OF_RESOURCES;
>>>  
>>> -	native += sprintf(native, "efi_%pUl_", vendor);
>>> -	native  = (char *)utf16_to_utf8((u8 *)native, (u16 *)variable_name, len);
>>> -	*native = '\0';
>>> +	pos = *native;
>>> +	pos += sprintf(pos, "efi_%pUl_", vendor);
>>> +	utf16_utf8_strcpy(&pos, variable_name);
>>>  
>>>  	return EFI_SUCCESS;
>>>  }
>>> @@ -168,7 +167,7 @@ efi_status_t EFIAPI efi_get_variable(u16 *variable_name, efi_guid_t *vendor,
>>>  				     u32 *attributes, efi_uintn_t *data_size,
>>>  				     void *data)
>>>  {
>>> -	char native_name[MAX_NATIVE_VAR_NAME + 1];
>>> +	char *native_name;
>>
>> I think you want to predefine this to = NULL to make sure that an error
>> path doesn't give you uninitialized values on free().
> 
> efi_to_native() returns EFI_OUT_OF_RESOURCES if the pointer cannot be
> assigned and the return value is checked. So how should I reach
> free(native_name) in this case?

True, convinced. I thought I saw a case where you could hit an
uninitialized native_name variable, but now I can't see it anymore.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-26 19:00     ` Heinrich Schuchardt
@ 2018-08-26 22:06       ` Alexander Graf
  2018-08-27  8:30         ` Mike FABIAN
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-26 22:06 UTC (permalink / raw
  To: u-boot



On 26.08.18 21:00, Heinrich Schuchardt wrote:
> On 08/26/2018 08:22 PM, Alexander Graf wrote:
>>
>>
>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>> This patch provides a define to initialize a table that maps lower to
>>> capital letters for Unicode code point 0x0000 - 0xffff.
>>>
>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>> ---
>>>  MAINTAINERS              |    1 +
>>>  include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 1910 insertions(+)
>>>  create mode 100644 include/capitalization.h
>>>
>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>> index a324139471..0a543309f2 100644
>>> --- a/MAINTAINERS
>>> +++ b/MAINTAINERS
>>> @@ -368,6 +368,7 @@ F:	doc/DocBook/efi.tmpl
>>>  F:	doc/README.uefi
>>>  F:	doc/README.iscsi
>>>  F:	Documentation/efi.rst
>>> +F:	include/capitalization.h
>>>  F:	include/efi*
>>>  F:	include/pe.h
>>>  F:	include/asm-generic/pe.h
>>> diff --git a/include/capitalization.h b/include/capitalization.h
>>> new file mode 100644
>>> index 0000000000..50d5108f98
>>> --- /dev/null
>>> +++ b/include/capitalization.h
>>> @@ -0,0 +1,1909 @@
>>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>>> +/*
>>> + * Correspondence table for small and capital Unicode letters in the range of
>>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>>> + */
>>> +
>>> +struct capitalization_table {
>>> +	u16 upper;
>>> +	u16 lower;
>>> +};
>>> +
>>> +#define UNICODE_CAPITALIZATION_TABLE { \
>>
>> Ugh, that is a *lot* of data. How much does the binary size grow with
>> the table compiled in?
>>
>> Is there any slightly more sophisticated pattern in the table maybe that
>> we could just express as code? Would that turn out smaller maybe?
> 
> This is 3792 bytes of data. Unicode capitalization is quite random in
> arranging lower and upper letters.
> 
> We could resort to zlib or gzip. But these libraries are not built by
> default.

Yeah, and that only adds more overhead.

> Most urgently we will need the capitalization table for generating and
> checking short FAT filenames, so we could create a configuration switch
> that would reduce this table to codepage 437 or codepage 1250 letters
> depending on the chosen native character set.

I think that's a great idea. There probably is a lot of overlap even
between the two, so maybe just make it a config option for "non-latin
upper/lower case conversion".

> In EDK2 I only found code for codepage 1250.

Yeah, I'd be surprised if people really needed more. In fact, how about
you just default the config option to =n?


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-26 22:06       ` Alexander Graf
@ 2018-08-27  8:30         ` Mike FABIAN
  2018-08-27  8:37           ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Mike FABIAN @ 2018-08-27  8:30 UTC (permalink / raw
  To: u-boot

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="windows-1254", Size: 2917 bytes --]

Alexander Graf <agraf@suse.de> さんは書きました:

> On 26.08.18 21:00, Heinrich Schuchardt wrote:
>> On 08/26/2018 08:22 PM, Alexander Graf wrote:
>>>
>>>
>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>> This patch provides a define to initialize a table that maps lower to
>>>> capital letters for Unicode code point 0x0000 - 0xffff.
>>>>
>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>> ---
>>>>  MAINTAINERS              |    1 +
>>>>  include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 1910 insertions(+)
>>>>  create mode 100644 include/capitalization.h
>>>>
>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>> index a324139471..0a543309f2 100644
>>>> --- a/MAINTAINERS
>>>> +++ b/MAINTAINERS
>>>> @@ -368,6 +368,7 @@ F:	doc/DocBook/efi.tmpl
>>>>  F:	doc/README.uefi
>>>>  F:	doc/README.iscsi
>>>>  F:	Documentation/efi.rst
>>>> +F:	include/capitalization.h
>>>>  F:	include/efi*
>>>>  F:	include/pe.h
>>>>  F:	include/asm-generic/pe.h
>>>> diff --git a/include/capitalization.h b/include/capitalization.h
>>>> new file mode 100644
>>>> index 0000000000..50d5108f98
>>>> --- /dev/null
>>>> +++ b/include/capitalization.h
>>>> @@ -0,0 +1,1909 @@
>>>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>>>> +/*
>>>> + * Correspondence table for small and capital Unicode letters in the range of
>>>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>>>> + */
>>>> +
>>>> +struct capitalization_table {
>>>> +	u16 upper;
>>>> +	u16 lower;
>>>> +};
>>>> +
>>>> +#define UNICODE_CAPITALIZATION_TABLE { \
>>>
>>> Ugh, that is a *lot* of data. How much does the binary size grow with
>>> the table compiled in?

That data is also in glibc. I don’t know whether you use glibc though
...

>>> Is there any slightly more sophisticated pattern in the table maybe that
>>> we could just express as code? Would that turn out smaller maybe?
>> 
>> This is 3792 bytes of data. Unicode capitalization is quite random in
>> arranging lower and upper letters.
>> 
>> We could resort to zlib or gzip. But these libraries are not built by
>> default.
>
> Yeah, and that only adds to more overhead.
>
>> Most urgently we will need the capitalization table for generating and
>> checking short FAT filenames, so we could create a configuration switch
>> that would reduce this table to codepage 437 or codepage 1250 letters
>> depending on the chosen native character set.
>
> I think that's a great idea. There probably is a lot of overlap even
> between the two, so maybe just make it a config option for "non-latin
> upper/lower case conversion".
>
>> In EDK2 I only found code for codepage 1250.
>
> Yeah, I'd be surprised if people really needed more. In fact, how about
> you just default the config option to =n by default?
>
>
> Alex
>

-- 
📧 Mike FABIAN   <mike.fabian@gmx.de>
睡眠不足はいい仕事の敵だ。

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-27  8:30         ` Mike FABIAN
@ 2018-08-27  8:37           ` Alexander Graf
  2018-08-30  2:51             ` Simon Glass
  0 siblings, 1 reply; 44+ messages in thread
From: Alexander Graf @ 2018-08-27  8:37 UTC (permalink / raw
  To: u-boot



> Am 27.08.2018 um 10:30 schrieb Mike FABIAN <maiku.fabian@gmail.com>:
> 
> Alexander Graf <agraf@suse.de> さんは書きました:
> 
>>> On 26.08.18 21:00, Heinrich Schuchardt wrote:
>>>> On 08/26/2018 08:22 PM, Alexander Graf wrote:
>>>> 
>>>> 
>>>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>>> This patch provides a define to initialize a table that maps lower to
>>>>> capital letters for Unicode code point 0x0000 - 0xffff.
>>>>> 
>>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>>> ---
>>>>> MAINTAINERS              |    1 +
>>>>> include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>>>> 2 files changed, 1910 insertions(+)
>>>>> create mode 100644 include/capitalization.h
>>>>> 
>>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>>> index a324139471..0a543309f2 100644
>>>>> --- a/MAINTAINERS
>>>>> +++ b/MAINTAINERS
>>>>> @@ -368,6 +368,7 @@ F:    doc/DocBook/efi.tmpl
>>>>> F:    doc/README.uefi
>>>>> F:    doc/README.iscsi
>>>>> F:    Documentation/efi.rst
>>>>> +F:    include/capitalization.h
>>>>> F:    include/efi*
>>>>> F:    include/pe.h
>>>>> F:    include/asm-generic/pe.h
>>>>> diff --git a/include/capitalization.h b/include/capitalization.h
>>>>> new file mode 100644
>>>>> index 0000000000..50d5108f98
>>>>> --- /dev/null
>>>>> +++ b/include/capitalization.h
>>>>> @@ -0,0 +1,1909 @@
>>>>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>>>>> +/*
>>>>> + * Correspondence table for small and capital Unicode letters in the range of
>>>>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>>>>> + */
>>>>> +
>>>>> +struct capitalization_table {
>>>>> +    u16 upper;
>>>>> +    u16 lower;
>>>>> +};
>>>>> +
>>>>> +#define UNICODE_CAPITALIZATION_TABLE { \
>>>> 
>>>> Ugh, that is a *lot* of data. How much does the binary size grow with
>>>> the table compiled in?
> 
> That data is also in glibc. I don’t know whether you use glibc though
> ...

U-Boot is a standalone OS, so to speak; we do not use glibc (except for sandbox, but that's a special target).

The main problem is that sometimes people cram U-Boot into very tight spaces, like small flash chips or even on-chip RAM. So we have to be very cautious about space requirements.

Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-27  8:37           ` Alexander Graf
@ 2018-08-30  2:51             ` Simon Glass
  2018-08-30  6:41               ` Alexander Graf
  0 siblings, 1 reply; 44+ messages in thread
From: Simon Glass @ 2018-08-30  2:51 UTC (permalink / raw
  To: u-boot

Hi,

On 27 August 2018 at 02:37, Alexander Graf <agraf@suse.de> wrote:
>
>
>> Am 27.08.2018 um 10:30 schrieb Mike FABIAN <maiku.fabian@gmail.com>:
>>
>> Alexander Graf <agraf@suse.de> さんは書きました:
>>
>>>> On 26.08.18 21:00, Heinrich Schuchardt wrote:
>>>>> On 08/26/2018 08:22 PM, Alexander Graf wrote:
>>>>>
>>>>>
>>>>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>>>> This patch provides a define to initialize a table that maps lower to
>>>>>> capital letters for Unicode code point 0x0000 - 0xffff.
>>>>>>
>>>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>>>> ---
>>>>>> MAINTAINERS              |    1 +
>>>>>> include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>>>>> 2 files changed, 1910 insertions(+)
>>>>>> create mode 100644 include/capitalization.h
>>>>>>
>>>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>>>> index a324139471..0a543309f2 100644
>>>>>> --- a/MAINTAINERS
>>>>>> +++ b/MAINTAINERS
>>>>>> @@ -368,6 +368,7 @@ F:    doc/DocBook/efi.tmpl
>>>>>> F:    doc/README.uefi
>>>>>> F:    doc/README.iscsi
>>>>>> F:    Documentation/efi.rst
>>>>>> +F:    include/capitalization.h
>>>>>> F:    include/efi*
>>>>>> F:    include/pe.h
>>>>>> F:    include/asm-generic/pe.h
>>>>>> diff --git a/include/capitalization.h b/include/capitalization.h
>>>>>> new file mode 100644
>>>>>> index 0000000000..50d5108f98
>>>>>> --- /dev/null
>>>>>> +++ b/include/capitalization.h
>>>>>> @@ -0,0 +1,1909 @@
>>>>>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>>>>>> +/*
>>>>>> + * Correspondence table for small and capital Unicode letters in the range of
>>>>>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>>>>>> + */
>>>>>> +
>>>>>> +struct capitalization_table {
>>>>>> +    u16 upper;
>>>>>> +    u16 lower;
>>>>>> +};
>>>>>> +
>>>>>> +#define UNICODE_CAPITALIZATION_TABLE { \
>>>>>
>>>>> Ugh, that is a *lot* of data. How much does the binary size grow with
>>>>> the table compiled in?
>>
>> That data is also in glibc. I don’t know whether you use glibc though
>> ...
>
> U-Boot is a standalone OS so to say, we do not use glibc (except for sandbox, but that's a special target).
>
> The main problem is that some times people cram U-Boot into very tight spaces, like small flash chips or even on-chip RAM. So we have to be very cautious on space requirements.

Indeed.

Most archs use a private libc, so they do not build with one that supports this table.

Wouldn't it be better to put this in a C file as a const struct?

If it is big that's OK, we just need to add a CONFIG option for it. If
we already have EFI enabled, perhaps it doesn't matter, since it is
pretty big.

Regards,
Simon

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [U-Boot] [PATCH 11/15] efi_loader: capitalization table
  2018-08-30  2:51             ` Simon Glass
@ 2018-08-30  6:41               ` Alexander Graf
  0 siblings, 0 replies; 44+ messages in thread
From: Alexander Graf @ 2018-08-30  6:41 UTC (permalink / raw
  To: u-boot



On 30.08.18 04:51, Simon Glass wrote:
> Hi,
> 
> On 27 August 2018 at 02:37, Alexander Graf <agraf@suse.de> wrote:
>>
>>
>>> Am 27.08.2018 um 10:30 schrieb Mike FABIAN <maiku.fabian@gmail.com>:
>>>
>>> Alexander Graf <agraf@suse.de> さんは書きました:
>>>
>>>>> On 26.08.18 21:00, Heinrich Schuchardt wrote:
>>>>>> On 08/26/2018 08:22 PM, Alexander Graf wrote:
>>>>>>
>>>>>>
>>>>>>> On 11.08.18 17:28, Heinrich Schuchardt wrote:
>>>>>>> This patch provides a define to initialize a table that maps lower to
>>>>>>> capital letters for Unicode code point 0x0000 - 0xffff.
>>>>>>>
>>>>>>> Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
>>>>>>> ---
>>>>>>> MAINTAINERS              |    1 +
>>>>>>> include/capitalization.h | 1909 ++++++++++++++++++++++++++++++++++++++
>>>>>>> 2 files changed, 1910 insertions(+)
>>>>>>> create mode 100644 include/capitalization.h
>>>>>>>
>>>>>>> diff --git a/MAINTAINERS b/MAINTAINERS
>>>>>>> index a324139471..0a543309f2 100644
>>>>>>> --- a/MAINTAINERS
>>>>>>> +++ b/MAINTAINERS
>>>>>>> @@ -368,6 +368,7 @@ F:    doc/DocBook/efi.tmpl
>>>>>>> F:    doc/README.uefi
>>>>>>> F:    doc/README.iscsi
>>>>>>> F:    Documentation/efi.rst
>>>>>>> +F:    include/capitalization.h
>>>>>>> F:    include/efi*
>>>>>>> F:    include/pe.h
>>>>>>> F:    include/asm-generic/pe.h
>>>>>>> diff --git a/include/capitalization.h b/include/capitalization.h
>>>>>>> new file mode 100644
>>>>>>> index 0000000000..50d5108f98
>>>>>>> --- /dev/null
>>>>>>> +++ b/include/capitalization.h
>>>>>>> @@ -0,0 +1,1909 @@
>>>>>>> +/* SPDX-License-Identifier: Unicode-DFS-2016 */
>>>>>>> +/*
>>>>>>> + * Correspondence table for small and capital Unicode letters in the range of
>>>>>>> + * 0x0000 - 0xffff based on http://www.unicode.org/Public/UCA/11.0.0/allkeys.txt
>>>>>>> + */
>>>>>>> +
>>>>>>> +struct capitalization_table {
>>>>>>> +    u16 upper;
>>>>>>> +    u16 lower;
>>>>>>> +};
>>>>>>> +
>>>>>>> +#define UNICODE_CAPITALIZATION_TABLE { \
>>>>>>
>>>>>> Ugh, that is a *lot* of data. How much does the binary size grow with
>>>>>> the table compiled in?
>>>
>>> That data is also in glibc. I don’t know whether you use glibc though
>>> ...
>>
>> U-Boot is a standalone OS so to say, we do not use glibc (except for sandbox, but that's a special target).
>>
>> The main problem is that some times people cram U-Boot into very tight spaces, like small flash chips or even on-chip RAM. So we have to be very cautious on space requirements.
> 
> Indeed.
> 
> Most archs use a private glib so do not build with one that supports this table.
> 
> Wouldn't it be better to put this in a C file as a const struct?
> 
> If it is big that's OK, we just need to add a CONFIG option for it. If
> we already have EFI enabled, perhaps it doesn't matter, since it is
> pretty big.


When EFI support started, it was about 10kb. I assume it has grown since,
so maybe we're at 15kb by now. Adding another 4kb for the upper/lower
translation is *massive* by comparison.

Edk2 only supports upper/lower conversion for English anyway, so we're
not worse off if we don't support all the fancy unicode conversions.


Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2018-08-30  6:41 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-08-11 15:28 [U-Boot] [PATCH 00/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 01/15] lib: build charset.o only if needed Heinrich Schuchardt
2018-08-26 17:45   ` Alexander Graf
2018-08-26 18:06     ` Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 02/15] efi_loader: rename utf16_strlen, utf16_strnlen Heinrich Schuchardt
2018-08-26 17:52   ` Alexander Graf
2018-08-26 18:21     ` Heinrich Schuchardt
2018-08-26 18:33       ` Alexander Graf
2018-08-26 19:36         ` Heinrich Schuchardt
2018-08-26 21:57           ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 03/15] lib: charset: utility functions for Unicode Heinrich Schuchardt
2018-08-26 17:59   ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 04/15] test: unit tests for Unicode functions Heinrich Schuchardt
2018-08-26 18:02   ` Alexander Graf
2018-08-26 18:27     ` Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 05/15] lib: vsprintf: correct printing of Unicode strings Heinrich Schuchardt
2018-08-26 18:05   ` Alexander Graf
2018-08-26 18:34     ` Heinrich Schuchardt
2018-08-26 22:01       ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 06/15] test: test printing Unicode Heinrich Schuchardt
2018-08-26 18:06   ` Alexander Graf
2018-08-26 18:36     ` Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 07/15] efi_loader: remove limit on variable length Heinrich Schuchardt
2018-08-26 18:13   ` Alexander Graf
2018-08-26 18:40     ` Heinrich Schuchardt
2018-08-26 22:04       ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 08/15] efi_loader: don't use unlimited stack as buffer Heinrich Schuchardt
2018-08-26 18:16   ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 09/15] efi_loader: buffer size for load options Heinrich Schuchardt
2018-08-26 18:17   ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 10/15] lib: charset: remove obsolete functions Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 11/15] efi_loader: capitalization table Heinrich Schuchardt
2018-08-26 18:22   ` Alexander Graf
2018-08-26 19:00     ` Heinrich Schuchardt
2018-08-26 22:06       ` Alexander Graf
2018-08-27  8:30         ` Mike FABIAN
2018-08-27  8:37           ` Alexander Graf
2018-08-30  2:51             ` Simon Glass
2018-08-30  6:41               ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 12/15] lib: charset: upper/lower case conversion Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 13/15] test: tests for utf_to_lower() utf_to_upper() Heinrich Schuchardt
2018-08-11 15:28 ` [U-Boot] [PATCH 14/15] efi_loader: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt
2018-08-26 18:31   ` Alexander Graf
2018-08-11 15:28 ` [U-Boot] [PATCH 15/15] efi_selftest: EFI_UNICODE_COLLATION_PROTOCOL Heinrich Schuchardt

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.