lib: charset: utility functions for Unicode

author Heinrich Schuchardt <xypron.glpk@gmx.de>

Fri, 31 Aug 2018 19:31:27 +0000 (21:31 +0200)

committer Alexander Graf <agraf@suse.de>

Sun, 23 Sep 2018 19:55:29 +0000 (21:55 +0200)
author Heinrich Schuchardt <xypron.glpk@gmx.de>
Fri, 31 Aug 2018 19:31:27 +0000 (21:31 +0200)
committer Alexander Graf <agraf@suse.de>
Sun, 23 Sep 2018 19:55:29 +0000 (21:55 +0200)
diff --git a/include/charset.h b/include/charset.h

index 2c6deb8034fd63a0c6fae15a8f4917201a048835..cf41eb5e5fb290eb3d50520ac4e037a820524751 100644 (file)
--- a/include/charset.h
+++ b/include/charset.h
@@ -8,10 +8,140 @@
  #ifndef __CHARSET_H_
  #define __CHARSET_H_
  
+#include <linux/kernel.h>
  #include <linux/types.h>
  
  #define MAX_UTF8_PER_UTF16 3
  
+/**
+ * utf8_get() - get next UTF-8 code point from buffer
+ *
+ * @src:               pointer to current byte, updated to point to next byte
+ * Return:             code point, or 0 for end of string, or -1 if no legal
+ *                     code point is found. In case of an error src points to
+ *                     the incorrect byte.
+ */
+s32 utf8_get(const char **src);
+
+/**
+ * utf8_put() - write UTF-8 code point to buffer
+ *
+ * @code:              code point
+ * @dst:               pointer to destination buffer, updated to next position
+ * Return:             -1 if the input parameters are invalid
+ */
+int utf8_put(s32 code, char **dst);
+
+/**
+ * utf8_utf16_strnlen() - length of a truncated utf-8 string after conversion
+ *                       to utf-16
+ *
+ * @src:               utf-8 string
+ * @count:             maximum number of code points to convert
+ * Return:             length in u16 words after conversion to utf-16 without
+ *                     the trailing \0. If an invalid UTF-8 sequence is hit one
+ *                     word will be reserved for a replacement character.
+ */
+size_t utf8_utf16_strnlen(const char *src, size_t count);
+
+/**
+ * utf8_utf16_strlen() - length of a utf-8 string after conversion to utf-16
+ *
+ * @src:               utf-8 string
+ * Return:             length in u16 words after conversion to utf-16 without
+ *                     the trailing \0. If an invalid UTF-8 sequence is hit one
+ *                     word will be reserved for a replacement character.
+ */
+#define utf8_utf16_strlen(a) utf8_utf16_strnlen((a), SIZE_MAX)
+
+/**
+ * utf8_utf16_strncpy() - copy utf-8 string to utf-16 string
+ *
+ * @dst:               destination buffer
+ * @src:               source buffer
+ * @count:             maximum number of code points to copy
+ * Return:             -1 if the input parameters are invalid
+ */
+int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count);
+
+/**
+ * utf8_utf16_strcpy() - copy utf-8 string to utf-16 string
+ *
+ * @dst:               destination buffer
+ * @src:               source buffer
+ * Return:             -1 if the input parameters are invalid
+ */
+#define utf8_utf16_strcpy(d, s) utf8_utf16_strncpy((d), (s), SIZE_MAX)
+
+/**
+ * utf16_get() - get next UTF-16 code point from buffer
+ *
+ * @src:               pointer to current word, updated to point to next word
+ * Return:             code point, or 0 for end of string, or -1 if no legal
+ *                     code point is found. In case of an error src points to
+ *                     the incorrect word.
+ */
+s32 utf16_get(const u16 **src);
+
+/**
+ * utf16_put() - write UTF-16 code point to buffer
+ *
+ * @code:              code point
+ * @dst:               pointer to destination buffer, updated to next position
+ * Return:             -1 if the input parameters are invalid
+ */
+int utf16_put(s32 code, u16 **dst);
+
+/**
+ * utf16_strnlen() - length of a truncated utf-16 string
+ *
+ * @src:               utf-16 string
+ * @count:             maximum number of code points to convert
+ * Return:             length in code points. If an invalid UTF-16 sequence is
+ *                     hit one position will be reserved for a replacement
+ *                     character.
+ */
+size_t utf16_strnlen(const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strnlen() - length of a truncated utf-16 string after conversion
+ *                       to utf-8
+ *
+ * @src:               utf-16 string
+ * @count:             maximum number of code points to convert
+ * Return:             length in bytes after conversion to utf-8 without the
+ *                     trailing \0. If an invalid UTF-16 sequence is hit one
+ *                     byte will be reserved for a replacement character.
+ */
+size_t utf16_utf8_strnlen(const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strlen() - length of a utf-16 string after conversion to utf-8
+ *
+ * @src:               utf-16 string
+ * Return:             length in bytes after conversion to utf-8 without the
+ *                     trailing \0. If an invalid UTF-16 sequence is hit one
+ *                     byte will be reserved for a replacement character.
+ */
+#define utf16_utf8_strlen(a) utf16_utf8_strnlen((a), SIZE_MAX)
+
+/**
+ * utf16_utf8_strncpy() - copy utf-16 string to utf-8 string
+ *
+ * @dst:               destination buffer
+ * @src:               source buffer
+ * @count:             maximum number of code points to copy
+ * Return:             -1 if the input parameters are invalid
+ */
+int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count);
+
+/**
+ * utf16_utf8_strcpy() - copy utf-16 string to utf-8 string
+ *
+ * @dst:               destination buffer
+ * @src:               source buffer
+ * Return:             -1 if the input parameters are invalid
+ */
+#define utf16_utf8_strcpy(d, s) utf16_utf8_strncpy((d), (s), SIZE_MAX)
+
  /**
   * u16_strlen - count non-zero words
   *
diff --git a/lib/charset.c b/lib/charset.c

index 8ff8d59957d6502cf15fd646d4b6a5e32b4b2b69..e82622a7f8797d7adaddd7b19e1de03315fe4df4 100644 (file)
--- a/lib/charset.c
+++ b/lib/charset.c
@@ -8,9 +8,239 @@
  #include <charset.h>
  #include <malloc.h>
  
-/*
- * utf8/utf16 conversion mostly lifted from grub
- */
+/**
+ * utf8_get() - get next UTF-8 code point from buffer
+ *
+ * Lead bytes outside 0xc2 - 0xf4, misplaced continuation bytes, overlong
+ * encodings, encoded surrogates (0xD800 - 0xDFFF) and values above 0x10ffff
+ * are rejected.
+ *
+ * @src:       pointer to current byte, updated to point to next byte
+ * Return:     code point, or 0 for end of string, or -1 if no legal code
+ *             point is found. If the current byte is not \0, at least that
+ *             byte is consumed even in case of an error, so that callers
+ *             looping over a string always make progress.
+ */
+s32 utf8_get(const char **src)
+{
+       s32 code = 0;
+       /* Smallest code point that needs a sequence of the detected length */
+       s32 min_code = 0;
+       unsigned char c;
+
+       if (!src || !*src)
+               return -1;
+       if (!**src)
+               return 0;
+       c = **src;
+       if (c >= 0x80) {
+               /* Consume the lead byte first so errors never stall callers */
+               ++*src;
+               if (!**src)
+                       return -1;
+               /*
+                * We do not expect a continuation byte (0x80 - 0xbf).
+                * 0x80 is coded as 0xc2 0x80, so we cannot have less than 0xc2
+                * here.
+                * The highest code point is 0x10ffff which is coded as
+                * 0xf4 0x8f 0xbf 0xbf. So we cannot have a byte above 0xf4.
+                */
+               if (c < 0xc2 || c > 0xf4)
+                       return -1;
+               min_code = 0x80;
+               if (c >= 0xe0) {
+                       if (c >= 0xf0) {
+                               /* 0xf0 - 0xf4 */
+                               min_code = 0x10000;
+                               c &= 0x07;
+                               code = c << 18;
+                               c = **src;
+                               ++*src;
+                               if (!**src)
+                                       return -1;
+                               if (c < 0x80 || c > 0xbf)
+                                       return -1;
+                               c &= 0x3f;
+                       } else {
+                               /* 0xe0 - 0xef */
+                               min_code = 0x800;
+                               c &= 0x0f;
+                       }
+                       code += c << 12;
+                       /* Only 0xf4 followed by 0x90 or above gets here */
+                       if (code >= 0x110000)
+                               return -1;
+                       c = **src;
+                       ++*src;
+                       if (!**src)
+                               return -1;
+                       if (c < 0x80 || c > 0xbf)
+                               return -1;
+               }
+               /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
+               c &= 0x3f;
+               code += c << 6;
+               c = **src;
+               if (c < 0x80 || c > 0xbf)
+                       return -1;
+               c &= 0x3f;
+       }
+       code += c;
+       ++*src;
+       /*
+        * Overlong encodings and surrogates can only be detected on the
+        * complete value. The whole sequence has been consumed at this point.
+        */
+       if (code < min_code || (code >= 0xD800 && code <= 0xDFFF))
+               return -1;
+       return code;
+}
+
+/**
+ * utf8_put() - write UTF-8 code point to buffer
+ *
+ * Between one and four bytes are written. No trailing \0 is added and the
+ * size of the destination buffer is not checked.
+ *
+ * @code:      code point
+ * @dst:       pointer to destination buffer, updated to next position
+ * Return:     0 on success, -1 if the input parameters are invalid. Nothing
+ *             is written and dst is not changed in case of an error.
+ */
+int utf8_put(s32 code, char **dst)
+{
+       if (!dst || !*dst)
+               return -1;
+       /* Negative values, surrogates and values above 0x10ffff are illegal */
+       if (code < 0 || code >= 0x110000 ||
+           (code >= 0xD800 && code <= 0xDFFF))
+               return -1;
+       if (code <= 0x007F) {
+               **dst = code;
+       } else {
+               if (code <= 0x07FF) {
+                       /* Two byte sequence: lead byte 0xc0 - 0xdf */
+                       **dst = code >> 6 | 0xC0;
+               } else {
+                       if (code < 0x10000) {
+                               /* Three byte sequence: lead byte 0xe0 - 0xef */
+                               **dst = code >> 12 | 0xE0;
+                       } else {
+                               /* Four byte sequence: lead byte 0xf0 - 0xf4 */
+                               **dst = code >> 18 | 0xF0;
+                               ++*dst;
+                               **dst = (code >> 12 & 0x3F) | 0x80;
+                       }
+                       ++*dst;
+                       **dst = (code >> 6 & 0x3F) | 0x80;
+               }
+               /* Last continuation byte carries the lowest six bits */
+               ++*dst;
+               **dst = (code & 0x3F) | 0x80;
+       }
+       ++*dst;
+       return 0;
+}
+
+/*
+ * Count the u16 words needed for the UTF-16 form of at most @count code
+ * points of the UTF-8 string @src. The terminating word is not counted.
+ */
+size_t utf8_utf16_strnlen(const char *src, size_t count)
+{
+       size_t words = 0;
+
+       while (*src && count) {
+               s32 cp = utf8_get(&src);
+
+               if (cp == 0)
+                       break;
+               /*
+                * Only code points beyond the basic multilingual plane need a
+                * surrogate pair. An error (negative value) is counted as one
+                * word for the replacement character.
+                */
+               words += cp >= 0x10000 ? 2 : 1;
+               --count;
+       }
+       return words;
+}
+
+/*
+ * Convert at most @count code points of the UTF-8 string @src to UTF-16,
+ * writing them through @dst and terminating the output with a zero word.
+ * Undecodable input is replaced by '?'. *dst ends up at the terminator.
+ * Returns -1 if a pointer argument is NULL, 0 otherwise.
+ */
+int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
+{
+       if (!dst || !*dst || !src)
+               return -1;
+
+       while (count && *src) {
+               s32 cp = utf8_get(&src);
+
+               utf16_put(cp < 0 ? '?' : cp, dst);
+               --count;
+       }
+       **dst = 0;
+       return 0;
+}
+
+/**
+ * utf16_get() - get next UTF-16 code point from buffer
+ *
+ * @src:       pointer to current word, updated to point to next word
+ * Return:     code point, or 0 for end of string, or -1 if no legal code
+ *             point is found. If the current word is not 0, it is consumed
+ *             even in case of an error. A word following a high surrogate
+ *             is consumed too, whether or not it is a low surrogate.
+ */
+s32 utf16_get(const u16 **src)
+{
+       s32 code, code2;
+
+       if (!src || !*src)
+               return -1;
+       if (!**src)
+               return 0;
+       code = **src;
+       ++*src;
+       /* A low surrogate must not start a sequence */
+       if (code >= 0xDC00 && code <= 0xDFFF)
+               return -1;
+       if (code >= 0xD800 && code <= 0xDBFF) {
+               /* High surrogate: a low surrogate has to follow */
+               if (!**src)
+                       return -1;
+               code &= 0x3ff;
+               code <<= 10;
+               code += 0x10000;
+               code2 = **src;
+               ++*src;
+               /*
+                * The bounds 0xDC00 and 0xDFFF are valid low surrogates
+                * themselves, e.g. 0x10000 is coded as 0xD800 0xDC00.
+                */
+               if (code2 < 0xDC00 || code2 > 0xDFFF)
+                       return -1;
+               code2 &= 0x3ff;
+               code += code2;
+       }
+       return code;
+}
+
+/**
+ * utf16_put() - write UTF-16 code point to buffer
+ *
+ * One word is written for code points below 0x10000, a surrogate pair
+ * otherwise. No trailing zero word is added and the size of the destination
+ * buffer is not checked.
+ *
+ * @code:      code point
+ * @dst:       pointer to destination buffer, updated to next position
+ * Return:     0 on success, -1 if the input parameters are invalid. Nothing
+ *             is written and dst is not changed in case of an error.
+ */
+int utf16_put(s32 code, u16 **dst)
+{
+       if (!dst || !*dst)
+               return -1;
+       /* Negative values, surrogates and values above 0x10ffff are illegal */
+       if (code < 0 || code >= 0x110000 ||
+           (code >= 0xD800 && code <= 0xDFFF))
+               return -1;
+       if (code < 0x10000) {
+               **dst = code;
+       } else {
+               /* Split the 20 bit offset into high and low surrogate */
+               code -= 0x10000;
+               **dst = code >> 10 | 0xD800;
+               ++*dst;
+               **dst = (code & 0x3ff) | 0xDC00;
+       }
+       ++*dst;
+       return 0;
+}
+
+/*
+ * Count the code points of the UTF-16 string @src, looking at no more than
+ * @count of them. An illegal sequence counts as one code point so that space
+ * for a replacement character is reserved.
+ */
+size_t utf16_strnlen(const u16 *src, size_t count)
+{
+       size_t points = 0;
+
+       while (*src && count) {
+               if (!utf16_get(&src))
+                       break;
+               ++points;
+               --count;
+       }
+       return points;
+}
+
+/*
+ * Count the bytes needed for the UTF-8 form of at most @count code points
+ * of the UTF-16 string @src. The terminating \0 is not counted.
+ */
+size_t utf16_utf8_strnlen(const u16 *src, size_t count)
+{
+       size_t bytes = 0;
+
+       while (*src && count) {
+               s32 cp = utf16_get(&src);
+
+               if (cp == 0)
+                       break;
+               if (cp < 0x80) {
+                       /*
+                        * Single byte. This includes the negative error
+                        * value, for which a replacement character is
+                        * reserved.
+                        */
+                       bytes += 1;
+               } else if (cp < 0x800) {
+                       bytes += 2;
+               } else if (cp < 0x10000) {
+                       bytes += 3;
+               } else {
+                       bytes += 4;
+               }
+               --count;
+       }
+       return bytes;
+}
+
+/*
+ * Convert at most @count code points of the UTF-16 string @src to UTF-8,
+ * writing them through @dst and terminating the output with \0.
+ * Undecodable input is replaced by '?'. *dst ends up at the terminator.
+ * Returns -1 if a pointer argument is NULL, 0 otherwise.
+ */
+int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
+{
+       if (!dst || !*dst || !src)
+               return -1;
+
+       while (count && *src) {
+               s32 cp = utf16_get(&src);
+
+               utf8_put(cp < 0 ? '?' : cp, dst);
+               --count;
+       }
+       **dst = 0;
+       return 0;
+}
+
  
  size_t u16_strlen(const u16 *in)
  {
author	Heinrich Schuchardt <xypron.glpk@gmx.de>
	Fri, 31 Aug 2018 19:31:27 +0000 (21:31 +0200)
committer	Alexander Graf <agraf@suse.de>
	Sun, 23 Sep 2018 19:55:29 +0000 (21:55 +0200)
include/charset.h		patch \| blob \| history
lib/charset.c		patch \| blob \| history