]> git.dujemihanovic.xyz Git - u-boot.git/commitdiff
lib/charset: UTF-8 stream conversion
authorHeinrich Schuchardt <xypron.glpk@gmx.de>
Sat, 27 Feb 2021 13:08:38 +0000 (14:08 +0100)
committerHeinrich Schuchardt <xypron.glpk@gmx.de>
Sun, 7 Mar 2021 16:37:13 +0000 (17:37 +0100)
Provide functions to convert a UTF-8 stream to code page 437 or UTF-32.

Add unit tests.

Signed-off-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
include/charset.h
lib/charset.c
test/unicode_ut.c

index 52e7d1474eba6f7f7643f340e79ba0fead68605a..a911160f192ed95b41e7ad949401ee035f00080e 100644 (file)
@@ -286,4 +286,22 @@ uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size);
  */
 int utf_to_cp(s32 *c, const u16 *codepage);
 
+/**
+ * utf8_to_cp437_stream() - convert UTF-8 stream to codepage 437
+ *
+ * @c:         next byte of the UTF-8 stream to convert
+ * @buffer:    buffer, at least 5 characters
+ * Return:     next codepage 437 character or 0
+ */
+int utf8_to_cp437_stream(u8 c, char *buffer);
+
+/**
+ * utf8_to_utf32_stream() - convert UTF-8 stream to UTF-32
+ *
+ * @c:         next byte of the UTF-8 stream to convert
+ * @buffer:    buffer, at least 5 characters
+ * Return:     next UTF-32 character or 0
+ */
+int utf8_to_utf32_stream(u8 c, char *buffer);
+
 #endif /* __CHARSET_H_ */
index 946d5ee23eb7e86034a1448b3003b1edd122f800..f44c58d9d81a29517e5092c795257d55fe88830b 100644 (file)
@@ -481,15 +481,6 @@ uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
        return dest;
 }
 
-/**
- * utf_to_cp() - translate Unicode code point to 8bit codepage
- *
- * Codepoints that do not exist in the codepage are rendered as question mark.
- *
- * @c:         pointer to Unicode code point to be translated
- * @codepage:  Unicode to codepage translation table
- * Return:     0 on success, -ENOENT if codepoint cannot be translated
- */
 int utf_to_cp(s32 *c, const u16 *codepage)
 {
        if (*c >= 0x80) {
@@ -507,3 +498,49 @@ int utf_to_cp(s32 *c, const u16 *codepage)
        }
        return 0;
 }
+
+/*
+ * Feed one byte of a UTF-8 stream to the code page 437 converter.
+ *
+ * @buffer is used as a NUL-terminated string holding the bytes that have been
+ * received but not yet converted; it has to be empty on the first call.
+ *
+ * NOTE(review): a NUL byte in @c can never move @pos up to @end if utf8_get()
+ * does not step over the terminator, so the loop would not end - confirm that
+ * callers never pass 0.
+ */
+int utf8_to_cp437_stream(u8 c, char *buffer)
+{
+       char *end;
+       const char *pos;
+       s32 s;
+
+       for (;;) {
+               /* Append the new byte to the bytes kept from earlier calls */
+               pos = buffer;
+               end = buffer + strlen(buffer);
+               *end++ = c;
+               *end = 0;
+               s = utf8_get(&pos);
+               if (s > 0) {
+                       /* A character was decoded, start afresh next time */
+                       *buffer = 0;
+                       /*
+                        * The return value is ignored on purpose: code
+                        * points missing in the code page are rendered
+                        * as question mark.
+                        */
+                       utf_to_cp(&s, codepage_437);
+                       return s;
+               }
+               /* Everything was consumed without a result: wait for more */
+               if (pos == end)
+                       return 0;
+               /* Drop the pending bytes and retry with @c on its own */
+               *buffer = 0;
+       }
+}
+
+/*
+ * Feed one byte of a UTF-8 stream to the UTF-32 converter.
+ *
+ * @buffer is used as a NUL-terminated string holding the bytes that have been
+ * received but not yet converted; it has to be empty on the first call.
+ */
+int utf8_to_utf32_stream(u8 c, char *buffer)
+{
+       const char *cursor;
+       char *tail;
+       s32 code;
+
+       while (1) {
+               /* Append @c to the pending bytes, keeping the terminator */
+               tail = buffer + strlen(buffer);
+               tail[0] = c;
+               tail[1] = 0;
+               ++tail;
+
+               cursor = buffer;
+               code = utf8_get(&cursor);
+
+               /* Everything was consumed without a result: wait for more */
+               if (code <= 0 && cursor == tail)
+                       return 0;
+
+               /* Either a character is complete or the bytes are dropped */
+               *buffer = 0;
+               if (code > 0)
+                       return code;
+               /* Otherwise retry with @c on its own */
+       }
+}
index 154361aea7da5c64d2526866d20d4da7386e24c6..6f6aea5f602f0114b4106340fbad477c3824c478 100644 (file)
@@ -47,6 +47,9 @@ static const char d3[] = {0xe6, 0xbd, 0x9c, 0xe6, 0xb0, 0xb4, 0xe8, 0x89,
 /* Three letters translating to two utf-16 word each */
 static const char d4[] = {0xf0, 0x90, 0x92, 0x8d, 0xf0, 0x90, 0x92, 0x96,
                          0xf0, 0x90, 0x92, 0x87, 0x00};
+/* Starts with U+0392 (UTF-8: 0xCE 0x92), a letter not in code page 437 */
+static const char d5[] = {0xCE, 0x92, 0x20, 0x69, 0x73, 0x20, 0x6E, 0x6F,
+                         0x74, 0x20, 0x42, 0x00};
 
 /* Illegal utf-8 strings */
 static const char j1[] = {0x6a, 0x31, 0xa1, 0x6c, 0x00};
@@ -631,6 +634,81 @@ static int unicode_test_utf_to_cp(struct unit_test_state *uts)
 }
 UNICODE_TEST(unicode_test_utf_to_cp);
 
+/*
+ * Pass the NUL-terminated string @in byte by byte to utf8_to_cp437_stream()
+ * and collect every non-zero result in @out as a NUL-terminated string.
+ */
+static void utf8_to_cp437_stream_helper(const char *in, char *out)
+{
+       char pending[5];
+       int ch;
+
+       /* The converter expects an empty buffer on the first call */
+       pending[0] = 0;
+       while (*in) {
+               ch = utf8_to_cp437_stream(*in++, pending);
+               if (!ch)
+                       continue;
+               *out = ch;
+               out++;
+       }
+       *out = 0;
+}
+
+/*
+ * Stream test strings byte by byte through utf8_to_cp437_stream() and
+ * compare the collected output with the expected code page 437 strings.
+ */
+static int unicode_test_utf8_to_cp437_stream(struct unit_test_state *uts)
+{
+       char buf[16];
+
+       utf8_to_cp437_stream_helper(d1, buf);
+       ut_asserteq_str("U-Boot", buf);
+       /* The expected result contains the non-ASCII byte 0xa0 */
+       utf8_to_cp437_stream_helper(d2, buf);
+       ut_asserteq_str("kafb\xa0tur", buf);
+       /* d5 starts with a letter missing in code page 437: expect '?' */
+       utf8_to_cp437_stream_helper(d5, buf);
+       ut_asserteq_str("? is not B", buf);
+       /* NOTE(review): j2 is presumably one of the illegal UTF-8 strings */
+       utf8_to_cp437_stream_helper(j2, buf);
+       ut_asserteq_str("j2l", buf);
+
+       return 0;
+}
+UNICODE_TEST(unicode_test_utf8_to_cp437_stream);
+
+/*
+ * Pass the NUL-terminated string @in byte by byte to utf8_to_utf32_stream()
+ * and collect every non-zero result in @out, followed by a terminating 0.
+ */
+static void utf8_to_utf32_stream_helper(const char *in, s32 *out)
+{
+       char pending[5];
+       int code;
+
+       /* The converter expects an empty buffer on the first call */
+       pending[0] = 0;
+       while (*in) {
+               code = utf8_to_utf32_stream(*in++, pending);
+               if (!code)
+                       continue;
+               *out = code;
+               out++;
+       }
+       *out = 0;
+}
+
+/*
+ * Stream test strings byte by byte through utf8_to_utf32_stream() and
+ * compare the collected output with the expected UTF-32 code points.
+ */
+static int unicode_test_utf8_to_utf32_stream(struct unit_test_state *uts)
+{
+       s32 buf[16];
+
+       /* Expected code points, each array including the terminating 0 */
+       const u32 u1[] = {0x55, 0x2D, 0x42, 0x6F, 0x6F, 0x74, 0x0000};
+       const u32 u2[] = {0x6B, 0x61, 0x66, 0x62, 0xE1, 0x74, 0x75, 0x72, 0x00};
+       const u32 u3[] = {0x0392, 0x20, 0x69, 0x73, 0x20, 0x6E, 0x6F, 0x74,
+                         0x20, 0x42, 0x00};
+       const u32 u4[] = {0x6A, 0x32, 0x6C, 0x00};
+
+       /* buf is cleared before every run so that no stale data remains */
+       memset(buf, 0, sizeof(buf));
+       utf8_to_utf32_stream_helper(d1, buf);
+       ut_asserteq_mem(u1, buf, sizeof(u1));
+
+       memset(buf, 0, sizeof(buf));
+       utf8_to_utf32_stream_helper(d2, buf);
+       ut_asserteq_mem(u2, buf, sizeof(u2));
+
+       /* The first letter of d5 is expected as code point 0x0392 */
+       memset(buf, 0, sizeof(buf));
+       utf8_to_utf32_stream_helper(d5, buf);
+       ut_asserteq_mem(u3, buf, sizeof(u3));
+
+       /* NOTE(review): j2 is presumably one of the illegal UTF-8 strings */
+       memset(buf, 0, sizeof(buf));
+       utf8_to_utf32_stream_helper(j2, buf);
+       ut_asserteq_mem(u4, buf, sizeof(u4));
+
+       return 0;
+}
+UNICODE_TEST(unicode_test_utf8_to_utf32_stream);
+
 #ifdef CONFIG_EFI_LOADER
 static int unicode_test_efi_create_indexed_name(struct unit_test_state *uts)
 {