src/test-unicode.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71

#define BOOST_TEST_MODULE unicode_test

#include <boost/test/included/unit_test.hpp>

#include <string>
#include <tuple>
#include <type_traits>

#include <unicode.h>

std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> t {
 u8"Täst", u"Täst", U"Täst"
};

template<size_t i = 0, size_t j = 0, typename... Ts>
void test_utf_to_utf(std::tuple<Ts...>& t)
{
 typedef typename std::tuple_element<i,typename std::remove_reference<decltype(t)>::type>::type From;
 typedef typename std::tuple_element<j,typename std::remove_reference<decltype(t)>::type>::type To;

 // test
 To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) };

 BOOST_CHECK(std::get<j>(t) == result);

 //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl;

 // iterate over other combinations
 if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
  test_utf_to_utf<i + 1, j>(t);
 else if constexpr (j + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value)
  test_utf_to_utf<0, j + 1>(t);
}

// Runs the generic conversion check over the sample tuple `t`.
BOOST_AUTO_TEST_CASE(utf_to_utf)
{
 // Begin at the first (source, target) index pair; the helper recurses
 // through the remaining pairs on its own.
 test_utf_to_utf<0, 0>(t);
}

// Converts an ASCII-only UTF-8 string and expects the same text as UTF-16.
BOOST_AUTO_TEST_CASE(utf8_to_utf16)
{
 std::u8string source{u8"ascii string1"};
 const std::u16string expected{u"ascii string1"};

 std::u16string converted{unicode::utf_to_utf<char8_t, char16_t>(source)};

 BOOST_CHECK(converted == expected);
}

// Converts an ASCII-only UTF-16 string and expects the same text as UTF-8.
BOOST_AUTO_TEST_CASE(utf16_to_utf8)
{
 std::u16string source{u"ascii string1"};
 const std::u8string expected{u8"ascii string1"};

 std::u8string converted{unicode::utf_to_utf<char16_t, char8_t>(source)};

 BOOST_CHECK(converted == expected);
}

// TODO:
// UTF-8
//  invalid bytes
//  an unexpected continuation byte
//  a non-continuation byte before the end of the character
//  the string ending before the end of the character (which can happen in simple string truncation)
//  an overlong encoding
//  a sequence that decodes to an invalid code point
//
//  high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
//
// char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux)
// string, vector?
// uint8_t, uint16_t, uint32_t?