// Boost.Test module "unicode_test": unit tests for unicode::convert(),
// unicode::is_valid_utf() and unicode::is_valid_unicode().
//
// Overview of what the test cases below do:
//  - success_sets holds the same text as UTF-8, UTF-16 and UTF-32 literals
//    (built by SUCCESS_TUPLE). utf_to_utf_success walks the index combinations
//    (i, j) of each tuple, converts element i and compares the result against
//    element j; is_valid_utf_success expects is_valid_utf() == true for each
//    element.
//  - failure_strings_char8_t / _char16_t / _char32_t hold malformed input.
//    utf_to_utf_failure expects convert() to throw std::invalid_argument (any
//    other std::exception is reported via BOOST_ERROR); is_valid_utf_failure
//    expects is_valid_utf() == false.
//  - Most checks are performed twice: once through the "base type" call and
//    once through the "Facet" call.
//  - random_sequences feeds randomly generated code unit sequences (length
//    0..100000) to convert() until a time budget is used up (1.0s if _DEBUG is
//    defined, otherwise 10.0s, divided by a std::tuple_size value). convert()
//    may either return (result empty exactly when the input is empty) or throw
//    std::invalid_argument. The test fails if no more than one iteration ran.
//  - convert / is_valid_utf / string_u8string are spot checks; convert also
//    checks sizeof(wchar_t): 2 if _WIN32 is defined, otherwise 4.
//
// NOTE(review): this copy of the file looks damaged - line breaks are missing
// and the contents of angle brackets (#include targets, template parameter and
// argument lists) appear to have been stripped. As written, each `//` comment
// swallows the rest of its physical line. Restore from the original source
// before building - TODO confirm against upstream.
//
// NOTE(review): generate_random() constructs `code_unit` with a single
// argument. For std::uniform_int_distribution a single constructor argument is
// the lower bound `a` (the upper bound defaults to the maximum of the
// distribution's result type) - verify whether `(0, max)` was intended.
#define BOOST_TEST_MODULE unicode_test #include #include #include #include #include #include #include #include #include #include #include #include #include using namespace std::chrono_literals; typedef std::tuple, std::basic_string, std::basic_string> types_collection_type; // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} // Success cases: convert string to all other types, respectively std::vector success_sets { SUCCESS_TUPLE(""), SUCCESS_TUPLE("ASCII string1"), SUCCESS_TUPLE("Täst just looks like German"), SUCCESS_TUPLE("\u732b is chinese for cat"), SUCCESS_TUPLE("\U0001F63A"), SUCCESS_TUPLE("\U0001F63A is a smiling cat"), }; // Error cases: throwing upon convert to all other types std::vector> failure_strings_char8_t { u8"\x80", // utf-8 continuation byte u8"\x81", // utf-8 continuation byte u8"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä" u8"\xF8\x80\x80\x80\x80", // overlong encoding u8"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point }; std::vector> failure_strings_char16_t { u"\xD801", // single high surrogate u"\xDFFF", // single low surrogate u"\xDFFF\xD801", // bad surrogate pair order }; std::vector> failure_strings_char32_t { U"blabla \xD801", // invalid unicode (surrogate half) U"\x10000000", // invalid unicode (number too big) }; // output operators must be in same namespace as the type itself namespace std { #ifdef __cpp_char8_t std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } #endif std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } } template 
void test_utf_to_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type From; typedef typename std::tuple_element::type>::type To; // test base type interface To result { unicode::convert(std::get(t)) }; BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); //std::cout << std::to_string(std::tuple_size::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; // test facet interface result = unicode::convert::Facet, typename unicode::Encoding::Facet>(std::get(t)); BOOST_CHECK_MESSAGE(std::get(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_utf_to_utf(t); else if constexpr (j + 1 < std::tuple_size::type>::value) test_utf_to_utf<0, j + 1>(t); } // We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign // a new variable to each tuple element which we don't want // https://lists.boost.org/boost-bugs/2016/05/45214.php BOOST_AUTO_TEST_CASE(utf_to_utf_success) { for (auto& t: success_sets) test_utf_to_utf(t); } template void test_is_valid_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type T; // test via basic type bool result { unicode::is_valid_utf(std::get(t)) }; BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result); // test via Facet result = unicode::is_valid_utf::Facet>(std::get(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding::Facet).name() << "(" << i << ", " << std::get(t) << "), got " << result); // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) 
test_is_valid_utf(t); } BOOST_AUTO_TEST_CASE(is_valid_utf_success) { for (auto& t: success_sets) test_is_valid_utf(t); } // iterate over std::tuple T types template void test_utf_to_utf_failure(std::basic_string& s) { typedef typename std::tuple_element::type::value_type To; // via base type try { (void) unicode::convert(s); BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // via facet try { (void) unicode::convert::Facet,typename unicode::Encoding::Facet>(s); BOOST_ERROR("Facet: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); }; // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_utf_to_utf_failure(s); } BOOST_AUTO_TEST_CASE(utf_to_utf_failure) { for (auto& s: failure_strings_char8_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); } // iterate over std::tuple T types template void test_is_valid_utf_failure(std::basic_string& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); BOOST_CHECK_MESSAGE(unicode::is_valid_utf::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding::Facet).name()); // iterate over remaining types if constexpr (index 
+ 1 < std::tuple_size::value) test_is_valid_utf_failure(s); } BOOST_AUTO_TEST_CASE(is_valid_utf_failure) { for (auto& s: failure_strings_char8_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_is_valid_utf_failure::type::value_type, types_collection_type>(s); } BOOST_AUTO_TEST_CASE(is_valid_unicode) { BOOST_CHECK(unicode::is_valid_unicode('\0')); BOOST_CHECK(unicode::is_valid_unicode(U'a')); BOOST_CHECK(unicode::is_valid_unicode(U'ä')); BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley BOOST_CHECK(!unicode::is_valid_unicode(0x00110000)); BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567)); BOOST_CHECK(!unicode::is_valid_unicode(0x12345678)); BOOST_CHECK(!unicode::is_valid_unicode(0xD800)); BOOST_CHECK(!unicode::is_valid_unicode(0xD987)); BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF)); } struct random_context { std::random_device rd; // OS random number engine to seed RNG (below) std::mt19937 gen{rd()}; std::uniform_int_distribution sequence_length{0, 100000}; // length of sequence: 0 ... 
100000 code units }; template T generate_random(random_context& rc, size_t length) { // Using unsigned long for std::uniform_int_distribution<> because it needs to be basic type according to MSVC std::uniform_int_distribution code_unit(std::numeric_limits::max()); // code unit value T result; std::generate_n(std::back_inserter(result), length, [&](){return static_cast(code_unit(rc.gen));}); return result; } template void test_random(random_context& rc, size_t length) { //std::cerr << "LENGTH: " << length << std::endl; typedef typename std::tuple_element::type To; From r {static_cast(generate_random(rc, length))}; // base type interface try { To result{unicode::convert(r)}; if (r.empty()) { BOOST_CHECK(result.empty()); } else { BOOST_CHECK(!result.empty()); } } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); } // facet interface try { To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(r)}; if (r.empty()) { BOOST_CHECK(result.empty()); } else { BOOST_CHECK(!result.empty()); } } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { BOOST_ERROR("Unexpected error on convert(): " << ex.what()); } // iterate over remaining To types if constexpr (i + 1 < std::tuple_size::value) test_random(rc, length); } BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) { random_context rc; int i{}; // run for 1s (debug) 10s (release) = total time for all random_sequences types! 
#ifdef _DEBUG const auto timeout{1.0s}; #else const auto timeout{10.0s}; #endif auto timeout_stamp { std::chrono::steady_clock::now() + (timeout / std::tuple_size::value)}; while (!(std::chrono::steady_clock::now() > timeout_stamp)) { test_random(rc, rc.sequence_length(rc.gen)); i++; } BOOST_CHECK_MESSAGE(i > 1, "Not enough iterations done!"); } // Test ISO and UTF encodings BOOST_AUTO_TEST_CASE(convert) { BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); BOOST_CHECK((std::string{unicode::convert("äöü")}) == std::string{"äöü"}); BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert(u8"äöü")) == std::u32string{U"äöü"}); #ifdef _WIN32 BOOST_CHECK(sizeof(wchar_t) == 2); #else // Unix like BOOST_CHECK(sizeof(wchar_t) == 4); #endif // For the following checks, wchar_t size and encoding is system dependent: // Windows: UTF-16 // Linux: UTF-32 BOOST_CHECK((unicode::convert(u8"äöü")) == std::wstring{L"äöü"}); BOOST_CHECK((unicode::convert(u8"\u732b")) == std::wstring{L"\u732b"}); BOOST_CHECK((unicode::convert(u8"\U0001F63A")) == std::wstring{L"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::u32string{U"\U0001F63A"}); BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::string{u8"\U0001F63A"}); BOOST_CHECK((unicode::convert(std::string{"äöü"})) == std::wstring{L"äöü"}); //BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == 
std::vector{}); } BOOST_AUTO_TEST_CASE(is_valid_utf) { BOOST_CHECK(unicode::is_valid_utf(u"äöü")); BOOST_CHECK(unicode::is_valid_utf(u8"äöü")); } BOOST_AUTO_TEST_CASE(string_u8string) { std::string a{"\xc3\xa4"}; std::basic_string b{a.begin(), a.end()}; BOOST_CHECK(b == std::basic_string{u8"ä"}); a = std::string{b.begin(), b.end()}; BOOST_CHECK(a == std::string{"\xc3\xa4"}); } // TODO: // // string, vector? // uint8_t, uint16_t, uint32_t?