#define BOOST_TEST_MODULE unicode_test

// Unit tests for unicode::convert(), unicode::is_valid_utf() and
// unicode::is_valid_unicode(), with comparisons against boost::locale and
// std::wstring_convert.
//
// NOTE(review): in this copy of the file the header names after each #include
// and the contents of most template angle brackets are missing (for example
// "std::tuple, std::basic_string, ...>" below has an unmatched '>').
// Restore them from the original source before building.

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

using namespace std::chrono_literals;

// Tuple of string types; the templated tests below walk over its elements by index.
// NOTE(review): presumably the UTF-8, UTF-16 and UTF-32 string types - confirm.
typedef std::tuple, std::basic_string, std::basic_string> types_collection_type;

// create tuple of the same string, in UTF-8, UTF-16 and UTF-32
#define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x}

// Success cases: convert string to all other types, respectively
std::vector success_sets {
  SUCCESS_TUPLE(""),
  SUCCESS_TUPLE("ASCII string1"),
  SUCCESS_TUPLE("Täst just looks like German"),
  SUCCESS_TUPLE("\u732b is chinese for cat"),
  SUCCESS_TUPLE("\U0001F63A"),
  SUCCESS_TUPLE("\U0001F63A is a smiling cat"),
};

// Error cases: throwing upon convert to all other types
std::vector> failure_strings_char8_t {
  // using u8"" here doesn't work on MSVC
  (utf8_t*)"\x80", // utf-8 continuation byte
  (utf8_t*)"\x81", // utf-8 continuation byte
  (utf8_t*)"\xc3\xc3\xa4", // initial byte of utf-8 "ä", followed by valid utf-8 "ä"
  (utf8_t*)"\xF8\x80\x80\x80\x80", // overlong encoding
  (utf8_t*)"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point
};

std::vector> failure_strings_char16_t {
  u"\xD801", // single high surrogate
  u"\xDFFF", // single low surrogate
  u"\xDFFF\xD801", // bad surrogate pair order
};

std::vector> failure_strings_char32_t {
  U"\xD800 and more text", // invalid unicode (surrogate half)
  U"blabla \xD801", // invalid unicode (surrogate half)
  U"moreblabla \xDFFF", // invalid unicode (surrogate half)
  U"\x10000000", // invalid unicode (number too big)
};

// output operators must be in same namespace as the type itself
//
// Each operator writes the string as "[" followed by " <number>" for every
// element (converted with std::to_string) and a closing "]". The test
// messages below stream strings this way.
namespace std {

#ifdef __cpp_char8_t
std::ostream& operator<<(std::ostream& os, std::basic_string const& s)
{
  os << "[";
  for (auto& c: s)
    os << " " << std::to_string(static_cast(c));
  os << "]";

  return os;
}
#endif

std::ostream& operator<<(std::ostream& os, std::basic_string const& s)
{
  os << "[";
  for (auto& c: s)
    os << " " << std::to_string(static_cast(c));
  os << "]";

  return os;
}

std::ostream& operator<<(std::ostream& os, std::basic_string const& s)
{
  os << "[";
  for (auto& c: s)
    os << " " << std::to_string(static_cast(c));
  os << "]";

  return os;
}

}

// Converts one element of tuple t (type From) to another element's type (To)
// and checks that the result equals the tuple's element of that type. The
// check is repeated through three unicode::convert interfaces (base type,
// container, facet). Afterwards it instantiates itself for the remaining
// (i, j) index combinations via "if constexpr".
template void test_utf_to_utf(std::tuple& t)
{
  typedef typename std::tuple_element::type>::type From;
  typedef typename std::tuple_element::type>::type To;

  // test base type interface
  To result { unicode::convert(std::get(t)) };

  BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get(t) << "), got " << result);

  // test container interface
  result = unicode::convert(std::get(t));

  BOOST_CHECK_MESSAGE(std::get(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result);

  // test facet interface
  result = unicode::convert::Facet, typename unicode::Encoding::Facet>(std::get(t));

  BOOST_CHECK_MESSAGE(std::get(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result);

  // iterate over other combinations
  if constexpr (i + 1 < std::tuple_size::type>::value)
    test_utf_to_utf(t);
  else if constexpr (j + 1 < std::tuple_size::type>::value)
    test_utf_to_utf<0, j + 1>(t);
}

// We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign
// a new variable to each tuple element which we don't want
// https://lists.boost.org/boost-bugs/2016/05/45214.php
BOOST_AUTO_TEST_CASE(utf_to_utf_success)
{
  for (auto& t: success_sets)
    test_utf_to_utf(t);
}

// Checks that unicode::is_valid_utf() returns true for an element of tuple t,
// called via the basic type, the container type and the Facet; then continues
// with the next tuple index.
template void test_is_valid_utf(std::tuple& t)
{
  typedef typename std::tuple_element::type>::type T;

  // test via basic type
  bool result { unicode::is_valid_utf(std::get(t)) };

  BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i << ", " << std::get(t) << "), got " << result);

  // test via container type
  result = unicode::is_valid_utf(std::get(t));

  BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get(t) << "), got " << result);

  // test via Facet
  result = unicode::is_valid_utf::Facet>(std::get(t));

  BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename unicode::Encoding::Facet).name() << "(" << i << ", " << std::get(t) << "), got " << result);

  // iterate over other combinations
  if constexpr (i + 1 < std::tuple_size::type>::value)
    test_is_valid_utf(t);
}

BOOST_AUTO_TEST_CASE(is_valid_utf_success)
{
  for (auto& t: success_sets)
    test_is_valid_utf(t);
}

// iterate over std::tuple T types
//
// Expects unicode::convert() to throw std::invalid_argument for the invalid
// input s through each of the three interfaces. Reaching the statement after
// the call, or catching any other std::exception, is reported via BOOST_ERROR.
template void test_utf_to_utf_failure(std::basic_string& s)
{
  typedef typename std::tuple_element::type::value_type To;

  // via base type
  try {
    (void) unicode::convert(s);
    BOOST_ERROR("Base type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  };

  // via container
  try {
    (void) unicode::convert::Facet::string_type, typename unicode::Encoding::Facet::string_type>(s);
    BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  };

  // via facet
  try {
    (void) unicode::convert::Facet,typename unicode::Encoding::Facet>(s);
    BOOST_ERROR("Facet: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  };

  // iterate over remaining types
  if constexpr (index + 1 < std::tuple_size::value)
    test_utf_to_utf_failure(s);
}

// Runs the failure check on every invalid string of each of the three
// failure_strings_* lists.
BOOST_AUTO_TEST_CASE(utf_to_utf_failure)
{
  for (auto& s: failure_strings_char8_t)
    test_utf_to_utf_failure::type::value_type, types_collection_type>(s);

  for (auto& s: failure_strings_char16_t)
    test_utf_to_utf_failure::type::value_type, types_collection_type>(s);

  for (auto& s: failure_strings_char32_t)
    test_utf_to_utf_failure::type::value_type, types_collection_type>(s);
}

// iterate over std::tuple T types
//
// Expects unicode::is_valid_utf() to return false for the invalid input s
// through each of the three call forms.
template void test_is_valid_utf_failure(std::basic_string& s)
{
  BOOST_CHECK_MESSAGE(unicode::is_valid_utf(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
  BOOST_CHECK_MESSAGE(unicode::is_valid_utf>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name());
  BOOST_CHECK_MESSAGE(unicode::is_valid_utf::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding::Facet).name());

  // iterate over remaining types
  if constexpr (index + 1 < std::tuple_size::value)
    test_is_valid_utf_failure(s);
}

BOOST_AUTO_TEST_CASE(is_valid_utf_failure)
{
  for (auto& s: failure_strings_char8_t)
    test_is_valid_utf_failure::type::value_type, types_collection_type>(s);

  for (auto& s: failure_strings_char16_t)
    test_is_valid_utf_failure::type::value_type, types_collection_type>(s);

  for (auto& s: failure_strings_char32_t)
    test_is_valid_utf_failure::type::value_type, types_collection_type>(s);
}

// Code point checks: accepted values first, then values above 0x10FFFF and
// values in the surrogate range 0xD800..0xDFFF, which must be rejected.
BOOST_AUTO_TEST_CASE(is_valid_unicode)
{
  BOOST_CHECK(unicode::is_valid_unicode('\0'));
  BOOST_CHECK(unicode::is_valid_unicode(U'a'));
  BOOST_CHECK(unicode::is_valid_unicode(U'ä'));
  BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese
  BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley
  BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley

  BOOST_CHECK(!unicode::is_valid_unicode(0x00110000));
  BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++
  BOOST_CHECK(!unicode::is_valid_unicode(0x01234567));
  BOOST_CHECK(!unicode::is_valid_unicode(0x12345678));
  BOOST_CHECK(!unicode::is_valid_unicode(0xD800));
  BOOST_CHECK(!unicode::is_valid_unicode(0xD987));
  BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
}

// Random engine plus the distributions shared by the random tests.
// max_value is the upper bound of code_point_distribution. The default,
// 0x10FFFF - 0x800, leaves room for generate_random_char() to shift values
// >= 0xD800 up by 0x800.
struct random_context {
  random_context(int max_value = 0x10FFFF - 0x800): code_point_distribution(0, max_value) {}
  std::random_device rd; // OS random number engine to seed RNG (below)
  std::mt19937 gen{rd()};
  std::uniform_int_distribution sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units
  std::uniform_int_distribution code_point_distribution;
};

// generates valid and invalid strings of different type
//
// Returns a T with `length` elements, each drawn independently from code_unit.
template T generate_random_invalid(random_context& rc, size_t length)
{
  // Using unsigned long for std::uniform_int_distribution<> because it needs to be basic type according to MSVC
  std::uniform_int_distribution code_unit{0, std::numeric_limits::max()}; // code unit value
  T result;
  std::generate_n(std::back_inserter(result), length, [&](){return static_cast(code_unit(rc.gen));});

  return result;
}

// Draws one value from rc.code_point_distribution. Values >= 0xD800 are
// shifted up by 0x800, so nothing in 0xD800..0xDFFF is ever returned.
char32_t generate_random_char(random_context& rc)
{
  auto result {rc.code_point_distribution(rc.gen)};
  if (result >= 0xD800)
    result += 0x800;
  return static_cast(result);
}

// Returns a std::u32string of `length` characters from generate_random_char().
std::u32string generate_random_string(random_context& rc, size_t length)
{
  std::u32string result;
  std::generate_n(std::back_inserter(result), length, [&](){return generate_random_char(rc);});

  return result;
}

// Feeds a random code unit sequence of the given length to unicode::convert()
// through all three interfaces. Either the conversion succeeds (then the
// result is empty exactly when the input is empty) or it throws
// std::invalid_argument; any other std::exception is reported as an error.
template void test_random_invalid(random_context& rc, size_t length)
{
  //std::cerr << "LENGTH: " << length << std::endl;
  typedef typename std::tuple_element::type To;

  From r {static_cast(generate_random_invalid(rc, length))};

  // base type interface
  try {
    To result{unicode::convert(r)};

    if (r.empty()) {
      BOOST_CHECK(result.empty());
    } else {
      BOOST_CHECK(!result.empty());
    }
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  }

  // container type interface
  try {
    To result{unicode::convert(r)};

    if (r.empty()) {
      BOOST_CHECK(result.empty());
    } else {
      BOOST_CHECK(!result.empty());
    }
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  }

  // facet interface
  try {
    To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(r)};

    if (r.empty()) {
      BOOST_CHECK(result.empty());
    } else {
      BOOST_CHECK(!result.empty());
    }
  } catch (const std::invalid_argument&) {
    // OK: this is an expected exception for convert() on bad input
  } catch (const std::exception& ex) {
    BOOST_ERROR("Unexpected error on convert(): " << ex.what());
  }

  // iterate over remaining To types
  if constexpr (i + 1 < std::tuple_size::value)
    test_random_invalid(rc, length);
}

// Ten rounds of random input per type T; each round draws a fresh length from
// rc.sequence_length.
BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type)
{
  random_context rc;

  for (int i = 0; i < 10; i++) {
    test_random_invalid(rc, rc.sequence_length(rc.gen));
  }
}

// utility wrapper to adapt locale-bound facets for wstring/wbuffer convert
//
// Forwards its constructor arguments to Facet and declares its own destructor.
template struct deletable_facet : Facet
{
  template deletable_facet(Args&& ...args) : Facet(std::forward(args)...) {}
  ~deletable_facet() {}
};

namespace {
  // char8_t instead of char doesn't work w/ clang++-13 + C++20 (yet?)
  std::wstring_convert>, char16_t> conv16;
  std::wstring_convert>, char32_t> conv32;

  // Reference conversion built on conv16 / conv32 above; only the explicit
  // specializations below are defined.
  // NOTE(review): presumably one specialization per (From, To) code unit
  // pair - the template arguments are not visible in this copy.
  template std::basic_string std_convert(const std::basic_string& s);

  // Round trip through conv32 (from_bytes, then to_bytes) on a byte copy of s.
  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    std::string a{s.begin(), s.end()};
    a = conv32.to_bytes(conv32.from_bytes(a));
    return std::basic_string{a.begin(), a.end()};
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    std::string a{s.begin(), s.end()};
    return conv16.from_bytes(a);
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    std::string a{s.begin(), s.end()};
    return conv32.from_bytes(a);
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    auto result{conv16.to_bytes(s)};
    return std::basic_string(result.begin(), result.end());
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    return conv16.from_bytes(conv16.to_bytes(s));
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    return conv32.from_bytes(conv16.to_bytes(s));
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    auto result{conv32.to_bytes(s)};
    return std::basic_string(result.begin(), result.end());
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    return conv16.from_bytes(conv32.to_bytes(s));
  }

  template<> std::basic_string std_convert(const std::basic_string& s)
  {
    return conv32.from_bytes(conv32.to_bytes(s));
  }
}

// Builds 1000 random strings from generate_random_string(), converts them to
// the From type, then:
//  - checks unicode::convert() output against boost::locale::conv::utf_to_utf
//  - prints timings for unicode::convert, boost::locale and std_convert.
// `description` only labels the printed timing line.
// NOTE(review): `length` is only passed on to the recursive call; the string
// lengths actually used come from rc.sequence_length.
template void test_random_valid(random_context& rc, size_t length, const std::string& description)
{
  typedef typename std::tuple_element::type To;

  // Fill UTF-32 data list: source for tests
  std::vector u32list;
  std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));});

  // Fill From data list
  std::vector list;
  std::transform(u32list.begin(), u32list.end(), std::back_inserter(list), [](const std::u32string& s){
    return unicode::convert::Facet>(s);
  });

  // NOTE(review): int index compared against list.size() (signed vs. unsigned).
  for (int i = 0; i < list.size(); i++) {
    // a converted string never has fewer code units than the UTF-32 source
    BOOST_CHECK(list[i].size() >= u32list[i].size());
    To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(list[i])};
    BOOST_CHECK(result.size() >= u32list[i].size());
    auto boost_result{boost::locale::conv::utf_to_utf(list[i])};
    BOOST_CHECK_EQUAL(result, boost_result);
  }

  // timing: unicode::convert
  {
    auto t0{std::chrono::steady_clock::now()};
    for (const auto& i: list)
      To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(i)};
    std::cout << "Performance test for converting " << list.size() << " " << description << " from UTF-" << (sizeof(typename From::value_type) * 8) << " to UTF-" << (sizeof(typename To::value_type) * 8) << ": " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << std::endl;
  }

  // timing: boost::locale
  {
    auto t0{std::chrono::steady_clock::now()};
    for (const auto& i: list)
      To result{boost::locale::conv::utf_to_utf(i)};
    std::cout << " -> Compare to boost::locale::conv::utf_to_utf: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << std::endl;
  }

  // timing: std::wstring_convert (via std_convert)
  {
    auto t0{std::chrono::steady_clock::now()};
    for (const auto& i: list)
      To result{std_convert(i)};
    std::cout << " -> Compare to std::wstring_convert: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << std::endl;
  }

  // iterate over remaining To types
  if constexpr (index + 1 < std::tuple_size::value)
    test_random_valid(rc, length, description);
}

// Default random_context: code points drawn from the full default range.
BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_all_unicode, T, types_collection_type)
{
  random_context rc;

  test_random_valid(rc, rc.sequence_length(rc.gen), "All Unicode strings");
}

// random_context{127}: code points limited to 0..127.
BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_ascii, T, types_collection_type)
{
  random_context rc{127};

  test_random_valid(rc, rc.sequence_length(rc.gen), "ASCII only strings");
}

// Test ISO and UTF encodings
// NOTE(review): several checks below look identical only because the template
// arguments of unicode::convert are not visible in this copy.
BOOST_AUTO_TEST_CASE(convert)
{
  BOOST_CHECK((std::string{unicode::convert({})}) == std::string{});
  BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"});
  BOOST_CHECK((std::string{unicode::convert("äöü")}) == std::string{"äöü"});
  BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // €
  BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // €

  // conversions that must fail with std::invalid_argument
  BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1
  BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u20ac")}), std::invalid_argument);
  BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u20ac")}), std::invalid_argument);
  BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u20ac")}), std::invalid_argument);

  BOOST_CHECK_THROW(((void)std::string{unicode::convert(u8"\u732b")}), std::invalid_argument);
  BOOST_CHECK_THROW(((void)std::string{unicode::convert(u"\u732b")}), std::invalid_argument);
  BOOST_CHECK_THROW(((void)std::string{unicode::convert(U"\u732b")}), std::invalid_argument);

  // raw bytes reinterpreted as char32_t
  // NOTE(review): on a little-endian host the first unit reads as 0xD800 - confirm intent.
  BOOST_CHECK_THROW((unicode::convert(std::u32string{(char32_t*)"\x00\xD8\x00\x00\x00\x00\x00\x00"})) , std::invalid_argument);

  BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"});
  BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"});

  BOOST_CHECK((unicode::convert(u8"abc")) == std::u16string{u"abc"});
  BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"});

  BOOST_CHECK((unicode::convert("äöü")) == std::u32string{U"äöü"});

#ifdef _WIN32
  BOOST_CHECK(sizeof(wchar_t) == 2);
#else // Unix like
  BOOST_CHECK(sizeof(wchar_t) == 4);
#endif

  // For the following checks, wchar_t size and encoding is system dependent:
  // Windows: UTF-16
  // Linux: UTF-32
  BOOST_CHECK((unicode::convert("äöü")) == std::wstring{L"äöü"});
  BOOST_CHECK((unicode::convert("\u732b")) == std::wstring{L"\u732b"});
  BOOST_CHECK((unicode::convert("\U0001F63A")) == std::wstring{L"\U0001F63A"});
  BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::u32string{U"\U0001F63A"});
  BOOST_CHECK((unicode::convert(L"\U0001F63A")) == std::basic_string{(utf8_t*)"\U0001F63A"});

  BOOST_CHECK((unicode::convert(std::string{"äöü"})) == std::wstring{L"äöü"});

  // vector
  BOOST_CHECK((unicode::convert, std::vector>(std::vector{})) == std::vector{});
  BOOST_CHECK((unicode::convert, std::vector>(std::vector{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector{L'ä', L'ö', L'ü'}));

  // deque
  BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{});
  BOOST_CHECK((unicode::convert, std::deque>(std::deque{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque{L'ä', L'ö', L'ü'}));

  // yet unsupported:
  //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'}));
  //BOOST_CHECK((unicode::convert(std::deque{u8'\xc3', u8'\xa4', u8'\xc3', u8'\xb6', u8'\xc3', u8'\xbc'})) == (std::deque{u'ä', u'ö', u'ü'}));

  // deque with uint8_t, uint16_t
  BOOST_CHECK((unicode::convert, std::deque>(std::deque{})) == std::deque{});
  BOOST_CHECK((unicode::convert, std::deque>(std::deque{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'}));

  // deque with int8_t, int16_t
  BOOST_CHECK((unicode::convert, std::deque>(std::deque{
    static_cast(0xc3),
    static_cast(0xa4),
    static_cast(0xc3),
    static_cast(0xb6),
    static_cast(0xc3),
    static_cast(0xbc)})) == (std::deque{L'ä', L'ö', L'ü'}));

  // list
  BOOST_CHECK((unicode::convert, std::list>(std::list{})) == std::list{});
  BOOST_CHECK((unicode::convert, std::list>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'}));

  // list -> deque
  BOOST_CHECK((unicode::convert, std::deque>(std::list{})) == std::deque{});
  BOOST_CHECK((unicode::convert, std::deque>(std::list{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque{L'ä', L'ö', L'ü'}));

  // array
  BOOST_CHECK((unicode::convert, std::list>(std::array{})) == std::list{});
  BOOST_CHECK((unicode::convert, std::list>(std::array{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list{L'ä', L'ö', L'ü'}));
}

// Spot checks of unicode::is_valid_utf() on UTF-16 and UTF-8 literals.
BOOST_AUTO_TEST_CASE(is_valid_utf)
{
  BOOST_CHECK(unicode::is_valid_utf(u"äöü"));
  BOOST_CHECK(unicode::is_valid_utf(u8"äöü"));
}

// check assumptions about environment
//
// Copies the bytes of a std::string into a std::basic_string and back, and
// checks that both directions preserve the two bytes of u8"ä".
BOOST_AUTO_TEST_CASE(string_u8string)
{
  std::string a{"\xc3\xa4"};

  std::basic_string b{a.begin(), a.end()};

  BOOST_CHECK(b == std::basic_string{u8"ä"});

  a = std::string{b.begin(), b.end()};

  BOOST_CHECK(a == std::string{"\xc3\xa4"});
}

// check environment: demonstrate how boost convert u8->u8 throws exception on invalid input
// NOTE(review): catch(...) also intercepts anything BOOST_FAIL itself throws;
// confirm the failure is still reported in that case. The same applies to
// the three test cases that follow.
BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u8)
{
  for (auto& s: failure_strings_char8_t) {
    try {
      auto result1{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)};
      BOOST_FAIL("Expected boost convert to fail");
    } catch(...) {
      // expected
    }
  }
}

// check environment: demonstrate how boost convert u8->u16 throws exception on invalid input
BOOST_AUTO_TEST_CASE(utf_to_utf_failure_boost_u8_u16)
{
  for (auto& s: failure_strings_char8_t) {
    try {
      auto result1{boost::locale::conv::utf_to_utf(s, boost::locale::conv::stop)};
      BOOST_FAIL("Expected boost convert to fail");
    } catch(...) {
      // expected
    }
  }
}

// check environment: demonstrate how std u8->u8 throws exception on invalid input
BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u8)
{
  for (auto& s: failure_strings_char8_t) {
    try {
      auto result2{std_convert(s)};
      BOOST_FAIL("Expected std_convert to fail");
    } catch(...) {
      // expected
    }
  }
}

// check environment: demonstrate how std u8->u16 throws exception on invalid input
BOOST_AUTO_TEST_CASE(utf_to_utf_failure_std_u8_u16)
{
  for (auto& s: failure_strings_char8_t) {
    try {
      auto result2{std_convert(s)};
      BOOST_FAIL("Expected std_convert to fail");
    } catch(...) {
      // expected
    }
  }
}