// Boost.Test suite (module name "unicode_test") exercising unicode::utf_to_utf
// and unicode::is_valid_unicode.
//
// Fixtures:
//   - success_sets: each entry is built by SUCCESS_TUPLE(x), which pastes the
//     u8, u and U prefixes onto the same literal, i.e. one text in three
//     encodings.
//   - failure_strings_char8_t / _char16_t / _char32_t: inputs for which the
//     conversion is expected to raise an exception.
//   - The operator<< overloads in namespace std print a string as
//     "[ n n ...]" (one std::to_string value per element) so that
//     BOOST_CHECK_MESSAGE can stream the operands.
//
// Test cases:
//   - utf_to_utf_success: for every tuple in success_sets, test_utf_to_utf
//     converts one tuple element with unicode::utf_to_utf, compares the result
//     with another element, then recurses at compile time (if constexpr) over
//     the remaining index combinations i / j.
//   - utf_to_utf_failure: test_utf_to_utf_failure calls unicode::utf_to_utf on
//     each failure string, reports BOOST_FAIL when the call returns normally,
//     and recurses over the remaining target types of types_collection_type.
//   - is_valid_unicode: accepts '\0', U'a', U'ä', U+732B and U+1F63A; rejects
//     0x00110000, 0xFFFFFFFF, 0x01234567, 0x12345678 and the values 0xD800,
//     0xD987, 0xDFFF.
//
// NOTE(review): this copy looks damaged -- the #include targets and all
// template parameter/argument lists ("<...>") appear to be missing, and the
// file is collapsed onto three physical lines so that "//" comments and the
// leading #define swallow the code that follows them. Restore from the
// original before building; the text below is kept as found.
// NOTE(review): the catch (...) in test_utf_to_utf_failure presumably also
// intercepts whatever BOOST_FAIL throws to abort the test case -- confirm
// against the Boost.Test documentation and consider a narrower handler.
#define BOOST_TEST_MODULE unicode_test #include #include #include #include #include #include #include #include #include #include typedef std::tuple, std::basic_string, std::basic_string> types_collection_type; // create tuple of the same string, in UTF-8, UTF-16 and UTF-32 #define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} // Success cases: convert string to all other types, respectively std::vector success_sets { SUCCESS_TUPLE(""), SUCCESS_TUPLE("ASCII string1"), SUCCESS_TUPLE("Täst just looks like German"), SUCCESS_TUPLE("\u732b is chinese for cat"), SUCCESS_TUPLE("\U0001F63A"), SUCCESS_TUPLE("\U0001F63A is a smiling cat"), }; // Error cases: throwing upon convert to all other types std::vector> failure_strings_char8_t { u8"\x80", u8"\x81" }; std::vector> failure_strings_char16_t { u"\xD801", }; std::vector> failure_strings_char32_t { U"\xD801", U"\x10000000", }; // output operators must be in same namespace as the type itself namespace std { std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } std::ostream& operator<<(std::ostream& os, std::basic_string const& s) { os << "["; for (auto& c: s) os << " " << std::to_string(static_cast(c)); os << "]"; return os; } } template void test_utf_to_utf(std::tuple& t) { typedef typename std::tuple_element::type>::type From; typedef typename std::tuple_element::type>::type To; // test To result { unicode::utf_to_utf(std::get(t)) }; BOOST_CHECK_MESSAGE(std::get(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); //std::cout << std::to_string(std::tuple_size::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) 
<< std::endl; // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_utf_to_utf(t); else if constexpr (j + 1 < std::tuple_size::type>::value) test_utf_to_utf<0, j + 1>(t); } // We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign // a new variable to each tuple element which we don't want // https://lists.boost.org/boost-bugs/2016/05/45214.php BOOST_AUTO_TEST_CASE(utf_to_utf_success) { for (auto& t: success_sets) test_utf_to_utf(t); } // iterate over std::tuple T types template void test_utf_to_utf_failure(std::basic_string& s) { typedef typename std::tuple_element::type::value_type To; try { unicode::utf_to_utf(s); BOOST_FAIL("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (...) { // OK }; // iterate over remaining types if constexpr (index + 1 < std::tuple_size::value) test_utf_to_utf_failure(s); } BOOST_AUTO_TEST_CASE(utf_to_utf_failure) { for (auto& s: failure_strings_char8_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char16_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); for (auto& s: failure_strings_char32_t) test_utf_to_utf_failure::type::value_type, types_collection_type>(s); } BOOST_AUTO_TEST_CASE(is_valid_unicode) { BOOST_CHECK(unicode::is_valid_unicode('\0')); BOOST_CHECK(unicode::is_valid_unicode(U'a')); BOOST_CHECK(unicode::is_valid_unicode(U'ä')); BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley BOOST_CHECK(!unicode::is_valid_unicode(0x00110000)); BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567)); BOOST_CHECK(!unicode::is_valid_unicode(0x12345678)); BOOST_CHECK(!unicode::is_valid_unicode(0xD800)); 
BOOST_CHECK(!unicode::is_valid_unicode(0xD987)); BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF)); } // TODO: // UTF-8 // invalid bytes // an unexpected continuation byte // a non-continuation byte before the end of the character // the string ending before the end of the character (which can happen in simple string truncation) // an overlong encoding // a sequence that decodes to an invalid code point // // high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) // // char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux) // string, vector? // uint8_t, uint16_t, uint32_t?