diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/test-unicode.cpp | 175 |
1 files changed, 158 insertions, 17 deletions
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index c793399..29e5c2e 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -53,7 +53,9 @@ std::vector<std::basic_string<char16_t>> failure_strings_char16_t { }; std::vector<std::basic_string<char32_t>> failure_strings_char32_t { + U"\xD800 and more text", // invalid unicode (surrogate half) U"blabla \xD801", // invalid unicode (surrogate half) + U"moreblabla \xDFFF", // invalid unicode (surrogate half) U"\x10000000", // invalid unicode (number too big) }; @@ -259,26 +261,44 @@ struct random_context { std::random_device rd; // OS random number engine to seed RNG (below) std::mt19937 gen{rd()}; std::uniform_int_distribution<size_t> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units + std::uniform_int_distribution<unsigned long> code_point_distribution{0, 0x10FFFF - 0x800}; }; +// generates valid and invalid strings of different type template<typename T> -T generate_random(random_context& rc, size_t length) +T generate_random_invalid(random_context& rc, size_t length) { // Using unsigned long for std::uniform_int_distribution<> because it needs to be basic type according to MSVC - std::uniform_int_distribution<unsigned long> code_unit(std::numeric_limits<typename T::value_type>::max()); // code unit value + std::uniform_int_distribution<unsigned long> code_unit{0, std::numeric_limits<typename T::value_type>::max()}; // code unit value T result; std::generate_n(std::back_inserter(result), length, [&](){return static_cast<typename T::value_type>(code_unit(rc.gen));}); return result; } +char32_t generate_random_char(random_context& rc) +{ + auto result {rc.code_point_distribution(rc.gen)}; + if (result >= 0xD800) + result += 0x800; + return static_cast<char32_t>(result); +} + +std::u32string generate_random_string(random_context& rc, size_t length) +{ + std::u32string result; + std::generate_n(std::back_inserter(result), length, [&](){return generate_random_char(rc);}); + + return result; +} + template<typename From, typename ToTypesCollectionType, size_t i = 0> void test_random(random_context& rc, size_t length) { //std::cerr << "LENGTH: " << length << std::endl; typedef typename std::tuple_element<i,ToTypesCollectionType>::type To; - From r {static_cast<From>(generate_random<From>(rc, length))}; + From r {static_cast<From>(generate_random_invalid<From>(rc, length))}; // base type interface try { @@ -330,28 +350,139 @@ void test_random(random_context& rc, size_t length) test_random<From, ToTypesCollectionType, i + 1>(rc, length); } -BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) +BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type) { random_context rc; - int i{}; - // run for 1s (debug) 10s (release) = total time for all random_sequences types! -#ifdef _DEBUG - const auto timeout{1.0s}; -#else - const auto timeout{10.0s}; -#endif + for (int i = 0; i < 10; i++) { + test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen)); + } +} - auto timeout_stamp { std::chrono::steady_clock::now() + (timeout / std::tuple_size<types_collection_type>::value)}; +BOOST_AUTO_TEST_CASE(random_sequences_valid) +{ + random_context rc; - while (!(std::chrono::steady_clock::now() > timeout_stamp)) { - test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen)); - i++; + // Fill UTF-32 data list + std::vector<std::u32string> u32list; + std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));}); + + // Fill UTF-16 data list + std::vector<std::u16string> u16list; + std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_16>(s);}); + + // Fill UTF-8 data list + std::vector<std::u8string> u8list; + std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert<unicode::UTF_32, unicode::UTF_8>(s);}); + + for (const auto& i : u32list) { + std::u32string s32{unicode::convert<unicode::UTF_32, unicode::UTF_32>(i)}; + BOOST_CHECK(s32.size() == i.size()); + std::u16string s16{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)}; + BOOST_CHECK(s16.size() >= i.size()); + std::u8string s8{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)}; + BOOST_CHECK(s8.size() >= i.size()); + } + + for (const auto& i : u16list) { + std::u32string s32{unicode::convert<unicode::UTF_16, unicode::UTF_32>(i)}; + BOOST_CHECK(s32.size() > 0 || i.size() == 0); + std::u16string s16{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)}; + BOOST_CHECK(s16.size() == i.size()); + std::u8string s8{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)}; + BOOST_CHECK(s8.size() >= i.size()); + } + + for (const auto& i : u8list) { + std::u32string s32{unicode::convert<unicode::UTF_8, unicode::UTF_32>(i)}; + BOOST_CHECK(s32.size() > 0 || i.size() == 0); + std::u16string s16{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)}; + BOOST_CHECK(s16.size() > 0 || i.size() == 0); + std::u8string s8{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)}; + BOOST_CHECK(s8.size() == i.size()); + } + + { + // Performance test UTF-32 -> UTF-32 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u32list) { + std::u32string s{unicode::convert<unicode::UTF_32, unicode::UTF_32>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; } - BOOST_CHECK_MESSAGE(i > 1, "Not enough iterations done!"); + { + // Performance test UTF-32 -> UTF-16 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u32list) { + std::u16string s{unicode::convert<unicode::UTF_32, unicode::UTF_16>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-32 -> UTF-8 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u32list) { + std::u8string s{unicode::convert<unicode::UTF_32, unicode::UTF_8>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-16 -> UTF-32 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u16list) { + std::u32string s{unicode::convert<unicode::UTF_16, unicode::UTF_32>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-16 -> UTF-16 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u16list) { + std::u16string s{unicode::convert<unicode::UTF_16, unicode::UTF_16>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-16 -> UTF-8 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u16list) { + std::u8string s{unicode::convert<unicode::UTF_16, unicode::UTF_8>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-8 -> UTF-32 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u8list) { + std::u32string s{unicode::convert<unicode::UTF_8, unicode::UTF_32>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-32: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-8 -> UTF-16 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u8list) { + std::u16string s{unicode::convert<unicode::UTF_8, unicode::UTF_16>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-16: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } + + { + // Performance test UTF-8 -> UTF-8 + auto t0{std::chrono::steady_clock::now()}; + for (const auto& i : u8list) { + std::u8string s{unicode::convert<unicode::UTF_8, unicode::UTF_8>(i)}; + } + std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration<double>(std::chrono::steady_clock::now() - t0).count() << std::endl; + } - std::cout << "random_sequences: Completed " << i << " iterations for long random code unit sequences for " << typeid(typename T::value_type).name() << std::endl; } // Test ISO and UTF encodings @@ -366,6 +497,15 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_1>("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_8,unicode::ISO_8859_1>(u8"\u20ac")}), std::invalid_argument); + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_16,unicode::ISO_8859_1>(u"\u20ac")}), std::invalid_argument); + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_32,unicode::ISO_8859_1>(U"\u20ac")}), std::invalid_argument); + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_8,unicode::ISO_8859_15>(u8"\u732b")}), std::invalid_argument); + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_16,unicode::ISO_8859_15>(u"\u732b")}), std::invalid_argument); + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::UTF_32,unicode::ISO_8859_15>(U"\u732b")}), std::invalid_argument); + + BOOST_CHECK_THROW((unicode::convert<unicode::UTF_32,unicode::UTF_8>(std::u32string{(char32_t*)"\x00\xD8\x00\x00\x00\x00\x00\x00"})) , std::invalid_argument); + BOOST_CHECK((unicode::convert<unicode::UTF_8,unicode::UTF_16>(u8"abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert<unicode::UTF_32,unicode::UTF_16>(U"abc")) == std::u16string{u"abc"}); @@ -431,6 +571,7 @@ BOOST_AUTO_TEST_CASE(is_valid_utf) BOOST_CHECK(unicode::is_valid_utf<unicode::UTF_8>(u8"äöü")); } +// check assumptions about environment BOOST_AUTO_TEST_CASE(string_u8string) { std::string a{"\xc3\xa4"}; |