From 3ca9f389084a2defe1fff2046dd3450e0b242e58 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Tue, 21 Dec 2021 15:36:48 +0100 Subject: Added comparison tests with boost::locale::conv and std::wstring_convert --- Makefile | 2 +- src/test-unicode.cpp | 205 +++++++++++++++++++++++++++++---------------------- 2 files changed, 116 insertions(+), 91 deletions(-) diff --git a/Makefile b/Makefile index 346f8a0..6ed0e68 100644 --- a/Makefile +++ b/Makefile @@ -50,7 +50,7 @@ LIBS+=-fuse-ld=lld endif endif -CXXFLAGS+=-Wall -Iinclude -std=$(STANDARD) +CXXFLAGS+=-Wall -Wno-deprecated-declarations -Iinclude -std=$(STANDARD) LDLIBS+=\ -lboost_context \ diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index d00a33d..c325f6c 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -5,12 +5,16 @@ #include #include +#include + #include #include +#include #include #include #include #include +#include #include #include #include @@ -258,10 +262,11 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode) } struct random_context { + random_context(int max_value = 0x10FFFF - 0x800): code_point_distribution(0, max_value) {} std::random_device rd; // OS random number engine to seed RNG (below) std::mt19937 gen{rd()}; std::uniform_int_distribution sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units - std::uniform_int_distribution code_point_distribution{0, 0x10FFFF - 0x800}; + std::uniform_int_distribution code_point_distribution; }; // generates valid and invalid strings of different type @@ -293,7 +298,7 @@ std::u32string generate_random_string(random_context& rc, size_t length) } template -void test_random(random_context& rc, size_t length) +void test_random_invalid(random_context& rc, size_t length) { //std::cerr << "LENGTH: " << length << std::endl; typedef typename std::tuple_element::type To; @@ -347,7 +352,7 @@ void test_random(random_context& rc, size_t length) // iterate over remaining To types if constexpr (i + 1 < std::tuple_size::value) - test_random(rc, length); + test_random_invalid(rc, length); } BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type) @@ -355,134 +360,154 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_invalid, T, types_collection_type random_context rc; for (int i = 0; i < 10; i++) { - test_random(rc, rc.sequence_length(rc.gen)); + test_random_invalid(rc, rc.sequence_length(rc.gen)); } } -BOOST_AUTO_TEST_CASE(random_sequences_valid) +// utility wrapper to adapt locale-bound facets for wstring/wbuffer convert +template +struct deletable_facet : Facet { - random_context rc; + template + deletable_facet(Args&& ...args) : Facet(std::forward(args)...) {} + ~deletable_facet() {} +}; - // Fill UTF-32 data list - std::vector u32list; - std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));}); +namespace { + // char8_t instead of char doesn't work w/ clang++-13 + C++20 (yet?) + std::wstring_convert>, char16_t> conv16; + std::wstring_convert>, char32_t> conv32; + + template + std::basic_string std_convert(const std::basic_string& s); - // Fill UTF-16 data list - std::vector u16list; - std::transform(u32list.begin(), u32list.end(), std::back_inserter(u16list), [](const std::u32string& s){return unicode::convert(s);}); - - // Fill UTF-8 data list - std::vector> u8list; - std::transform(u32list.begin(), u32list.end(), std::back_inserter(u8list), [](const std::u32string& s){return unicode::convert(s);}); - - for (const auto& i : u32list) { - std::u32string s32{unicode::convert(i)}; - BOOST_CHECK(s32.size() == i.size()); - std::u16string s16{unicode::convert(i)}; - BOOST_CHECK(s16.size() >= i.size()); - std::basic_string s8{unicode::convert(i)}; - BOOST_CHECK(s8.size() >= i.size()); + template<> + std::basic_string std_convert(const std::basic_string& s) + { + return s; } - for (const auto& i : u16list) { - std::u32string s32{unicode::convert(i)}; - BOOST_CHECK(s32.size() > 0 || i.size() == 0); - std::u16string s16{unicode::convert(i)}; - BOOST_CHECK(s16.size() == i.size()); - std::basic_string s8{unicode::convert(i)}; - BOOST_CHECK(s8.size() >= i.size()); + template<> + std::basic_string std_convert(const std::basic_string& s) + { + std::string a{s.begin(), s.end()}; + return conv16.from_bytes(a); } - for (const auto& i : u8list) { - std::u32string s32{unicode::convert(i)}; - BOOST_CHECK(s32.size() > 0 || i.size() == 0); - std::u16string s16{unicode::convert(i)}; - BOOST_CHECK(s16.size() > 0 || i.size() == 0); - std::basic_string s8{unicode::convert(i)}; - BOOST_CHECK(s8.size() == i.size()); + template<> + std::basic_string std_convert(const std::basic_string& s) + { + std::string a{s.begin(), s.end()}; + return conv32.from_bytes(a); } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-32 -> UTF-32 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u32list) { - std::u32string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-32: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + auto result{conv16.to_bytes(s)}; + return std::basic_string(result.begin(), result.end()); } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-32 -> UTF-16 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u32list) { - std::u16string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-16: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + return s; } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-32 -> UTF-8 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u32list) { - std::basic_string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-32 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + return conv32.from_bytes(conv16.to_bytes(s)); } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-16 -> UTF-32 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u16list) { - std::u32string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-32: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + auto result{conv32.to_bytes(s)}; + return std::basic_string(result.begin(), result.end()); } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-16 -> UTF-16 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u16list) { - std::u16string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-16: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + return conv16.from_bytes(conv32.to_bytes(s)); } + template<> + std::basic_string std_convert(const std::basic_string& s) { - // Performance test UTF-16 -> UTF-8 - auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u16list) { - std::basic_string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-16 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + return s; } +} +template +void test_random_valid(random_context& rc, size_t length, const std::string& description) +{ + typedef typename std::tuple_element::type To; + + // Fill UTF-32 data list: source for tests + std::vector u32list; + std::generate_n(std::back_inserter(u32list), 1000, [&](){return generate_random_string(rc, rc.sequence_length(rc.gen));}); + + // Fill From data list + std::vector list; + std::transform(u32list.begin(), u32list.end(), std::back_inserter(list), [](const std::u32string& s){ + return unicode::convert::Facet>(s); + }); + + for (int i = 0; i < list.size(); i++) { + BOOST_CHECK(list[i].size() >= u32list[i].size()); + To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(list[i])}; + BOOST_CHECK(result.size() >= u32list[i].size()); + } + { - // Performance test UTF-8 -> UTF-32 auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u8list) { - std::u32string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-32: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + for (const auto& i: list) + To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(i)}; + std::cout << "Performance test for converting " << list.size() << + " " << description << + " from UTF-" << (sizeof(typename From::value_type) * 8) << + " to UTF-" << (sizeof(typename To::value_type) * 8) << ": " << + std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << + std::endl; } - + { - // Performance test UTF-8 -> UTF-16 auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u8list) { - std::u16string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-16: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + for (const auto& i: list) + To result{boost::locale::conv::utf_to_utf(i)}; + std::cout << " -> Compare to boost::locale::conv::utf_to_utf: " << + std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << + std::endl; } { - // Performance test UTF-8 -> UTF-8 auto t0{std::chrono::steady_clock::now()}; - for (const auto& i : u8list) { - std::basic_string s{unicode::convert(i)}; - } - std::cout << "Performance test for converting 1M strings from UTF-8 to UTF-8: " << std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << std::endl; + for (const auto& i: list) + To result{std_convert(i)}; + std::cout << " -> Compare to std::wstring_convert: " << + std::chrono::duration(std::chrono::steady_clock::now() - t0).count() << "s" << + std::endl; } + // iterate over remaining To types + if constexpr (index + 1 < std::tuple_size::value) + test_random_valid(rc, length, description); +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_all_unicode, T, types_collection_type) +{ + random_context rc; + + test_random_valid(rc, rc.sequence_length(rc.gen), "All Unicode strings"); +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences_valid_ascii, T, types_collection_type) +{ + random_context rc{127}; + + test_random_valid(rc, rc.sequence_length(rc.gen), "ASCII only strings"); } // Test ISO and UTF encodings -- cgit v1.2.3