diff options
Diffstat (limited to 'src/test-unicode.cpp')
-rw-r--r-- | src/test-unicode.cpp | 87 |
1 files changed, 73 insertions, 14 deletions
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 2cc8393..2dfabef 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -5,7 +5,10 @@
 #include <boost/test/data/monomorphic.hpp>
 #include <boost/test/data/test_case.hpp>
 
+#include <chrono>
 #include <exception>
+#include <limits>
+#include <random>
 #include <string>
 #include <tuple>
 #include <type_traits>
@@ -13,6 +16,8 @@
 
 #include <unicode.h>
 
+using namespace std::chrono_literals;
+
 typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
 
 // create tuple of the same string, in UTF-8, UTF-16 and UTF-32
@@ -30,17 +35,22 @@ std::vector<types_collection_type> success_sets {
 
 // Error cases: throwing upon convert to all other types
 std::vector<std::basic_string<char8_t>> failure_strings_char8_t {
- u8"\x80",
- u8"\x81"
+ u8"\x80", // utf-8 continuation byte
+ u8"\x81", // utf-8 continuation byte
+ u8"\xc3ä", // initial byte of utf-8 "ä", followed by valid utf-8 "ä"
+ u8"\xF8\x80\x80\x80\x80", // overlong encoding (5-byte form, 0xF8 is not a valid utf-8 lead byte)
+ u8"\xF7\xBF\xBF\xBF", // valid encoding of invalid code point
 };
 
 std::vector<std::basic_string<char16_t>> failure_strings_char16_t {
- u"\xD801",
+ u"\xD801", // single high surrogate
+ u"\xDFFF", // single low surrogate
+ u"\xDFFF\xD801", // bad surrogate pair order
 };
 
 std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
- U"\xD801",
- U"\x10000000",
+ U"blabla \xD801", // invalid unicode (surrogate half)
+ U"\x10000000", // invalid unicode (number too big)
 };
 
 // output operators must be in same namespace as the type itself
@@ -156,16 +166,65 @@ BOOST_AUTO_TEST_CASE(is_valid_unicode)
  BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
 }
 
+struct random_context { // RNG state shared by the random-sequence tests
+ std::random_device rd; // OS random number engine to seed RNG (below)
+ std::mt19937 gen{rd()};
+ std::uniform_int_distribution<> sequence_length{0, 100000}; // length of sequence: 0 ... 100000 code units
+};
+
+template<typename T> // T: string type; returns a string of `length` uniformly random code units
+T generate_random(random_context& rc, size_t length)
+{
+ std::uniform_int_distribution<unsigned long> code_unit(0, std::numeric_limits<typename T::value_type>::max()); // code unit value; unsigned long because the char32_t maximum does not fit the default int
+ T result;
+ std::generate_n(std::back_inserter(result), length, [&](){return static_cast<typename T::value_type>(code_unit(rc.gen));});
+
+ return result;
+}
+
+template<typename From, typename ToTypesCollectionType, size_t i = 0> // converts one random From string to each tuple element type, starting at index i
+void test_random(random_context& rc, size_t length)
+{
+ //std::cerr << "LENGTH: " << length << std::endl;
+ typedef typename std::tuple_element<i,ToTypesCollectionType>::type To;
+
+ From r {generate_random<From>(rc, length)};
+
+ try {
+  To result{unicode::utf_to_utf<typename From::value_type,typename To::value_type>(r)};
+ } catch (const std::runtime_error&) {
+  // OK: this is an expected exception for utf_to_utf on bad input
+ } catch (const std::invalid_argument&) {
+  // OK: this is an expected exception for utf_to_utf on bad input
+ }
+
+ //std::cerr << "DEBUG: " << typeid(From).name() << std::endl;
+ //std::cerr << " DEBUG2: " << typeid(To).name() << std::endl;
+
+ // iterate over remaining To types
+ if constexpr (i + 1 < std::tuple_size<ToTypesCollectionType>::value)
+  test_random<From, ToTypesCollectionType, i + 1>(rc, length);
+}
+
+BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type)
+{
+ random_context rc;
+
+ // run for 1s (debug) 10s (release)
+#ifdef _DEBUG
+ const auto timeout{1.0s};
+#else
+ const auto timeout{10.0s};
+#endif
+
+ const auto timeout_stamp { std::chrono::steady_clock::now() + (timeout / std::tuple_size<types_collection_type>::value)}; // the time budget is divided by the number of types in types_collection_type
+
+ while (std::chrono::steady_clock::now() <= timeout_stamp) {
+  test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen));
+ }
+}
+
 // TODO:
-// UTF-8
-// invalid bytes
-// an unexpected continuation byte
-// a non-continuation byte before the end of the character
-// the string ending before the end of the character (which can happen in simple string truncation)
-// an overlong encoding
-// a sequence that decodes to an invalid code point
-//
-// high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF)
 //
 // char8_t, char16_t, char32_t, char, wchar_t (UTF-16 on Windows, UTF-32 on Linux)
 // string, vector?