diff options
| author | Roland Reichwein <mail@reichwein.it> | 2021-01-27 22:21:04 +0100 | 
|---|---|---|
| committer | Roland Reichwein <mail@reichwein.it> | 2021-01-27 22:21:04 +0100 | 
| commit | cd4fad54c0be9fb7fca57e8e03228b8b649b5b51 (patch) | |
| tree | 6b688a27597791bfea60d533f985061f1e6f9e06 | |
| parent | fad8b697dff7c7b47f034124ea6eef25e74bd7af (diff) | |
Bugfixes, tests
| -rw-r--r-- | include/unicode.h | 40 | ||||
| -rw-r--r-- | src/test-unicode.cpp | 129 | 
2 files changed, 136 insertions, 33 deletions
| diff --git a/include/unicode.h b/include/unicode.h index a55eac3..f539e6b 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,5 +1,4 @@  // libunicode -// Copyright (C) 2021 Roland Reichwein  #pragma once @@ -7,11 +6,20 @@  #include <stdexcept>  #include <string> -#ifdef __has_cpp_attribute -#if __has_cpp_attribute(__cpp_char8_t) +#ifdef __cpp_char8_t  // char8_t available  #endif -#endif + +namespace unicode { + + // usually, char32_t, uint32_t etc. + template<typename T> + static inline bool is_valid_unicode(const T& value) + { +   return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000); + } + +}  namespace { @@ -50,6 +58,8 @@ namespace {    template<typename T1>    void calculate_value()    { +   static_assert(sizeof(T1) == 4); +     size_t remaining{remaining_code_units()};     if (!remaining) @@ -57,7 +67,7 @@ namespace {     value = get_code_unit<0>(); -   if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000)) +   if (!unicode::is_valid_unicode(value))      throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));     sequence_length = 1; @@ -88,7 +98,7 @@ namespace {    template<typename... Targs>    inline static char32_t continuation_value(T b, Targs... 
Fargs)    { -   return continuation_value(b) << 6 | continuation_value(Fargs...); +   return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);    }    template<size_t n> @@ -159,7 +169,7 @@ namespace {      if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)       throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); -    value = static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF); +    value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;      sequence_length = 2;     }    } @@ -185,7 +195,7 @@ namespace {    typename string_type::const_iterator iterator;    typename string_type::const_iterator end_iterator; -  value_type value{}; +  char32_t value{}; // always save complete unicode code point at this point    size_t sequence_length{};   }; @@ -276,8 +286,9 @@ namespace {     if (value <= 0xFFFF) { // expect value to be already valid Unicode values      s.push_back(value);     } else { -    s.push_back((value >> 10) + 0xD800); -    s.push_back((value & 0x3FF) + 0xDC00); +    char32_t value_reduced{value - 0x10000}; +    s.push_back((value_reduced >> 10) + 0xD800); +    s.push_back((value_reduced & 0x3FF) + 0xDC00);     }     return *this;    } @@ -317,14 +328,5 @@ std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)   return result;  } -//std::u8string utf16_to_utf8(const std::u16string& s) -//{ -// std::u8string result; -// -// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result)); -// -// return result; -//} -  } // namespace unicode diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 0560c1b..2cc8393 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -1,17 +1,83 @@  #define BOOST_TEST_MODULE unicode_test  #include <boost/test/included/unit_test.hpp> +#include <boost/test/data/dataset.hpp> +#include <boost/test/data/monomorphic.hpp> +#include <boost/test/data/test_case.hpp> +#include <exception>  #include 
<string>  #include <tuple>  #include <type_traits> +#include <vector>  #include <unicode.h> -std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> t { - u8"Täst", u"Täst", U"Täst" +typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type; + +// create tuple of the same string, in UTF-8, UTF-16 and UTF-32 +#define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x} + +// Success cases: convert string to all other types, respectively +std::vector<types_collection_type> success_sets { + SUCCESS_TUPLE(""), + SUCCESS_TUPLE("ASCII string1"), + SUCCESS_TUPLE("Täst just looks like German"), + SUCCESS_TUPLE("\u732b is chinese for cat"), + SUCCESS_TUPLE("\U0001F63A"), + SUCCESS_TUPLE("\U0001F63A is a smiling cat"), +}; + +// Error cases: throwing upon convert to all other types +std::vector<std::basic_string<char8_t>> failure_strings_char8_t { + u8"\x80", + u8"\x81" +}; + +std::vector<std::basic_string<char16_t>> failure_strings_char16_t { + u"\xD801", +}; + +std::vector<std::basic_string<char32_t>> failure_strings_char32_t { + U"\xD801", + U"\x10000000",  }; +// output operators must be in same namespace as the type itself +namespace std { + +std::ostream& operator<<(std::ostream& os, std::basic_string<char8_t> const& s) +{ + os << "["; + for (auto& c: s) +  os << " " << std::to_string(static_cast<uint8_t>(c)); + os << "]"; + + return os; +} + +std::ostream& operator<<(std::ostream& os, std::basic_string<char16_t> const& s) +{ + os << "["; + for (auto& c: s) +  os << " " << std::to_string(static_cast<uint16_t>(c)); + os << "]"; + + return os; +} + +std::ostream& operator<<(std::ostream& os, std::basic_string<char32_t> const& s) +{ + os << "["; + for (auto& c: s) +  os << " " << std::to_string(static_cast<uint32_t>(c)); + os << "]"; + + return os; +} + +} +  template<size_t i = 0, size_t j = 0, typename... 
Ts>  void test_utf_to_utf(std::tuple<Ts...>& t)  { @@ -21,7 +87,7 @@ void test_utf_to_utf(std::tuple<Ts...>& t)   // test   To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) }; - BOOST_CHECK(std::get<j>(t) == result); + BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);   //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; @@ -32,27 +98,62 @@ void test_utf_to_utf(std::tuple<Ts...>& t)    test_utf_to_utf<0, j + 1>(t);  } -BOOST_AUTO_TEST_CASE(utf_to_utf) +// We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign +// a new variable to each tuple element which we don't want +// https://lists.boost.org/boost-bugs/2016/05/45214.php + +BOOST_AUTO_TEST_CASE(utf_to_utf_success)  { - test_utf_to_utf(t); + for (auto& t: success_sets) +  test_utf_to_utf(t);  } -BOOST_AUTO_TEST_CASE(utf8_to_utf16) +// iterate over std::tuple T types +template<typename From, typename Collection, size_t index = 0> +void test_utf_to_utf_failure(std::basic_string<From>& s)  { - std::u8string u8{u8"ascii string1"}; -  - std::u16string u16{unicode::utf_to_utf<char8_t, char16_t>(u8)}; + typedef typename std::tuple_element<index, Collection>::type::value_type To; - BOOST_CHECK(u16 == u"ascii string1"); + try { +  unicode::utf_to_utf<From,To>(s); +  BOOST_FAIL("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (...) 
{ +  // OK + }; + + // iterate over remaining types  + if constexpr (index + 1 < std::tuple_size<Collection>::value) +  test_utf_to_utf_failure<From, Collection, index + 1>(s);  } -BOOST_AUTO_TEST_CASE(utf16_to_utf8) +BOOST_AUTO_TEST_CASE(utf_to_utf_failure)  { - std::u16string u16{u"ascii string1"}; + for (auto& s: failure_strings_char8_t) +  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s); - std::u8string u8{unicode::utf_to_utf<char16_t, char8_t>(u16)}; + for (auto& s: failure_strings_char16_t) +  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s); + + for (auto& s: failure_strings_char32_t) +  test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s); +} + +BOOST_AUTO_TEST_CASE(is_valid_unicode) +{ + BOOST_CHECK(unicode::is_valid_unicode('\0')); + BOOST_CHECK(unicode::is_valid_unicode(U'a')); + BOOST_CHECK(unicode::is_valid_unicode(U'ä')); + BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese + BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley + BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley - BOOST_CHECK(u8 == u8"ascii string1"); + BOOST_CHECK(!unicode::is_valid_unicode(0x00110000)); + BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++ + BOOST_CHECK(!unicode::is_valid_unicode(0x01234567)); + BOOST_CHECK(!unicode::is_valid_unicode(0x12345678)); + BOOST_CHECK(!unicode::is_valid_unicode(0xD800)); + BOOST_CHECK(!unicode::is_valid_unicode(0xD987)); + BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));  }  // TODO: | 
