diff options
-rw-r--r-- | debian/README.Debian | 21 | ||||
-rw-r--r-- | include/unicode.h | 117 | ||||
-rw-r--r-- | src/test-unicode.cpp | 83 |
3 files changed, 164 insertions, 57 deletions
diff --git a/debian/README.Debian b/debian/README.Debian index 382d20d..0a47d0a 100644 --- a/debian/README.Debian +++ b/debian/README.Debian @@ -57,6 +57,11 @@ The following encodings are implicitly deducted from types: * char16_t: UTF-16 * char32_t: UTF-32 +You can specify different container types directly: + + std::deque<char> utf8_value {...}; + std::list<wchar_t> utf16_value{unicode::convert<std::deque<char>, std::list<wchar_t>>(utf8_value)}; + Explicit encoding specification is also possible: std::string value {"äöü"}; @@ -70,6 +75,22 @@ Supported encodings are: * unicode::ISO_8859_1 * unicode::ISO_8859_15 +Supported basic types: + * char + * char8_t (C++20) + * wchar_t (UTF-16 on Windows, UTF-32 on Linux) + * char16_t + * char32_t + * uint8_t, int8_t + * uint16_t, int16_t + * uint32_t, int32_t + * basically, all basic 8-bit, 16-bit and 32-bit that can encode + UTF-8, UTF-16 and UTF-32, respectively. + +Supported container types: + * All std container types that can be iterated (vector, list, deque, array) + * Source and target containers can be different container types + Validation can be done like this: bool valid{unicode::is_valid_utf<char16_t>(utf16_value)}; diff --git a/include/unicode.h b/include/unicode.h index 171496e..6d7ef16 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -37,7 +37,7 @@ namespace unicode::detail { using namespace std::string_literals; - template<typename T> + template<typename T, typename Container=std::basic_string<T>> struct utf_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); @@ -48,7 +48,7 @@ namespace unicode::detail { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string<T> string_type; + typedef Container string_type; utf_iterator(const typename string_type::const_iterator& cbegin, const typename string_type::const_iterator& cend): iterator(cbegin), end_iterator(cend) @@ -56,18 +56,25 @@ namespace unicode::detail { calculate_value(); } - utf_iterator<T>(const utf_iterator<T>& other) = default; - utf_iterator<T>& operator=(const utf_iterator<T>& other) = default; + utf_iterator(const utf_iterator& other) = default; + utf_iterator& operator=(const utf_iterator& other) = default; - size_t remaining_code_units() + size_t remaining_code_units() const { - return end_iterator - iterator; + return std::distance(iterator, end_iterator); } template<size_t index> - T get_code_unit() + T get_code_unit() const { - return *(iterator + index); + if constexpr (std::is_same<Container, typename std::list<T>>::value) { + // std::list doesn't support it + n + auto it{iterator}; + std::advance(it, index); + return *it; + } else { + return *(iterator + index); + } } inline static bool is_continuation_byte(T b) @@ -111,20 +118,20 @@ namespace unicode::detail { if (!remaining) return; - utf8_t byte0 {get_code_unit<0>()}; + utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())}; if (byte0 & 0x80) { // 2-4 bytes if (remaining >= 2) { - utf8_t byte1 {get_code_unit<1>()}; + utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())}; if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes value = value_byte0_of<2>(byte0) | continuation_value(byte1); sequence_length = 2; } else if (remaining >= 3) { - utf8_t byte2 {get_code_unit<2>()}; + utf8_t byte2 {static_cast<utf8_t>(get_code_unit<2>())}; if (is_byte0_of<3>(byte0) && is_continuation_byte(byte1, byte2)) { // 3 bytes value = value_byte0_of<3>(byte0) | continuation_value(byte1, byte2); sequence_length = 3; } else if (remaining >= 4) { - utf8_t byte3 {get_code_unit<3>()}; + utf8_t byte3 {static_cast<utf8_t>(get_code_unit<3>())}; if (is_byte0_of<4>(byte0) && is_continuation_byte(byte1, byte2, byte3)) { // 4 bytes value = value_byte0_of<4>(byte0) | continuation_value(byte1, byte2, byte3); sequence_length = 4; @@ -154,7 +161,7 @@ namespace unicode::detail { if (!remaining) return; - char16_t unit0 {get_code_unit<0>()}; + char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())}; if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane) value = unit0; @@ -163,7 +170,7 @@ namespace unicode::detail { if (remaining < 2) throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing"); - char16_t unit1 {get_code_unit<1>()}; + char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())}; if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); @@ -179,7 +186,7 @@ namespace unicode::detail { if (!remaining) return; - value = get_code_unit<0>(); + value = static_cast<char32_t>(get_code_unit<0>()); if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); @@ -202,16 +209,16 @@ namespace unicode::detail { } // pre-increment - utf_iterator<T>& operator++() + utf_iterator& operator++() { - iterator += sequence_length; + std::advance(iterator, sequence_length); calculate_value(); return *this; } - bool operator!=(const utf_iterator<T>& other) const + bool operator!=(const utf_iterator& other) const { - return iterator != other.iterator; + return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator); } reference operator*() @@ -227,13 +234,13 @@ namespace unicode::detail { size_t sequence_length{}; }; - template<typename T> + template<typename T, typename Container=std::basic_string<T>> struct utf_back_insert_iterator { static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4); typedef T value_type; - typedef std::basic_string<T> string_type; + typedef Container string_type; typedef utf_back_insert_iterator& reference; typedef utf_back_insert_iterator* pointer; typedef size_t difference_type; @@ -378,7 +385,7 @@ namespace unicode { using namespace detail; - template<unicode::detail::iso_map_type& Map=iso_8859_1_map> + template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<utf8_t>> struct iso_iterator { typedef utf8_t input_type; typedef char32_t value_type; @@ -386,7 +393,8 @@ namespace unicode { typedef char32_t* pointer; typedef size_t difference_type; typedef std::input_iterator_tag iterator_category; - typedef std::basic_string<utf8_t>::const_iterator iterator; + typedef typename Container::const_iterator iterator; + typedef Container string_type; iso_iterator(const iterator& it): m_it(it) {} @@ -420,14 +428,14 @@ namespace unicode { iterator m_it; }; - template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse, typename Container=std::basic_string<utf8_t>> struct iso_back_insert_iterator { typedef iso_back_insert_iterator& reference; typedef iso_back_insert_iterator* pointer; typedef size_t difference_type; typedef utf8_t value_type; typedef std::output_iterator_tag iterator_category; - typedef std::basic_string<utf8_t> string_type; + typedef Container string_type; iso_back_insert_iterator(string_type& s): s(s) {} @@ -478,18 +486,19 @@ namespace unicode { struct ISO_8859 { typedef utf8_t value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string<value_type>& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt(s.cbegin()); } - static InputIt end(const std::basic_string<value_type>& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt(s.cend()); } - static OutputIt back_inserter(std::basic_string<value_type>& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -499,20 +508,20 @@ namespace unicode { template<typename InputIt, typename OutputIt> struct UTF { - typedef typename InputIt::input_type input_type; typedef typename OutputIt::value_type value_type; + typedef typename InputIt::string_type string_type; - static InputIt begin(const std::basic_string<input_type>& s) + static InputIt begin(const typename InputIt::string_type& s) { return InputIt{s.cbegin(), s.cend()}; } - static InputIt end(const std::basic_string<input_type>& s) + static InputIt end(const typename InputIt::string_type& s) { return InputIt{s.cend(), s.cend()}; } - static OutputIt back_inserter(std::basic_string<value_type>& s) + static OutputIt back_inserter(typename OutputIt::string_type& s) { return OutputIt(s); } @@ -527,10 +536,10 @@ namespace unicode { typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; // From and To are facets - template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value && std::is_empty<To>::value, bool> = true> - std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s) + template<typename From, typename To, std::enable_if_t<std::is_empty<From>::value, bool> = true> + typename To::string_type convert(const typename From::string_type& s) { - std::basic_string<typename To::value_type> result; + typename To::string_type result; std::copy(From::begin(s), From::end(s), To::back_inserter(result)); @@ -561,27 +570,29 @@ namespace unicode { typedef UTF_32 Facet; }; - // From and To are from: utf8_t, char16_t and char32_t + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t template<typename From, typename To, - std::enable_if_t<std::is_trivial<From>::value && std::is_trivial<To>::value, bool> = true - > - std::basic_string<To> convert(const std::basic_string<From>& s) + typename FromContainer=std::basic_string<From>, + typename ToContainer=std::basic_string<To>, + std::enable_if_t<std::is_trivial<From>::value && std::is_scalar<From>::value && !std::is_empty<From>::value, bool> = true> + ToContainer convert(const FromContainer& s) { typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; - std::basic_string<To> result; + ToContainer result; std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); return result; } + // From and To are containers template<typename FromContainer, typename ToContainer, std::enable_if_t<!std::is_empty<FromContainer>::value && !std::is_empty<ToContainer>::value, bool> = true > ToContainer convert(const FromContainer& s) { - typedef UTF<utf_iterator<typename FromContainer::value_type>, utf_back_insert_iterator<typename ToContainer::value_type>> UTF_Trait; + typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait; ToContainer result; @@ -590,9 +601,25 @@ namespace unicode { return result; } + // Container version + template<typename Container, std::enable_if_t<!std::is_empty<Container>::value, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + // basic type version - template<typename T> - bool is_valid_utf(const std::basic_string<T>& s) + template<typename T, + typename Container=std::basic_string<T>, + std::enable_if_t<std::is_trivial<T>::value && !std::is_empty<T>::value, bool> = true> + bool is_valid_utf(const Container& s) { typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; @@ -605,8 +632,8 @@ namespace unicode { } // Facet version - template<typename Facet> - bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s) + template<typename Facet, std::enable_if_t<std::is_empty<Facet>::value, bool> = true> + bool is_valid_utf(const typename Facet::string_type& s) { try { std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 5f5ebbf..fbd4749 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -5,9 +5,12 @@ #include <boost/test/data/monomorphic.hpp> #include <boost/test/data/test_case.hpp> +#include <array> #include <chrono> +#include <deque> #include <exception> #include <limits> +#include <list> #include <random> #include <string> #include <tuple> @@ -98,14 +101,14 @@ void test_utf_to_utf(std::tuple<Ts...>& t) // test base type interface To result { unicode::convert<typename From::value_type, typename To::value_type>(std::get<i>(t)) }; + BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(typename From::value_type).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(typename To::value_type).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); - BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); + // test container interface + result = unicode::convert<From, To>(std::get<i>(t)); + BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Container: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); - //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; - // test facet interface result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t)); - BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); // iterate over other combinations @@ -132,6 +135,10 @@ void test_is_valid_utf(std::tuple<Ts...>& t) // test via basic type bool result { unicode::is_valid_utf<typename T::value_type>(std::get<i>(t)) }; + BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(typename T::value_type).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result); + + // test via container type + result = unicode::is_valid_utf<T>(std::get<i>(t)); BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result); // test via Facet @@ -158,7 +165,17 @@ void test_utf_to_utf_failure(std::basic_string<From>& s) // via base type try { (void) unicode::convert<From,To>(s); - BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + BOOST_ERROR("Base type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for convert() on bad input + } catch (const std::exception& ex) { + BOOST_ERROR("Unexpected error on convert(): " << ex.what()); + }; + + // via container + try { + (void) unicode::convert<typename unicode::Encoding<From>::Facet::string_type, typename unicode::Encoding<To>::Facet::string_type>(s); + BOOST_ERROR("Container type: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (const std::invalid_argument&) { // OK: this is an expected exception for convert() on bad input } catch (const std::exception& ex) { @@ -198,6 +215,8 @@ void test_is_valid_utf_failure(std::basic_string<T>& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf<T>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename std::basic_string<T>>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename unicode::Encoding<T>::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding<T>::Facet).name()); // iterate over remaining types @@ -275,6 +294,21 @@ void test_random(random_context& rc, size_t length) BOOST_ERROR("Unexpected error on convert(): " << ex.what()); } + // container type interface + try { + To result{unicode::convert<From, To>(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for convert() on bad input + } catch (const std::exception& ex) { + BOOST_ERROR("Unexpected error on convert(): " << ex.what()); + } + // facet interface try { To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(r)}; @@ -331,7 +365,7 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert<unicode::UTF_8,unicode::UTF_16>("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert<unicode::UTF_32,unicode::UTF_16>(U"abc")) == std::u16string{u"abc"}); - + BOOST_CHECK((unicode::convert<utf8_t,char16_t>("abc")) == std::u16string{u"abc"}); BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"}); @@ -354,7 +388,37 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert<std::string, std::wstring>(std::string{"äöü"})) == std::wstring{L"äöü"}); - //BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{}); + BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{})) == std::vector<wchar_t>{}); + BOOST_CHECK((unicode::convert<std::vector<char>, std::vector<wchar_t>>(std::vector<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::vector<wchar_t>{L'ä', L'ö', L'ü'})); + + // deque + BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{})) == std::deque<wchar_t>{}); + BOOST_CHECK((unicode::convert<std::deque<char>, std::deque<wchar_t>>(std::deque<char>{'\xc3', '\xa4', '\xc3', '\xb6', '\xc3', '\xbc'})) == (std::deque<wchar_t>{L'ä', L'ö', L'ü'})); + + // deque with uint8_t, uint16_t + BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{})) == std::deque<uint16_t>{}); + BOOST_CHECK((unicode::convert<std::deque<uint8_t>, std::deque<uint16_t>>(std::deque<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque<uint16_t>{L'ä', L'ö', L'ü'})); + + // deque with int8_t, int16_t + BOOST_CHECK((unicode::convert<std::deque<int8_t>, std::deque<int16_t>>(std::deque<int8_t>{ + static_cast<int8_t>(0xc3), + static_cast<int8_t>(0xa4), + static_cast<int8_t>(0xc3), + static_cast<int8_t>(0xb6), + static_cast<int8_t>(0xc3), + static_cast<int8_t>(0xbc)})) == (std::deque<int16_t>{L'ä', L'ö', L'ü'})); + + // list + BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::list<uint16_t>>(std::list<uint8_t>{})) == std::list<uint16_t>{}); + BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::list<uint16_t>>(std::list<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'})); + + // list -> deque + BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::deque<uint16_t>>(std::list<uint8_t>{})) == std::deque<uint16_t>{}); + BOOST_CHECK((unicode::convert<std::list<uint8_t>, std::deque<uint16_t>>(std::list<uint8_t>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::deque<uint16_t>{L'ä', L'ö', L'ü'})); + + // array + BOOST_CHECK((unicode::convert<std::array<uint8_t, 0>, std::list<uint16_t>>(std::array<uint8_t, 0>{})) == std::list<uint16_t>{}); + BOOST_CHECK((unicode::convert<std::array<uint8_t, 6>, std::list<uint16_t>>(std::array<uint8_t, 6>{0xc3, 0xa4, 0xc3, 0xb6, 0xc3, 0xbc})) == (std::list<uint16_t>{L'ä', L'ö', L'ü'})); } BOOST_AUTO_TEST_CASE(is_valid_utf) @@ -376,8 +440,3 @@ BOOST_AUTO_TEST_CASE(string_u8string) BOOST_CHECK(a == std::string{"\xc3\xa4"}); } - -// TODO: -// -// string, vector? -// uint8_t, uint16_t, uint32_t? |