diff options
-rw-r--r-- | Makefile | 8 | ||||
-rw-r--r-- | debian/control | 10 | ||||
-rw-r--r-- | include/unicode.h | 221 | ||||
-rw-r--r-- | src/test-unicode.cpp | 72 | ||||
-rw-r--r-- | src/validate.cpp | 4 |
5 files changed, 288 insertions, 27 deletions
@@ -60,9 +60,10 @@ endif SRC=\ src/recode.cpp \ + src/validate.cpp \ src/test-unicode.cpp -all: src/recode src/test-unicode +all: src/recode src/test-unicode src/validate test: src/test-unicode src/test-unicode @@ -70,6 +71,9 @@ test: src/test-unicode src/recode: src/recode.o dep $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ +src/validate: src/validate.o dep + $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ + src/test-unicode: src/test-unicode.o dep $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ @@ -82,7 +86,7 @@ dep: $(SRC:.cpp=.d) $(CXX) $(CXXFLAGS) -c $< -o $@ clean: - -rm -f src/recode src/test-unicode + -rm -f src/recode src/test-unicode src/validate -rm -rf result -find . -name '*.o' -o -name '*.d' -o -name '*.gcno' -o -name '*.gcda' | xargs rm -f diff --git a/debian/control b/debian/control index 1572512..9d31022 100644 --- a/debian/control +++ b/debian/control @@ -19,3 +19,13 @@ Description: Unicode conversion library - Additional support for ISO-8859-15 - Tested on Debian 10, Ubuntu 2004, Ubuntu 2010 - C++17 and C++20 compatible + +Package: unicode-tools +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Homepage: http://www.reichwein.it/unicode/ +Description: Unicode conversion tools + unicode-tools is a collection of tools for Unicode file conversion: + . + - unicode-recode: Recode Unicode or ISO-8859 file + - unicode-validate: Check file for Unicode compliance diff --git a/include/unicode.h b/include/unicode.h index f31cbac..4b676bf 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -3,8 +3,10 @@ #pragma once #include <algorithm> +#include <memory> #include <stdexcept> #include <string> +#include <unordered_map> #ifdef __cpp_char8_t // char8_t available @@ -31,7 +33,7 @@ namespace unicode::detail { template<typename T> struct utf_iterator { - typedef char32_t value_type; + typedef T value_type; typedef char32_t& reference; typedef std::basic_string<T> string_type; @@ -201,6 +203,7 @@ namespace unicode::detail { return value; } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -211,13 +214,14 @@ namespace unicode::detail { template<typename T> struct utf_back_insert_iterator { + typedef T value_type; typedef std::basic_string<T> string_type; typedef utf_back_insert_iterator& reference; utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf_back_insert_iterator& operator++() + reference operator++() { return *this; } @@ -302,39 +306,220 @@ namespace unicode::detail { return *this; } + private: typename utf_back_insert_iterator::string_type& s; }; - template<typename T> - utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s) - { - return utf_back_insert_iterator<T>(s); + typedef std::unordered_map<utf8_t, char32_t> iso_map_type; + typedef std::unordered_map<char32_t, utf8_t> iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + }); + return result; } - template<typename T> - utf_iterator<T> utf_begin(const std::basic_string<T>& s) + iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + template<unicode::detail::iso_map_type& Map=iso_8859_1_map> + struct iso_iterator { + typedef char32_t value_type; + typedef char32_t& reference; + typedef std::basic_string<utf8_t>::const_iterator iterator; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + value_type operator*() + { + utf8_t value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast<value_type>(static_cast<uint8_t>(value)); + } + + private: + iterator m_it; + }; + + template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef std::basic_string<utf8_t> string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const char32_t& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast<uint32_t>(value))); + + s.push_back(static_cast<utf8_t>(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + + // Facet for convert() and ISO-8859-* + template<typename InputIt, typename OutputIt> + struct ISO_8859 + { + typedef utf8_t value_type; + + static InputIt begin(const std::basic_string<value_type>& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const std::basic_string<value_type>& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(std::basic_string<value_type>& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() and UTF-* + template<typename InputIt, typename OutputIt> + struct UTF { - return utf_iterator<T>{s.cbegin(), s.cend()}; + typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same + + static InputIt begin(const std::basic_string<value_type>& s) + { + return InputIt{s.cbegin(), s.cend()}; + } + + static InputIt end(const std::basic_string<value_type>& s) + { + return InputIt{s.cend(), s.cend()}; + } + + static OutputIt back_inserter(std::basic_string<value_type>& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() + typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; + + typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8; + typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16; + typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32; + + // From and To are facets + template<typename From, typename To> + std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s) + { + std::basic_string<typename To::value_type> result; + + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + + return result; } + // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet template<typename T> - utf_iterator<T> utf_end(const std::basic_string<T>& s) + struct Encoding { - return utf_iterator<T>{s.cend(), s.cend()}; - } + }; -} // namespace + template<> + struct Encoding<utf8_t> + { + typedef UTF_8 Facet; + }; -namespace unicode { + template<> + struct Encoding<char16_t> + { + typedef UTF_16 Facet; + }; - using namespace detail; + template<> + struct Encoding<char32_t> + { + typedef UTF_32 Facet; + }; + // From and To are from: utf8_t, char16_t and char32_t template<typename From, typename To> - std::basic_string<To> utf_to_utf(const std::basic_string<From>& s) + std::basic_string<To> convert(const std::basic_string<From>& s) { std::basic_string<To> result; - std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result)); + std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result)); return result; } @@ -343,7 +528,7 @@ namespace unicode { bool is_valid_utf(const std::basic_string<T>& s) { try { - std::for_each(utf_begin<T>(s), utf_end<T>(s), [](const T& c){}); + std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const T& c){}); } catch(...) { return false; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 3d67124..e1aa23d 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -96,13 +96,18 @@ void test_utf_to_utf(std::tuple<Ts...>& t) typedef typename std::tuple_element<i,typename std::remove_reference<decltype(t)>::type>::type From; typedef typename std::tuple_element<j,typename std::remove_reference<decltype(t)>::type>::type To; - // test - To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) }; + // test base type interface + To result { unicode::convert<typename From::value_type, typename To::value_type>(std::get<i>(t)) }; - BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); + BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); //std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; + + // test facet interface + result = unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet, typename unicode::Encoding<typename To::value_type>::Facet>(std::get<i>(t)); + BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result); + // iterate over other combinations if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) test_utf_to_utf<i + 1, j>(t); @@ -147,9 +152,18 @@ void test_utf_to_utf_failure(std::basic_string<From>& s) { typedef typename std::tuple_element<index, Collection>::type::value_type To; + // via base type + try { + (void) unicode::convert<From,To>(s); + BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (...) { + // OK + }; + + // via facet try { - unicode::utf_to_utf<From,To>(s); - BOOST_ERROR("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + (void) unicode::convert<typename unicode::Encoding<From>::Facet,typename unicode::Encoding<To>::Facet>(s); + BOOST_ERROR("Facet: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (...) { // OK }; @@ -236,14 +250,35 @@ void test_random(random_context& rc, size_t length) From r {generate_random<From>(rc, length)}; + // base type interface try { - To result{unicode::utf_to_utf<typename From::value_type,typename To::value_type>(r)}; + To result{unicode::convert<typename From::value_type,typename To::value_type>(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } } catch (const std::runtime_error&) { // OK: this is an expected exception for utf_to_utf on bad input } catch (const std::invalid_argument&) { // OK: this is an expected exception for utf_to_utf on bad input } + // facet interface + try { + To result{unicode::convert<typename unicode::Encoding<typename From::value_type>::Facet,typename unicode::Encoding<typename To::value_type>::Facet>(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } + } catch (const std::runtime_error&) { + // OK: this is an expected exception for utf_to_utf on bad input + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for utf_to_utf on bad input + } //std::cerr << "DEBUG: " << typeid(From).name() << std::endl; //std::cerr << " DEBUG2: " << typeid(To).name() << std::endl; @@ -255,8 +290,9 @@ void test_random(random_context& rc, size_t length) BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) { random_context rc; + int i{}; - // run for 1s (debug) 10s (release) + // run for 1s (debug) 10s (release) = total time for all random_sequences types! #ifdef _DEBUG const auto timeout{1.0s}; #else @@ -267,7 +303,29 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) while (!(std::chrono::steady_clock::now() > timeout_stamp)) { test_random<T,types_collection_type>(rc, rc.sequence_length(rc.gen)); + i++; } + + BOOST_CHECK_MESSAGE(i > 1, "Not enough iterations done!"); +} + +// Test ISO and UTF encodings +BOOST_AUTO_TEST_CASE(convert) +{ + BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>({})}) == std::string{}); + BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("abc")}) == std::string{"abc"}); + BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("äöü")}) == std::string{"äöü"}); + BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_1,unicode::ISO_8859_1>("\xa4")}) == std::string{"\xa4"}); // € + + BOOST_CHECK((std::string{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_15>("\xa4")}) == std::string{"\xa4"}); // € + + BOOST_CHECK_THROW(((void)std::string{unicode::convert<unicode::ISO_8859_15,unicode::ISO_8859_1>("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 + + BOOST_CHECK((unicode::convert<unicode::UTF_8,unicode::UTF_16>("abc")) == std::u16string{u"abc"}); + BOOST_CHECK((unicode::convert<unicode::UTF_32,unicode::UTF_16>(U"abc")) == std::u16string{u"abc"}); + + BOOST_CHECK((unicode::convert<utf8_t,char16_t>("abc")) == std::u16string{u"abc"}); + BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"}); } // TODO: diff --git a/src/validate.cpp b/src/validate.cpp new file mode 100644 index 0000000..8927fe4 --- /dev/null +++ b/src/validate.cpp @@ -0,0 +1,4 @@ +int main(int argc, char* argv[]) +{ + return 0; +} |