From 611601ec36a5603bc9c94cdac9a307c4bb07c929 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Sun, 31 Jan 2021 19:00:34 +0100 Subject: Add facet based interface --- Makefile | 8 +- debian/control | 10 +++ include/unicode.h | 221 ++++++++++++++++++++++++++++++++++++++++++++++----- src/test-unicode.cpp | 72 +++++++++++++++-- src/validate.cpp | 4 + 5 files changed, 288 insertions(+), 27 deletions(-) create mode 100644 src/validate.cpp diff --git a/Makefile b/Makefile index b66c17e..5d64631 100644 --- a/Makefile +++ b/Makefile @@ -60,9 +60,10 @@ endif SRC=\ src/recode.cpp \ + src/validate.cpp \ src/test-unicode.cpp -all: src/recode src/test-unicode +all: src/recode src/test-unicode src/validate test: src/test-unicode src/test-unicode @@ -70,6 +71,9 @@ test: src/test-unicode src/recode: src/recode.o dep $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ +src/validate: src/validate.o dep + $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ + src/test-unicode: src/test-unicode.o dep $(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@ @@ -82,7 +86,7 @@ dep: $(SRC:.cpp=.d) $(CXX) $(CXXFLAGS) -c $< -o $@ clean: - -rm -f src/recode src/test-unicode + -rm -f src/recode src/test-unicode src/validate -rm -rf result -find . -name '*.o' -o -name '*.d' -o -name '*.gcno' -o -name '*.gcda' | xargs rm -f diff --git a/debian/control b/debian/control index 1572512..9d31022 100644 --- a/debian/control +++ b/debian/control @@ -19,3 +19,13 @@ Description: Unicode conversion library - Additional support for ISO-8859-15 - Tested on Debian 10, Ubuntu 2004, Ubuntu 2010 - C++17 and C++20 compatible + +Package: unicode-tools +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Homepage: http://www.reichwein.it/unicode/ +Description: Unicode conversion tools + unicode-tools is a collection of tools for Unicode file conversion: + . + - unicode-recode: Recode Unicode or ISO-8859 file + - unicode-validate: Check file for Unicode compliance diff --git a/include/unicode.h b/include/unicode.h index f31cbac..4b676bf 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -3,8 +3,10 @@ #pragma once #include +#include #include #include +#include #ifdef __cpp_char8_t // char8_t available @@ -31,7 +33,7 @@ namespace unicode::detail { template struct utf_iterator { - typedef char32_t value_type; + typedef T value_type; typedef char32_t& reference; typedef std::basic_string string_type; @@ -201,6 +203,7 @@ namespace unicode::detail { return value; } + private: typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; @@ -211,13 +214,14 @@ namespace unicode::detail { template struct utf_back_insert_iterator { + typedef T value_type; typedef std::basic_string string_type; typedef utf_back_insert_iterator& reference; utf_back_insert_iterator(string_type& s): s(s) {} // no-op - utf_back_insert_iterator& operator++() + reference operator++() { return *this; } @@ -302,39 +306,220 @@ namespace unicode::detail { return *this; } + private: typename utf_back_insert_iterator::string_type& s; }; - template - utf_back_insert_iterator utf_back_inserter(std::basic_string& s) - { - return utf_back_insert_iterator(s); + typedef std::unordered_map iso_map_type; + typedef std::unordered_map iso_map_type_reverse; + + // ISO-8859-1 is lower 8-bit of Unicode, so no exceptions necessary + iso_map_type iso_8859_1_map; + + // ISO-8859-15 is lower 8-bit of Unicode, except for: + iso_map_type iso_8859_15_map { + { '\xA4', U'\u20AC' }, // € + { '\xA6', U'\u0160' }, // Š + { '\xA8', U'\u0161' }, // š + { '\xB4', U'\u017D' }, // Ž + { '\xB8', U'\u017E' }, // ž + { '\xBC', U'\u0152' }, // Œ + { '\xBD', U'\u0153' }, // œ + { '\xBE', U'\u0178' }, // Ÿ + }; + + iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { + iso_map_type_reverse result; + std::for_each(map.cbegin(), map.cend(), + [&](const iso_map_type::value_type& pair) + { + result.emplace(pair.second, pair.first); + }); + return result; } - template - utf_iterator utf_begin(const std::basic_string& s) + iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; + iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; + +} // namespace unicode::detail + +namespace unicode { + + using namespace detail; + + template + struct iso_iterator { + typedef char32_t value_type; + typedef char32_t& reference; + typedef std::basic_string::const_iterator iterator; + + iso_iterator(const iterator& it): m_it(it) {} + + // pre-increment + iso_iterator& operator++() + { + ++m_it; + return *this; + } + + bool operator!=(const iso_iterator& other) const + { + return m_it != other.m_it; + } + + // return reference? + value_type operator*() + { + utf8_t value{*m_it}; + + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) + return it->second; + } + return static_cast(static_cast(value)); + } + + private: + iterator m_it; + }; + + template + struct iso_back_insert_iterator { + typedef iso_back_insert_iterator& reference; + typedef std::basic_string string_type; + + iso_back_insert_iterator(string_type& s): s(s) {} + + // no-op + reference operator++() + { + return *this; + } + + // support *x = value, together with operator=() + reference operator*() + { + return *this; + } + + reference operator=(const char32_t& value) + { + if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed + { + auto it{Map.find(value)}; + if (it != Map.end()) { + s.push_back(it->second); + return *this; + } + } + + if (value > 255) + throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast(value))); + + s.push_back(static_cast(value)); + return *this; + } + + private: + typename iso_back_insert_iterator::string_type& s; + }; + + // Facet for convert() and ISO-8859-* + template + struct ISO_8859 + { + typedef utf8_t value_type; + + static InputIt begin(const std::basic_string& s) + { + return InputIt(s.cbegin()); + } + + static InputIt end(const std::basic_string& s) + { + return InputIt(s.cend()); + } + + static OutputIt back_inserter(std::basic_string& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() and UTF-* + template + struct UTF { - return utf_iterator{s.cbegin(), s.cend()}; + typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same + + static InputIt begin(const std::basic_string& s) + { + return InputIt{s.cbegin(), s.cend()}; + } + + static InputIt end(const std::basic_string& s) + { + return InputIt{s.cend(), s.cend()}; + } + + static OutputIt back_inserter(std::basic_string& s) + { + return OutputIt(s); + } + }; + + // Facet for convert() + typedef ISO_8859, iso_back_insert_iterator<>> ISO_8859_1; + typedef ISO_8859, iso_back_insert_iterator> ISO_8859_15; + + typedef UTF, utf_back_insert_iterator> UTF_8; + typedef UTF, utf_back_insert_iterator> UTF_16; + typedef UTF, utf_back_insert_iterator> UTF_32; + + // From and To are facets + template + std::basic_string convert(const std::basic_string& s) + { + std::basic_string result; + + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + + return result; } + // Helper to get correct Facet from char type, e.g. Encoding::Facet template - utf_iterator utf_end(const std::basic_string& s) + struct Encoding { - return utf_iterator{s.cend(), s.cend()}; - } + }; -} // namespace + template<> + struct Encoding + { + typedef UTF_8 Facet; + }; -namespace unicode { + template<> + struct Encoding + { + typedef UTF_16 Facet; + }; - using namespace detail; + template<> + struct Encoding + { + typedef UTF_32 Facet; + }; + // From and To are from: utf8_t, char16_t and char32_t template - std::basic_string utf_to_utf(const std::basic_string& s) + std::basic_string convert(const std::basic_string& s) { std::basic_string result; - std::copy(utf_begin(s), utf_end(s), utf_back_inserter(result)); + std::copy(Encoding::Facet::begin(s), Encoding::Facet::end(s), Encoding::Facet::back_inserter(result)); return result; } @@ -343,7 +528,7 @@ namespace unicode { bool is_valid_utf(const std::basic_string& s) { try { - std::for_each(utf_begin(s), utf_end(s), [](const T& c){}); + std::for_each(Encoding::Facet::begin(s), Encoding::Facet::end(s), [](const T& c){}); } catch(...) { return false; } diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 3d67124..e1aa23d 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -96,13 +96,18 @@ void test_utf_to_utf(std::tuple& t) typedef typename std::tuple_element::type>::type From; typedef typename std::tuple_element::type>::type To; - // test - To result { unicode::utf_to_utf(std::get(t)) }; + // test base type interface + To result { unicode::convert(std::get(t)) }; - BOOST_CHECK_MESSAGE(std::get(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); + BOOST_CHECK_MESSAGE(std::get(t) == result, "Base: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); //std::cout << std::to_string(std::tuple_size::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl; + + // test facet interface + result = unicode::convert::Facet, typename unicode::Encoding::Facet>(std::get(t)); + BOOST_CHECK_MESSAGE(std::get(t) == result, "Facet: From " << typeid(From).name() << "(" << i << ", " << std::get(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get(t) << "), got " << result); + // iterate over other combinations if constexpr (i + 1 < std::tuple_size::type>::value) test_utf_to_utf(t); @@ -147,9 +152,18 @@ void test_utf_to_utf_failure(std::basic_string& s) { typedef typename std::tuple_element::type::value_type To; + // via base type + try { + (void) unicode::convert(s); + BOOST_ERROR("Base: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + } catch (...) { + // OK + }; + + // via facet try { - unicode::utf_to_utf(s); - BOOST_ERROR("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); + (void) unicode::convert::Facet,typename unicode::Encoding::Facet>(s); + BOOST_ERROR("Facet: Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name()); } catch (...) { // OK }; @@ -236,14 +250,35 @@ void test_random(random_context& rc, size_t length) From r {generate_random(rc, length)}; + // base type interface try { - To result{unicode::utf_to_utf(r)}; + To result{unicode::convert(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } } catch (const std::runtime_error&) { // OK: this is an expected exception for utf_to_utf on bad input } catch (const std::invalid_argument&) { // OK: this is an expected exception for utf_to_utf on bad input } + // facet interface + try { + To result{unicode::convert::Facet,typename unicode::Encoding::Facet>(r)}; + + if (r.empty()) { + BOOST_CHECK(result.empty()); + } else { + BOOST_CHECK(!result.empty()); + } + } catch (const std::runtime_error&) { + // OK: this is an expected exception for utf_to_utf on bad input + } catch (const std::invalid_argument&) { + // OK: this is an expected exception for utf_to_utf on bad input + } //std::cerr << "DEBUG: " << typeid(From).name() << std::endl; //std::cerr << " DEBUG2: " << typeid(To).name() << std::endl; @@ -255,8 +290,9 @@ void test_random(random_context& rc, size_t length) BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) { random_context rc; + int i{}; - // run for 1s (debug) 10s (release) + // run for 1s (debug) 10s (release) = total time for all random_sequences types! #ifdef _DEBUG const auto timeout{1.0s}; #else @@ -267,7 +303,29 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(random_sequences, T, types_collection_type) while (!(std::chrono::steady_clock::now() > timeout_stamp)) { test_random(rc, rc.sequence_length(rc.gen)); + i++; } + + BOOST_CHECK_MESSAGE(i > 1, "Not enough iterations done!"); +} + +// Test ISO and UTF encodings +BOOST_AUTO_TEST_CASE(convert) +{ + BOOST_CHECK((std::string{unicode::convert({})}) == std::string{}); + BOOST_CHECK((std::string{unicode::convert("abc")}) == std::string{"abc"}); + BOOST_CHECK((std::string{unicode::convert("äöü")}) == std::string{"äöü"}); + BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € + + BOOST_CHECK((std::string{unicode::convert("\xa4")}) == std::string{"\xa4"}); // € + + BOOST_CHECK_THROW(((void)std::string{unicode::convert("\xa4")}), std::invalid_argument); // € not available in ISO-8859-1 + + BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); + BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); + + BOOST_CHECK((unicode::convert("abc")) == std::u16string{u"abc"}); + BOOST_CHECK((unicode::convert(U"abc")) == std::u16string{u"abc"}); } // TODO: diff --git a/src/validate.cpp b/src/validate.cpp new file mode 100644 index 0000000..8927fe4 --- /dev/null +++ b/src/validate.cpp @@ -0,0 +1,4 @@ +int main(int argc, char* argv[]) +{ + return 0; +} -- cgit v1.2.3