diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-02-05 14:10:53 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-02-05 14:10:53 +0100 |
commit | 3d7a431811748c5aa1f49c35436696fc3f05de5d (patch) | |
tree | 252cb1896ebd994ee6c4f7b09b0927bd7fa709f3 | |
parent | 6a12dddc641be34b323835a495399715790811e0 (diff) |
Documentation, support validation via Traits
-rw-r--r-- | debian/README.Debian | 75 | ||||
-rw-r--r-- | include/unicode.h | 13 | ||||
-rw-r--r-- | src/test-unicode.cpp | 16 |
3 files changed, 102 insertions, 2 deletions
diff --git a/debian/README.Debian b/debian/README.Debian index 162e3f0..382d20d 100644 --- a/debian/README.Debian +++ b/debian/README.Debian @@ -4,6 +4,81 @@ unicode for Debian This package is the Debian version of unicode, a C++ library for Unicode encoding. +CLI interface (package unicode-tools) +------------------------------------- + +* unicode-recode + + Usage: recode <from-format> <from-file> <to-format> <to-file> + Format: + UTF-8 UTF-8 + UTF-16 UTF-16, native endian + UTF-16LE UTF-16, little endian + UTF-16BE UTF-16, big endian + UTF-32 UTF-32, native endian + UTF-32LE UTF-32, little endian + UTF-32BE UTF-32, big endian + ISO-8859-1 ISO-8859-1 (Latin-1) + ISO-8859-15 ISO-8859-15 (Latin-9) + Exit code: 0 if valid, 1 otherwise. + +* unicode-validate + + Usage: validate <format> <file> + Format: + UTF-8 UTF-8 + UTF-16 UTF-16, big or little endian + UTF-16LE UTF-16, little endian + UTF-16BE UTF-16, big endian + UTF-32 UTF-32, big or little endian + UTF-32LE UTF-32, little endian + UTF-32BE UTF-32, big endian + Exit code: 0 if valid, 1 otherwise. + + +C++ interface (package libunicode-dev) +-------------------------------------- + +Example: + +#include <unicode.h> +... + + std::string utf8_value {u8"äöü"}; + std::u16string utf16_value{unicode::convert<char, char16_t>(utf8_value)}; + +And for C++20: + + std::u8string utf8_value {u8"äöü"}; + std::u16string utf16_value{unicode::convert<char8_t, char16_t>(utf8_value)}; + +The following encodings are implicitly deduced from types: + * char resp. 
char8_t (C++20): UTF-8 + * char16_t: UTF-16 + * char32_t: UTF-32 + +Explicit encoding specification is also possible: + + std::string value {"äöü"}; + std::u32string utf32_value{unicode::convert<unicode::ISO_8859_1, unicode::UTF_32>(value)}; + +Supported encodings are: + + * unicode::UTF_8 + * unicode::UTF_16 + * unicode::UTF_32 + * unicode::ISO_8859_1 + * unicode::ISO_8859_15 + +Validation can be done like this: + + bool valid{unicode::is_valid_utf<char16_t>(utf16_value)}; + +Or via explicit encoding specification: + + bool valid{unicode::is_valid_utf<unicode::UTF_8>(utf8_value)}; + + Contact ------- diff --git a/include/unicode.h b/include/unicode.h index df61ac3..2424fb1 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -562,6 +562,7 @@ namespace unicode { return result; } + // basic type version template<typename T> bool is_valid_utf(const std::basic_string<T>& s) { @@ -573,5 +574,17 @@ namespace unicode { return true; } + // Facet version + template<typename Facet> + bool is_valid_utf(const std::basic_string<typename Facet::value_type>& s) + { + try { + std::for_each(Facet::begin(s), Facet::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + } // namespace unicode diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 5529d2c..692dfac 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -130,11 +130,14 @@ void test_is_valid_utf(std::tuple<Ts...>& t) { typedef typename std::tuple_element<i,typename std::remove_reference<decltype(t)>::type>::type T; - // test + // test via basic type bool result { unicode::is_valid_utf<typename T::value_type>(std::get<i>(t)) }; - BOOST_CHECK_MESSAGE(result == true, "is_valid_utf w/ " << typeid(T).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result); + // test via Facet + result = unicode::is_valid_utf<typename unicode::Encoding<typename T::value_type>::Facet>(std::get<i>(t)); + BOOST_CHECK_MESSAGE(result == true, 
"is_valid_utf w/ " << typeid(typename unicode::Encoding<typename T::value_type>::Facet).name() << "(" << i << ", " << std::get<i>(t) << "), got " << result); + // iterate over other combinations if constexpr (i + 1 < std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) test_is_valid_utf<i + 1>(t); @@ -194,6 +197,8 @@ template<typename T, typename Collection, size_t index = 0> void test_is_valid_utf_failure(std::basic_string<T>& s) { BOOST_CHECK_MESSAGE(unicode::is_valid_utf<T>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(T).name()); + + BOOST_CHECK_MESSAGE(unicode::is_valid_utf<typename unicode::Encoding<T>::Facet>(s) == false, "Expected bad UTF at index: " << index << ", " << typeid(typename unicode::Encoding<T>::Facet).name()); // iterate over remaining types if constexpr (index + 1 < std::tuple_size<Collection>::value) @@ -331,6 +336,13 @@ BOOST_AUTO_TEST_CASE(convert) BOOST_CHECK((unicode::convert<char32_t,char16_t>(U"abc")) == std::u16string{u"abc"}); } +BOOST_AUTO_TEST_CASE(is_valid_utf) +{ + BOOST_CHECK(unicode::is_valid_utf<char16_t>(u"äöü")); + + BOOST_CHECK(unicode::is_valid_utf<unicode::UTF_8>(u8"äöü")); +} + BOOST_AUTO_TEST_CASE(string_u8string) { std::string a{"\xc3\xa4"}; |