summary | refs | log | tree | commit | diff | homepage
path: root/include/unicode.h
diff options
context:
space:
mode:
author: Roland Reichwein <mail@reichwein.it>, 2021-01-31 19:00:34 +0100
committer: Roland Reichwein <mail@reichwein.it>, 2021-01-31 19:00:34 +0100
commit: 611601ec36a5603bc9c94cdac9a307c4bb07c929 (patch)
tree: 0b1c27d5958a2a3bdfe3c421a27f6ab528fbc3e1 /include/unicode.h
parent: 2ef9f51df48b14556e236d14213233e1bd7f829a (diff)
Add facet based interface
Diffstat (limited to 'include/unicode.h')
-rw-r--r-- include/unicode.h | 221
1 files changed, 203 insertions, 18 deletions
diff --git a/include/unicode.h b/include/unicode.h
index f31cbac..4b676bf 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -3,8 +3,10 @@
#pragma once
#include <algorithm>
+#include <memory>
#include <stdexcept>
#include <string>
+#include <unordered_map>
#ifdef __cpp_char8_t
// char8_t available
@@ -31,7 +33,7 @@ namespace unicode::detail {
template<typename T>
struct utf_iterator
{
- typedef char32_t value_type;
+ typedef T value_type;
typedef char32_t& reference;
typedef std::basic_string<T> string_type;
@@ -201,6 +203,7 @@ namespace unicode::detail {
return value;
}
+ private:
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
@@ -211,13 +214,14 @@ namespace unicode::detail {
template<typename T>
struct utf_back_insert_iterator
{
+ typedef T value_type;
typedef std::basic_string<T> string_type;
typedef utf_back_insert_iterator& reference;
utf_back_insert_iterator(string_type& s): s(s) {}
// no-op
- utf_back_insert_iterator& operator++()
+ reference operator++()
{
return *this;
}
@@ -302,39 +306,220 @@ namespace unicode::detail {
return *this;
}
+ private:
typename utf_back_insert_iterator::string_type& s;
};
- template<typename T>
- utf_back_insert_iterator<T> utf_back_inserter(std::basic_string<T>& s)
- {
- return utf_back_insert_iterator<T>(s);
+ typedef std::unordered_map<utf8_t, char32_t> iso_map_type; // ISO-8859-* byte -> Unicode code point; holds only the bytes that deviate from the identity mapping
+ typedef std::unordered_map<char32_t, utf8_t> iso_map_type_reverse; // opposite direction: Unicode code point -> ISO-8859-* byte
+
+ // ISO-8859-1 is the lower 8 bits of Unicode, so no exceptions are necessary and the map is left default-constructed (empty)
+ iso_map_type iso_8859_1_map; // NOTE(review): non-inline variable defined at namespace scope in a header (same for the other maps here) - multiple-definition risk if included from more than one translation unit; consider 'inline'
+
+ // ISO-8859-15 is the lower 8 bits of Unicode, except for the following bytes:
+ iso_map_type iso_8859_15_map {
+ { '\xA4', U'\u20AC' }, // €
+ { '\xA6', U'\u0160' }, // Š
+ { '\xA8', U'\u0161' }, // š
+ { '\xB4', U'\u017D' }, // Ž
+ { '\xB8', U'\u017E' }, // ž
+ { '\xBC', U'\u0152' }, // Œ
+ { '\xBD', U'\u0153' }, // œ
+ { '\xBE', U'\u0178' }, // Ÿ
+ };
+
+ iso_map_type_reverse reverse_iso_map(const iso_map_type& map) { // builds the inverse table: every (byte -> code point) entry becomes (code point -> byte). NOTE(review): non-inline function defined in a header - confirm it is only included from one translation unit, or mark it 'inline'
+ iso_map_type_reverse result;
+ std::for_each(map.cbegin(), map.cend(),
+ [&](const iso_map_type::value_type& pair) // pair.first is the byte, pair.second the code point
+ {
+ result.emplace(pair.second, pair.first); // emplace does not overwrite: if two bytes shared a code point, the first one inserted would win
+ });
+ return result;
}
- template<typename T>
- utf_iterator<T> utf_begin(const std::basic_string<T>& s)
+ iso_map_type_reverse iso_8859_15_map_reverse { reverse_iso_map(iso_8859_15_map) }; // code point -> byte table for ISO-8859-15, derived from the forward table defined earlier in this header
+ iso_map_type_reverse iso_8859_1_map_reverse { reverse_iso_map(iso_8859_1_map) }; // reverse of the ISO-8859-1 table; empty because the forward table has no entries
+
+} // namespace unicode::detail
+
+namespace unicode {
+
+ using namespace detail;
+
+ template<unicode::detail::iso_map_type& Map=iso_8859_1_map> // Map: exception table of the ISO-8859-* variant being decoded; defaults to the ISO-8859-1 table
+ struct iso_iterator { // read-side iterator: walks a byte string and yields one Unicode code point per byte
+ typedef char32_t value_type;
+ typedef char32_t& reference; // NOTE(review): operator*() below returns value_type by value, not this reference type
+ typedef std::basic_string<utf8_t>::const_iterator iterator;
+
+ iso_iterator(const iterator& it): m_it(it) {} // wraps a position in the source string
+
+ // pre-increment: step to the next input byte
+ iso_iterator& operator++()
+ {
+ ++m_it;
+ return *this;
+ }
+
+ bool operator!=(const iso_iterator& other) const // compares the wrapped string positions only
+ {
+ return m_it != other.m_it;
+ }
+
+ // return reference?
+ value_type operator*() // decodes the byte at the current position to its code point
+ {
+ utf8_t value{*m_it};
+
+ if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
+ {
+ auto it{Map.find(value)};
+ if (it != Map.end())
+ return it->second; // byte has an entry in Map: use the mapped code point
+ }
+ return static_cast<value_type>(static_cast<uint8_t>(value)); // otherwise identity mapping; the uint8_t cast widens the byte as an unsigned value 0..255
+ }
+
+ private:
+ iterator m_it;
+ };
+
+ template<unicode::detail::iso_map_type_reverse& Map=iso_8859_1_map_reverse> // Map: code point -> byte exception table of the target encoding; defaults to the ISO-8859-1 table
+ struct iso_back_insert_iterator { // write-side iterator: each assigned code point is encoded as one byte and appended to a string
+ typedef iso_back_insert_iterator& reference;
+ typedef std::basic_string<utf8_t> string_type;
+
+ iso_back_insert_iterator(string_type& s): s(s) {} // keeps a reference to the destination string; it must outlive this iterator
+
+ // no-op: appending happens in operator=(), so there is no position to advance
+ reference operator++()
+ {
+ return *this;
+ }
+
+ // support *x = value, together with operator=()
+ reference operator*()
+ {
+ return *this;
+ }
+
+ reference operator=(const char32_t& value) // appends the byte for 'value'; throws std::invalid_argument if value > 255 and Map has no entry for it
+ {
+ if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map_reverse)) // mapping of 128 <= x <= 255 needed
+ {
+ auto it{Map.find(value)};
+ if (it != Map.end()) {
+ s.push_back(it->second); // code point has an entry in Map: emit the mapped byte
+ return *this;
+ }
+ }
+
+ if (value > 255) // NOTE(review): a code point <= 255 whose byte is remapped in the target encoding (e.g. U+00A4 with the ISO-8859-15 table) is not rejected here and is written through unchanged - confirm this is intended
+ throw std::invalid_argument("Bad Unicode value above 255: "s + std::to_string(static_cast<uint32_t>(value))); // NOTE(review): the s-suffix needs std::string_literals in scope; the using-directive is not visible in this view
+
+ s.push_back(static_cast<utf8_t>(value)); // identity mapping for the remaining code points 0..255
+ return *this;
+ }
+
+ private:
+ typename iso_back_insert_iterator::string_type& s;
+ };
+
+ // Facet for convert() and ISO-8859-*: bundles the string element type with factories for the read and write iterators
+ template<typename InputIt, typename OutputIt> // InputIt is built from a string const_iterator, OutputIt from a string reference
+ struct ISO_8859
+ {
+ typedef utf8_t value_type; // element type of strings in this encoding
+
+ static InputIt begin(const std::basic_string<value_type>& s) // read iterator at the first byte of s
+ {
+ return InputIt(s.cbegin());
+ }
+
+ static InputIt end(const std::basic_string<value_type>& s) // read iterator one past the last byte of s
+ {
+ return InputIt(s.cend());
+ }
+
+ static OutputIt back_inserter(std::basic_string<value_type>& s) // write iterator that appends to s
+ {
+ return OutputIt(s);
+ }
+ };
+
+ // Facet for convert() and UTF-*: same interface as ISO_8859, but the input iterator is built from a (position, end) pair
+ template<typename InputIt, typename OutputIt>
+ struct UTF
{
- return utf_iterator<T>{s.cbegin(), s.cend()};
+ typedef typename InputIt::value_type value_type; // OutputIt::value_type is the same
+
+ static InputIt begin(const std::basic_string<value_type>& s) // read iterator at the start of s; it also receives s.cend()
+ {
+ return InputIt{s.cbegin(), s.cend()};
+ }
+
+ static InputIt end(const std::basic_string<value_type>& s) // end sentinel: position and end are both s.cend()
+ {
+ return InputIt{s.cend(), s.cend()};
+ }
+
+ static OutputIt back_inserter(std::basic_string<value_type>& s) // write iterator that appends to s
+ {
+ return OutputIt(s);
+ }
+ };
+
+ // Facets for convert(): one named type per supported encoding
+ typedef ISO_8859<iso_iterator<>, iso_back_insert_iterator<>> ISO_8859_1; // default template arguments select the ISO-8859-1 tables
+ typedef ISO_8859<iso_iterator<iso_8859_15_map>, iso_back_insert_iterator<iso_8859_15_map_reverse>> ISO_8859_15; // forward table for reading, reverse table for writing
+
+ typedef UTF<utf_iterator<utf8_t>, utf_back_insert_iterator<utf8_t>> UTF_8;
+ typedef UTF<utf_iterator<char16_t>, utf_back_insert_iterator<char16_t>> UTF_16;
+ typedef UTF<utf_iterator<char32_t>, utf_back_insert_iterator<char32_t>> UTF_32;
+
+ // From and To are facets (e.g. UTF_8, ISO_8859_15): converts s from From's encoding into a new string in To's encoding
+ template<typename From, typename To>
+ std::basic_string<typename To::value_type> convert(const std::basic_string<typename From::value_type>& s)
+ {
+ std::basic_string<typename To::value_type> result;
+
+ std::copy(From::begin(s), From::end(s), To::back_inserter(result)); // From's iterator is dereferenced per element and the value is assigned through To's appending iterator
+
+ return result;
}
+ // Helper to get correct Facet from char type, e.g. Encoding<typename decltype(s)::value_type>::Facet
template<typename T>
- utf_iterator<T> utf_end(const std::basic_string<T>& s)
+ struct Encoding // primary template has no Facet member, so Encoding<T>::Facet only exists for the specializations below
{
- return utf_iterator<T>{s.cend(), s.cend()};
- }
+ };
-} // namespace
+ template<>
+ struct Encoding<utf8_t> // utf8_t strings are treated as UTF-8
+ {
+ typedef UTF_8 Facet;
+ };
-namespace unicode {
+ template<>
+ struct Encoding<char16_t> // char16_t strings are treated as UTF-16
+ {
+ typedef UTF_16 Facet;
+ };
- using namespace detail;
+ template<>
+ struct Encoding<char32_t> // char32_t strings are treated as UTF-32
+ {
+ typedef UTF_32 Facet;
+ };
+ // From and To are from: utf8_t, char16_t and char32_t; the matching UTF facet is looked up through Encoding<>
template<typename From, typename To>
- std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
+ std::basic_string<To> convert(const std::basic_string<From>& s) // replaces the removed utf_to_utf() with the same template parameters and signature shape
{
std::basic_string<To> result;
- std::copy(utf_begin<From>(s), utf_end<From>(s), utf_back_inserter<To>(result));
+ std::copy(Encoding<From>::Facet::begin(s), Encoding<From>::Facet::end(s), Encoding<To>::Facet::back_inserter(result)); // read with From's facet, append with To's facet
return result;
}
@@ -343,7 +528,7 @@ namespace unicode {
bool is_valid_utf(const std::basic_string<T>& s)
{
try {
- std::for_each(utf_begin<T>(s), utf_end<T>(s), [](const T& c){});
+ std::for_each(Encoding<T>::Facet::begin(s), Encoding<T>::Facet::end(s), [](const T& c){});
} catch(...) {
return false;
}