summaryrefslogtreecommitdiffhomepage
path: root/include/unicode.h
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2021-12-25 14:38:46 +0100
committerRoland Reichwein <mail@reichwein.it>2021-12-25 14:38:46 +0100
commit79dc9edc72c5b9fefb129fe36029d4781b1e969c (patch)
tree9e5ff95ef84ab089c652935ae8f94758318b6dbc /include/unicode.h
parent98f9132997353bb3e750e8e2db99ebd474a8dbb6 (diff)
Generalized type usage and optimizations
Diffstat (limited to 'include/unicode.h')
-rw-r--r--include/unicode.h155
1 files changed, 102 insertions, 53 deletions
diff --git a/include/unicode.h b/include/unicode.h
index 8dedb19..c2d727a 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -45,8 +45,8 @@ namespace unicode::detail {
{
static_assert(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4);
- typedef T input_type;
- typedef char32_t value_type;
+ typedef T value_type;
+ typedef char32_t internal_type;
typedef char32_t& reference;
typedef char32_t* pointer;
typedef size_t difference_type;
@@ -67,9 +67,9 @@ namespace unicode::detail {
}
template<size_t index>
- T get_code_unit() const noexcept
+ value_type get_code_unit() const noexcept
{
- if constexpr (std::is_same<Container, typename std::list<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::list<value_type>>::value) {
// std::list doesn't support it + n
auto it{iterator};
std::advance(it, index);
@@ -79,46 +79,46 @@ namespace unicode::detail {
}
}
- inline static bool is_continuation_byte(T b) noexcept
+ inline static bool is_continuation_byte(value_type b) noexcept
{
return (b & 0b11000000) == 0b10000000;
}
template<typename... Targs>
- inline static bool is_continuation_byte(T b, Targs... Fargs) noexcept
+ inline static bool is_continuation_byte(value_type b, Targs... Fargs) noexcept
{
return is_continuation_byte(b) && is_continuation_byte(Fargs...);
}
template<size_t n>
- inline static bool is_byte0_of(T b) noexcept
+ inline static bool is_byte0_of(value_type b) noexcept
{
- return (b & static_cast<T>(0xFF << (7 - n))) == static_cast<T>(0xFF << (8 - n));
+ return (b & static_cast<value_type>(0xFF << (7 - n))) == static_cast<value_type>(0xFF << (8 - n));
}
- inline static char32_t continuation_value(T b) noexcept
+ inline static internal_type continuation_value(value_type b) noexcept
{
- return static_cast<char32_t>(b & 0b00111111);
+ return static_cast<internal_type>(b & 0b00111111);
}
template<typename... Targs>
- inline static char32_t continuation_value(T b, Targs... Fargs) noexcept
+ inline static internal_type continuation_value(value_type b, Targs... Fargs) noexcept
{
return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
}
template<size_t n>
- inline static char32_t value_byte0_of(T b) noexcept
+ inline static internal_type value_byte0_of(value_type b) noexcept
{
return static_cast<char32_t>(b & (0b1111111 >> n)) << ((n - 1) * 6);
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ inline internal_type calculate_value()
{
utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
if (byte0 & 0x80) { // 2-4 bytes
- value_type value{};
+ internal_type value{};
if (size_t remaining{remaining_code_units()}; remaining >= 2) {
utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
@@ -154,8 +154,8 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ inline internal_type calculate_value()
{
char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
@@ -175,10 +175,10 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
- inline value_type calculate_value()
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ inline internal_type calculate_value()
{
- value_type result {static_cast<char32_t>(get_code_unit<0>())};
+ internal_type result {static_cast<internal_type>(get_code_unit<0>())};
if (!unicode::is_valid_unicode(result))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(result)));
@@ -199,7 +199,7 @@ namespace unicode::detail {
return std::distance(iterator, end_iterator) != std::distance(other.iterator, other.end_iterator);
}
- value_type operator*()
+ internal_type operator*()
{
return calculate_value();
}
@@ -256,14 +256,14 @@ namespace unicode::detail {
// n is number of UTF-8 bytes in sequence
template<size_t n>
- inline static T byte0_of(char32_t value)
+ inline static value_type byte0_of(char32_t value)
{
return (value >> 6 * (n - 1)) | (0xFF << (8 - n));
}
// n is index of 6-bit groups, counting from bit 0
template<size_t n>
- inline static T trailing_byte(char32_t value)
+ inline static value_type trailing_byte(char32_t value)
{
return ((value >> n * 6) & 0b111111) | 0b10000000;
}
@@ -271,7 +271,7 @@ namespace unicode::detail {
// calculate UTF-8 sequence byte for m >= 2 bytes sequences (i.e. non-ASCII)
// assume value to be valid Unicode value for given byte position
template<size_t n, size_t m>
- inline static T byte_n_of_m(char32_t value)
+ inline static value_type byte_n_of_m(char32_t value)
{
if constexpr (n == 0)
return byte0_of<m>(value);
@@ -282,7 +282,7 @@ namespace unicode::detail {
template<typename Arg>
inline void append(Arg&& arg)
{
- if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
s.append({arg});
} else {
s.emplace_back(arg);
@@ -292,7 +292,7 @@ namespace unicode::detail {
template<typename Arg, typename... Args>
inline void append(Arg&& arg, Args&&... args)
{
- if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+ if constexpr (std::is_same<Container, typename std::basic_string<value_type>>::value) {
s.append({arg, args...});
} else {
s.emplace_back(arg);
@@ -300,7 +300,7 @@ namespace unicode::detail {
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
inline void append_utf(const char32_t& value)
{
if (value < 0x80) { // 1 byte
@@ -315,18 +315,18 @@ namespace unicode::detail {
throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
inline void append_utf(const char32_t& value)
{
if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
append(static_cast<value_type>(value));
} else {
char32_t value_reduced{value - 0x10000};
- append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
+ append(static_cast<value_type>((value_reduced >> 10) + 0xD800), static_cast<value_type>((value_reduced & 0x3FF) + 0xDC00));
}
}
- template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
+ template<class X = value_type, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
inline void append_utf(const char32_t& value)
{
// expect value to be already valid Unicode values (checked in input iterator)
@@ -382,8 +382,8 @@ namespace unicode {
template<unicode::detail::iso_map_type& Map=iso_8859_1_map, typename Container=std::basic_string<iso_t>>
struct iso_iterator {
- typedef iso_t input_type;
- typedef char32_t value_type;
+ typedef iso_t value_type;
+ typedef char32_t internal_type;
typedef char32_t& reference;
typedef char32_t* pointer;
typedef size_t difference_type;
@@ -406,9 +406,9 @@ namespace unicode {
}
// return reference?
- value_type operator*() const
+ internal_type operator*() const
{
- input_type value{*m_it};
+ value_type value{*m_it};
if constexpr(std::addressof(Map) != std::addressof(iso_8859_1_map)) // mapping of 128 <= x <= 255 needed
{
@@ -416,7 +416,7 @@ namespace unicode {
if (it != Map.end())
return it->second;
}
- return static_cast<value_type>(static_cast<uint8_t>(value));
+ return static_cast<internal_type>(static_cast<uint8_t>(value));
}
iso_iterator& operator+=(size_t distance)
@@ -554,28 +554,61 @@ namespace unicode {
template<> struct ConvertInputOptimizer<1>
{
static const uint32_t ascii_mask { 0x80808080 };
+ // 00112233
+ // 00112222
+ // 00111122
+ // 00111111
+ // 00001122
+ // 00001111
+ // 00000011
};
- template<int value_size>
- struct ConvertOutputOptimizer {};
+ template<> struct ConvertInputOptimizer<2>
+ {
+ static const uint32_t ascii_mask { 0xFF80FF80 };
+ };
+
+ template<> struct ConvertInputOptimizer<4>
+ {
+ static const uint32_t ascii_mask { 0xFFFFFF80 };
+ };
+
+ template<int AccuSize, class ConvertInputOptimizer>
+ struct ArchitectureOptimizer {};
- template<> struct ConvertOutputOptimizer<1>
+ template<class ConvertInputOptimizer>
+ struct ArchitectureOptimizer<4, ConvertInputOptimizer>
{
- template<typename input_value_type, class output_string_type, int code_units>
+ typedef ConvertInputOptimizer input_optimizer;
+ typedef uint32_t accu_type;
+ static const size_t accu_size {4};
+ static const accu_type addr_mask {accu_size - 1};
+ static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask };
+ static const accu_type ascii_value { 0ULL };
+
+ template<typename input_value_type, class output_string_type>
inline static void append(const input_value_type* addr, output_string_type& s)
{
- s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), code_units);
+ if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {
+ s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type));
+ } else if constexpr(sizeof(input_value_type) == 1) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3])});
+ } else if constexpr(sizeof(input_value_type) == 2) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1])});
+ } else if constexpr(sizeof(input_value_type) == 4) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0])});
+ }
}
};
-
- template<int AccuSize, class ConvertInputOptimizer, class ConvertOutputOptimizer>
- struct ArchitectureOptimizer {};
- template<class ConvertInputOptimizer, class ConvertOutputOptimizer>
- struct ArchitectureOptimizer<8, ConvertInputOptimizer, ConvertOutputOptimizer>
+ template<class ConvertInputOptimizer>
+ struct ArchitectureOptimizer<8, ConvertInputOptimizer>
{
typedef ConvertInputOptimizer input_optimizer;
- typedef ConvertOutputOptimizer output_optimizer;
typedef uint64_t accu_type;
static const size_t accu_size {8};
static const accu_type addr_mask {accu_size - 1};
@@ -585,7 +618,26 @@ namespace unicode {
template<typename input_value_type, class output_string_type>
inline static void append(const input_value_type* addr, output_string_type& s)
{
- output_optimizer::template append<input_value_type, output_string_type, accu_size>(addr, s);
+ if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) {
+ s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type));
+ } else if constexpr(sizeof(input_value_type) == 1) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3]),
+ static_cast<typename output_string_type::value_type>(addr[4]),
+ static_cast<typename output_string_type::value_type>(addr[5]),
+ static_cast<typename output_string_type::value_type>(addr[6]),
+ static_cast<typename output_string_type::value_type>(addr[7])});
+ } else if constexpr(sizeof(input_value_type) == 2) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1]),
+ static_cast<typename output_string_type::value_type>(addr[2]),
+ static_cast<typename output_string_type::value_type>(addr[3])});
+ } else if constexpr(sizeof(input_value_type) == 4) {
+ s.append({static_cast<typename output_string_type::value_type>(addr[0]),
+ static_cast<typename output_string_type::value_type>(addr[1])});
+ }
}
};
@@ -595,12 +647,9 @@ namespace unicode {
{
typename To::string_type result;
- if constexpr(sizeof(typename From::string_type::value_type) == 1 &&
- sizeof(typename To::value_type) == 1 &&
- sizeof(size_t) >= 8) {
+ if constexpr(sizeof(size_t) == 4 || sizeof(size_t) == 8) {
typedef ConvertInputOptimizer<sizeof(typename From::string_type::value_type)> input_optimizer;
- typedef ConvertOutputOptimizer<sizeof(typename To::value_type)> output_optimizer;
- typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer, output_optimizer> arch_optimizer;
+ typedef ArchitectureOptimizer<sizeof(size_t), input_optimizer> arch_optimizer;
auto begin{From::begin(s)};
auto end{From::end(s)};
@@ -612,7 +661,7 @@ namespace unicode {
typename arch_optimizer::accu_type data{*addr};
if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) {
arch_optimizer::template append<typename From::string_type::value_type, typename To::string_type>(reinterpret_cast<const typename From::string_type::value_type*>(addr), result);
- begin += arch_optimizer::accu_size;
+ begin += arch_optimizer::accu_size / sizeof(typename From::string_type::value_type);
++addr;
} else {
// just advance one code unit for now