diff options
| -rw-r--r-- | Makefile | 3 | ||||
| -rw-r--r-- | README.txt | 9 | ||||
| -rw-r--r-- | include/unicode.h | 402 | ||||
| -rw-r--r-- | include/unicode/conversion.h | 113 | ||||
| -rw-r--r-- | include/unicode/endian.h | 8 | ||||
| -rw-r--r-- | include/unicode/iso.h | 13 | ||||
| -rw-r--r-- | include/unicode/optimization.h | 325 | ||||
| -rw-r--r-- | include/unicode/predicate.h | 4 | ||||
| -rw-r--r-- | include/unicode/type_traits.h | 8 | ||||
| -rw-r--r-- | include/unicode/types.h | 13 | ||||
| -rw-r--r-- | include/unicode/utf.h | 10 | ||||
| -rw-r--r-- | include/unicode/utf_fwd.h | 9 | ||||
| -rw-r--r-- | include/unicode/validation.h | 78 | 
13 files changed, 593 insertions, 402 deletions
| @@ -153,13 +153,16 @@ DISTFILES= \  	   Makefile \  	   README.txt \  	   include/unicode.h \ +	   include/unicode/conversion.h \  	   include/unicode/endian.h \  	   include/unicode/iso.h \ +	   include/unicode/optimization.h \  	   include/unicode/predicate.h \  	   include/unicode/types.h \  	   include/unicode/type_traits.h \  	   include/unicode/utf.h \  	   include/unicode/utf_fwd.h \ +	   include/unicode/validation.h \             debian/control \             debian/compat \             debian/copyright \ @@ -5,6 +5,15 @@ This software package contains a C++ library for Unicode encoding conversion  and command line tools which apply those functions in example runtime programs:  recode and validate. +Properties +---------- + +* Supports C++17 and C++20 +* Locale independent validation and conversion +* Supports UTF-8, UTF-16, UTF-32, ISO-8859-1 and ISO-8859-15 +* Supports Linux and Windows +* Supports current compilers (clang++-11, clang++-13, g++-11, msvc-19.28.29337) +  C++ interface (package libunicode-dev)  -------------------------------------- diff --git a/include/unicode.h b/include/unicode.h index d033f63..6102a21 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,5 +1,7 @@  // libunicode  // +// Reichwein.IT Unicode Library +//  // Author: Roland Reichwein <mail@reichwein.it>  //  // Available under the conditions of CC0 1.0 Universal @@ -10,407 +12,13 @@  #pragma once +#include "unicode/conversion.h"  #include "unicode/endian.h"  #include "unicode/iso.h" +#include "unicode/optimization.h"  #include "unicode/predicate.h"  #include "unicode/types.h"  #include "unicode/type_traits.h"  #include "unicode/utf.h" - -#include <algorithm> -#include <array> -#include <cstdint> -#include <iterator> -#include <memory> -#include <stdexcept> -#include <string> -#include <type_traits> -#include <utility> - -namespace unicode { - - // Helper function: Item distance of specified iterators - // std::distance doesn't work here: it is based on "output" 
distance of iterators - template<class Iterator> - inline size_t input_distance(const Iterator& it1, const Iterator& it2) - { -  return it2 - it1; - } -  - template<class Iterator> - inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2) - { -  return input_distance(it1, it2) * sizeof(typename Iterator::value_type); - } - - // Optimizations following: - static const size_t accu_size {sizeof(size_t)}; - - template<int value_size> - struct ConvertInputOptimizer {}; - - template<> struct ConvertInputOptimizer<1> - { -  static const uint32_t ascii_mask { 0x80808080 }; - }; -  - template<> struct ConvertInputOptimizer<2> - { -  static const uint32_t ascii_mask { 0xFF80FF80 }; - }; -  - template<> struct ConvertInputOptimizer<4> - { -  static const uint32_t ascii_mask { 0xFFFFFF80 }; - }; - - template<int AccuSize, class ConvertInputOptimizer> - struct ArchitectureOptimizer {}; - - template<class ConvertInputOptimizer> - struct ArchitectureOptimizer<4, ConvertInputOptimizer> - { -  typedef ConvertInputOptimizer input_optimizer; -  typedef uint32_t accu_type; -  static const accu_type addr_mask {accu_size - 1}; -  static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; -  static const accu_type ascii_value { 0ULL }; -   -  template<typename input_value_type, class output_string_type> -  inline static void append(const input_value_type* addr, output_string_type& s) -  { -   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { -    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); -   } else if constexpr(is_utf_8_v<input_value_type>) { -    s.append({static_cast<typename output_string_type::value_type>(addr[0]), -              static_cast<typename output_string_type::value_type>(addr[1]), -              static_cast<typename output_string_type::value_type>(addr[2]), -              static_cast<typename 
output_string_type::value_type>(addr[3])}); -   } else if constexpr(is_utf_16_v<input_value_type>) { -    s.append({static_cast<typename output_string_type::value_type>(addr[0]), -              static_cast<typename output_string_type::value_type>(addr[1])}); -   } else if constexpr(is_utf_32_v<input_value_type>) { -    s.append({static_cast<typename output_string_type::value_type>(addr[0])}); -   } -  } - }; - - template<class ConvertInputOptimizer> - struct ArchitectureOptimizer<8, ConvertInputOptimizer> - { -  typedef ConvertInputOptimizer input_optimizer; -  typedef uint64_t accu_type; -  static const accu_type addr_mask {accu_size - 1}; -  static const accu_type ascii_mask { ((accu_type)input_optimizer::ascii_mask) << 32 | (accu_type)input_optimizer::ascii_mask }; -  static const accu_type ascii_value { 0ULL }; -   -  template<typename input_value_type, class output_string_type> -  inline static void append(const input_value_type* addr, output_string_type& s) -  { -   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { -    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); -   } else if constexpr(is_utf_8_v<input_value_type>) { -    s.append({static_cast<typename output_string_type::value_type>(addr[0]), -              static_cast<typename output_string_type::value_type>(addr[1]), -              static_cast<typename output_string_type::value_type>(addr[2]), -              static_cast<typename output_string_type::value_type>(addr[3]), -              static_cast<typename output_string_type::value_type>(addr[4]), -              static_cast<typename output_string_type::value_type>(addr[5]), -              static_cast<typename output_string_type::value_type>(addr[6]), -              static_cast<typename output_string_type::value_type>(addr[7])}); -   } else if constexpr(is_utf_16_v<input_value_type>) { -    s.append({static_cast<typename 
output_string_type::value_type>(addr[0]), -              static_cast<typename output_string_type::value_type>(addr[1]), -              static_cast<typename output_string_type::value_type>(addr[2]), -              static_cast<typename output_string_type::value_type>(addr[3])}); -   } else if constexpr(is_utf_32_v<input_value_type>) { -    s.append({static_cast<typename output_string_type::value_type>(addr[0]), -              static_cast<typename output_string_type::value_type>(addr[1])}); -   } -  } - - }; // class ArchitectureOptimizer - - // Optimize for the case of all ASCII (7-bit) data in a accu size row - // From and To are Encodings - template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> - typename To::string_type convert_optimized(const typename From::string_type& s) - { -  typename To::string_type result; -  typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; -  typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; - -  auto begin{From::begin(s)}; -  auto end{From::end(s)}; -  auto back_inserter{To::back_inserter(result)}; -  auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; -  while (input_distance_bytes(begin, end) >= accu_size) { -   if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { -    while (input_distance_bytes(begin, end) >= accu_size) { -     typename arch_optimizer::accu_type data{*addr}; -     if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) -#if __cplusplus >= 202002L -     [[likely]] -#endif -     { -      arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); -      begin += accu_size / sizeof(typename From::value_type); -      ++addr; -     } else { -      // just advance one code unit for now and break to trigger unoptimized -      // version until next accu boundary -      
back_inserter = *begin; -      ++begin; -      break; -     } -    } -   } - -   // keep up after unaligned Non-ASCII code points -   while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { -    back_inserter = *begin; -    ++begin; -   } -  } - -  // remainder < 8 bytes    -  while (begin != end) { -   back_inserter = *begin; -   ++begin; -  } - -  return result; - } - - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_8_v<To>, bool> = true> - inline void append_utf(std::basic_string<To>& result, const char32_t& value) - { -  using From = char32_t; -  if (bits_to_compare <= 7 || value < 0x80) { // 1 byte -   result.push_back(static_cast<To>(value)); -  } else if (bits_to_compare <= 11 || value < 0x800) { // 2 bytes -   result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)}); -  } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes -   result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)}); -  } else { // 4 bytes -   // expect value to be already valid Unicode values -   result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)}); -  } - } - - template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_16_v<To>, bool> = true> - inline void append_utf(std::basic_string<To>& result, const char32_t& value) - { -  if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values -   result.push_back(static_cast<To>(value)); -  } else { -   char32_t value_reduced{value - 0x10000}; -   result.append({static_cast<To>((value_reduced >> 10) + 0xD800), static_cast<To>((value_reduced & 0x3FF) + 0xDC00)}); -  } - } - - 
template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_32_v<To>, bool> = true> - inline void append_utf(std::basic_string<To>& result, const char32_t& value) - { -  // expect value to be already valid Unicode values (checked in input iterator) -  result.push_back(static_cast<To>(value)); - } - - // Little Endian optimized version for UTF-8 - // In block_mode, at least 4 bytes are in accu. On first call, even 8. - // otherwise, at least one code unit is in accu - template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true> - inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) - { -  if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) -#if __cplusplus >= 202002L -  [[likely]] -#endif -  { -   result.append({ -                 static_cast<To>(accu & 0x7F), -                 static_cast<To>((accu >> 8) & 0x7F), -                 static_cast<To>((accu >> 16) & 0x7F), -                 static_cast<To>((accu >> 24) & 0x7F), -                 static_cast<To>((accu >> 32) & 0x7F), -                 static_cast<To>((accu >> 40) & 0x7F), -                 static_cast<To>((accu >> 48) & 0x7F), -                 static_cast<To>((accu >> 56) & 0x7F), -                 }); -   accu = 0; -   bytes_in_accu = 0; -  } else if ((accu & 0x80) == 0) { // 1 byte sequence -   append_utf<7>(result, static_cast<char32_t>(accu & 0x7F)); -   accu >>= 8; -   bytes_in_accu -= 1; -  } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence -   char32_t value {static_cast<char32_t>(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))}; -   accu >>= 16; -   bytes_in_accu -= 2; -   if (is_valid_unicode<11>(value)) -    append_utf<11>(result, value); -   else -#if __cplusplus >= 202002L -    [[unlikely]] -#endif -    throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence"); -  } else if 
((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence -   char32_t value {static_cast<char32_t>(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))}; -   accu >>= 24; -   bytes_in_accu -= 3; -   if (is_valid_unicode<16>(value)) -    append_utf<16>(result, value); -   else -#if __cplusplus >= 202002L -    [[unlikely]] -#endif -    throw std::invalid_argument("Invalid Unicode character in 3 byte UTF-8 sequence"); -  } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence -   char32_t value {static_cast<char32_t>(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))}; -   accu >>= 32; -   bytes_in_accu -= 4; -   if (is_valid_unicode<21>(value)) -    append_utf(result, value); -   else -#if __cplusplus >= 202002L -    [[unlikely]] -#endif -    throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence"); -  } else -#if __cplusplus >= 202002L -   [[unlikely]] -#endif -   throw std::invalid_argument("Invalid UTF-8 byte sequence"); - } - - // Little Endian optimized version - template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> - typename To::string_type convert_optimized_utf(const typename From::string_type& s) - { -  typename To::string_type result; -  uint64_t accu{}; -  int bytes_in_accu{}; - -  size_t s_index{}; -  size_t s_size{s.size()}; -  while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { -   // read input -   // assume: bytes_in_accu < 8 -   accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); -   s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); -   bytes_in_accu = 8; - -   while (bytes_in_accu >= 4) { -    append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); -   } -  } - -  // 0..3 bytes left in accu -  // 0..7 bytes 
left in s - -  while (s_index < s_size || bytes_in_accu > 0) { -   while (s_index < s_size && bytes_in_accu < 8) { -    accu |= static_cast<uint64_t>(*reinterpret_cast<const uint8_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); -    ++s_index; -    bytes_in_accu += sizeof(typename From::value_type); -   } - -   append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); -  } -  return result; - } - - // From and To are Encodings - template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> - typename To::string_type convert(const typename From::string_type& s) - { -  // if input type == output type, only validate and return input, if appropriate -  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && -               is_utf_encoding_v<From> && is_utf_encoding_v<To>) { -   if (validate_utf<typename From::value_type>(s)) { -    return s; -   } else { -    throw std::invalid_argument("Invalid UTF input"); -   } -  } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v<typename From::value_type> && -                      is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization -   return convert_optimized_utf<From, To>(s); -  } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input -   return convert_optimized<From, To>(s); -  } else { -   typename To::string_type result; -   std::copy(From::begin(s), From::end(s), To::back_inserter(result)); -   return result; -  } - } - - // From and To are from: utf8_t (i.e. 
char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t - template<typename From, typename To, -  typename FromContainer=std::basic_string<From>, -  typename ToContainer=std::basic_string<To>, -  std::enable_if_t<is_char_v<From> && is_char_v<To>, bool> = true> - ToContainer convert(const FromContainer& s) - { -  typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; - -  ToContainer result; - -  std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); - -  return result; - } - - // From and To are containers - template<typename FromContainer, typename ToContainer, -  std::enable_if_t<is_container_v<FromContainer> && is_container_v<ToContainer>, bool> = true - > - ToContainer convert(const FromContainer& s) - { -  typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait; -   -  ToContainer result; - -  std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); - -  return result; - } - - // Container version - template<typename Container, std::enable_if_t<is_container_v<Container>, bool> = true> - bool is_valid_utf(const Container& s) - { -  typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; -   -  try { -   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); -  } catch (const std::invalid_argument&) { -   return false; -  } -  return true; - } - - // basic type version - template<typename T, -  typename Container=std::basic_string<T>, -  std::enable_if_t<is_char_v<T>, bool> = true> - bool is_valid_utf(const Container& s) - { -  typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; -   -  try { -   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); -  } catch (const std::invalid_argument&) { 
-   return false; -  } -  return true; - } - - // Encoding version - template<typename Encoding, std::enable_if_t<is_encoding_v<Encoding>, bool> = true> - bool is_valid_utf(const typename Encoding::string_type& s) - { -  return validate_utf<typename Encoding::value_type>(s); - } - -} // namespace unicode +#include "unicode/validation.h" diff --git a/include/unicode/conversion.h b/include/unicode/conversion.h new file mode 100644 index 0000000..dc57084 --- /dev/null +++ b/include/unicode/conversion.h @@ -0,0 +1,113 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for conversion between UTF and ISO encodings +// + +#pragma once + +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/optimization.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include <algorithm> +#include <array> +#include <cstdint> +#include <iterator> +#include <memory> +#include <stdexcept> +#include <string> +#include <type_traits> +#include <utility> + +namespace unicode { + + // First variant of convert(): Specification of encodings explicitly + // + // e.g. 
+ // unicode::UTF_8 + // unicode::UTF_16 + // unicode::UTF_32 + // unicode::ISO_8859_1 + // unicode::ISO_8859_15 + // + // see also utf.h and iso.h + // + // From and To are Encodings + // + // throws std::invalid_argument on conversion error + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> + typename To::string_type convert(const typename From::string_type& s) + { +  // At compile time, decide which optimization to use, with fallback to +  // iterating with std::copy() + +  // if input type == output type, only validate and return input, if appropriate +  if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && +               is_utf_encoding_v<From> && is_utf_encoding_v<To>) { +   if (validate_utf<typename From::value_type>(s)) { +    return s; +   } else { +    throw std::invalid_argument("Invalid UTF input"); +   } +  } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v<typename From::value_type> && +                      is_utf_encoding_v<From> && is_utf_encoding_v<To>) { // endian specific optimization +   return convert_optimized_utf<From, To>(s); +  } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input +   return convert_optimized<From, To>(s); +  } else { +   typename To::string_type result; +   std::copy(From::begin(s), From::end(s), To::back_inserter(result)); +   return result; +  } + } + + // Second variant of convert(): Specification of encodings via character type + // + // see also type_traits.h for is_char + // + // From and To are from: utf8_t (i.e. 
char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t + // + // throws std::invalid_argument on conversion error + template<typename From, typename To, +  typename FromContainer=std::basic_string<From>, +  typename ToContainer=std::basic_string<To>, +  std::enable_if_t<is_char_v<From> && is_char_v<To>, bool> = true> + ToContainer convert(const FromContainer& s) + { +  typedef UTF<utf_iterator<From>, utf_back_insert_iterator<To>> UTF_Trait; + +  ToContainer result; + +  std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); + +  return result; + } + + // Third variant of convert(): Specification of encodings via container type + // + // see also type_traits.h for is_container + // + // From and To are containers + // + // throws std::invalid_argument on conversion error + template<typename FromContainer, typename ToContainer, +  std::enable_if_t<is_container_v<FromContainer> && is_container_v<ToContainer>, bool> = true + > + ToContainer convert(const FromContainer& s) + { +  typedef UTF<utf_iterator<typename FromContainer::value_type, FromContainer>, utf_back_insert_iterator<typename ToContainer::value_type, ToContainer>> UTF_Trait; +   +  ToContainer result; + +  std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); + +  return result; + } + +} // namespace unicode + diff --git a/include/unicode/endian.h b/include/unicode/endian.h index 1230f06..d933a2b 100644 --- a/include/unicode/endian.h +++ b/include/unicode/endian.h @@ -1,3 +1,11 @@ +// +// Reichwein.IT Unicode Library +// +// Endian handling functions +// +// In C++17, endian support is not yet available. +// +  #pragma once  #if __cplusplus >= 202002L diff --git a/include/unicode/iso.h b/include/unicode/iso.h index 1f5f007..24e3dd1 100644 --- a/include/unicode/iso.h +++ b/include/unicode/iso.h @@ -1,3 +1,16 @@ +// +// Reichwein.IT Unicode Library +// +// ISO 8859 (-1 and -15) handling functions (i.e. 
Latin-1 and Latin-9) +// +// Implementation of iso_iterator for reading individual Unicode code points +// from a string or container input, and an iso_back_insert_iterator for +// writing them to the destination. +// +// The design is made to be compatible with the respective iterators in utf.h +// to make it easy to combine them. +// +  #pragma once  #include "types.h" diff --git a/include/unicode/optimization.h b/include/unicode/optimization.h new file mode 100644 index 0000000..d7b054d --- /dev/null +++ b/include/unicode/optimization.h @@ -0,0 +1,325 @@ +// +// Reichwein.IT Unicode Library +// +// Optimized conversion functions for UTF input and output +// + +#pragma once + +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include <algorithm> +#include <array> +#include <cstdint> +#include <iterator> +#include <memory> +#include <stdexcept> +#include <string> +#include <type_traits> +#include <utility> + +namespace unicode { + + // Helper function: Item distance of specified iterators + // std::distance doesn't work here: it is based on "output" distance of iterators + template<class Iterator> + inline size_t input_distance(const Iterator& it1, const Iterator& it2) + { +  return it2 - it1; + } +  + // Helper function: Distance of specified iterator content data in bytes + template<class Iterator> + inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2) + { +  return input_distance(it1, it2) * sizeof(typename Iterator::value_type); + } + + // Optimizations following: + static const size_t accu_size {sizeof(size_t)}; + + template<int value_size> + struct ConvertInputOptimizer {}; + + template<> struct ConvertInputOptimizer<1> + { +  static const uint32_t ascii_mask { 0x80808080 }; + }; +  + template<> struct ConvertInputOptimizer<2> + { +  static const uint32_t ascii_mask { 0xFF80FF80 }; + }; +  + template<> struct 
ConvertInputOptimizer<4> + { +  static const uint32_t ascii_mask { 0xFFFFFF80 }; + }; + + template<int AccuSize, class ConvertInputOptimizer> + struct ArchitectureOptimizer {}; + + // On 32 bit architecture, calculate with 32 bit accumulator value + // (hoping the compiler will put it into a 32 bit register) + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<4, ConvertInputOptimizer> + { +  typedef ConvertInputOptimizer input_optimizer; +  typedef uint32_t accu_type; +  static const accu_type addr_mask {accu_size - 1}; +  static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; +  static const accu_type ascii_value { 0ULL }; +   +  template<typename input_value_type, class output_string_type> +  inline static void append(const input_value_type* addr, output_string_type& s) +  { +   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { +    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); +   } else if constexpr(is_utf_8_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3])}); +   } else if constexpr(is_utf_16_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1])}); +   } else if constexpr(is_utf_32_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0])}); +   } +  } + }; + + // On 64 bit architecture, calculate with 64 bit accumulator value + // (hoping the compiler will put it into a 64 bit register) + template<class ConvertInputOptimizer> + struct ArchitectureOptimizer<8, 
ConvertInputOptimizer> + { +  typedef ConvertInputOptimizer input_optimizer; +  typedef uint64_t accu_type; +  static const accu_type addr_mask {accu_size - 1}; +  static const accu_type ascii_mask { ((accu_type)input_optimizer::ascii_mask) << 32 | (accu_type)input_optimizer::ascii_mask }; +  static const accu_type ascii_value { 0ULL }; +   +  template<typename input_value_type, class output_string_type> +  inline static void append(const input_value_type* addr, output_string_type& s) +  { +   if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { +    s.append(reinterpret_cast<const typename output_string_type::value_type*>(addr), accu_size / sizeof(input_value_type)); +   } else if constexpr(is_utf_8_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3]), +              static_cast<typename output_string_type::value_type>(addr[4]), +              static_cast<typename output_string_type::value_type>(addr[5]), +              static_cast<typename output_string_type::value_type>(addr[6]), +              static_cast<typename output_string_type::value_type>(addr[7])}); +   } else if constexpr(is_utf_16_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1]), +              static_cast<typename output_string_type::value_type>(addr[2]), +              static_cast<typename output_string_type::value_type>(addr[3])}); +   } else if constexpr(is_utf_32_v<input_value_type>) { +    s.append({static_cast<typename output_string_type::value_type>(addr[0]), +              static_cast<typename output_string_type::value_type>(addr[1])}); +   } +  } + + 
}; // class ArchitectureOptimizer + + // Optimize for the case of all ASCII (7-bit) data in an accu size row + // From and To are Encodings + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> + typename To::string_type convert_optimized(const typename From::string_type& s) + { +  typename To::string_type result; +  typedef ConvertInputOptimizer<sizeof(typename From::value_type)> input_optimizer; +  typedef ArchitectureOptimizer<accu_size, input_optimizer> arch_optimizer; + +  auto begin{From::begin(s)}; +  auto end{From::end(s)}; +  auto back_inserter{To::back_inserter(result)}; +  auto addr{reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])}; +  while (input_distance_bytes(begin, end) >= accu_size) { +   if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { +    while (input_distance_bytes(begin, end) >= accu_size) { +     typename arch_optimizer::accu_type data{*addr}; +     if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) +#if __cplusplus >= 202002L +     [[likely]] +#endif +     { +      arch_optimizer::template append(reinterpret_cast<const typename From::value_type*>(addr), result); +      begin += accu_size / sizeof(typename From::value_type); +      ++addr; +     } else { +      // just advance one code unit for now and break to trigger unoptimized +      // version until next accu boundary +      back_inserter = *begin; +      ++begin; +      break; +     } +    } +   } + +   // keep up after unaligned Non-ASCII code points +   while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast<const typename arch_optimizer::accu_type*>(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { +    back_inserter = *begin; +    ++begin; +   } +  } + +  // remainder < accu_size bytes    +  while (begin != end) { +   back_inserter = *begin; +   ++begin; +  } + +  return result; + } + + 
template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_8_v<To>, bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  using From = char32_t; +  if (bits_to_compare <= 7 || value < 0x80) { // 1 byte +   result.push_back(static_cast<To>(value)); +  } else if (bits_to_compare <= 11 || value < 0x800) { // 2 bytes +   result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)}); +  } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes +   result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)}); +  } else { // 4 bytes +   // expect value to be already valid Unicode values +   result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)}); +  } + } + + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_16_v<To>, bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values +   result.push_back(static_cast<To>(value)); +  } else { +   char32_t value_reduced{value - 0x10000}; +   result.append({static_cast<To>((value_reduced >> 10) + 0xD800), static_cast<To>((value_reduced & 0x3FF) + 0xDC00)}); +  } + } + + template<size_t bits_to_compare = 32, typename To, typename std::enable_if_t<is_utf_32_v<To>, bool> = true> + inline void append_utf(std::basic_string<To>& result, const char32_t& value) + { +  // expect value to be already valid Unicode values (checked in input iterator) +  result.push_back(static_cast<To>(value)); + } + + // Little Endian optimized version for UTF-8 + // In block_mode, at least 4 bytes are in accu. On first call, even 8. 
+ // otherwise, at least one code unit is in accu + template<typename From, typename To, bool block_mode = true, typename std::enable_if_t<is_utf_8_v<From>, bool> = true> + inline static void append_accu(std::basic_string<To>& result, uint64_t& accu, int& bytes_in_accu) + { +  if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) +#if __cplusplus >= 202002L +  [[likely]] +#endif +  { +   result.append({ +                 static_cast<To>(accu & 0x7F), +                 static_cast<To>((accu >> 8) & 0x7F), +                 static_cast<To>((accu >> 16) & 0x7F), +                 static_cast<To>((accu >> 24) & 0x7F), +                 static_cast<To>((accu >> 32) & 0x7F), +                 static_cast<To>((accu >> 40) & 0x7F), +                 static_cast<To>((accu >> 48) & 0x7F), +                 static_cast<To>((accu >> 56) & 0x7F), +                 }); +   accu = 0; +   bytes_in_accu = 0; +  } else if ((accu & 0x80) == 0) { // 1 byte sequence +   append_utf<7>(result, static_cast<char32_t>(accu & 0x7F)); +   accu >>= 8; +   bytes_in_accu -= 1; +  } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))}; +   accu >>= 16; +   bytes_in_accu -= 2; +   if (is_valid_unicode<11>(value)) +    append_utf<11>(result, value); +   else +#if __cplusplus >= 202002L +    [[unlikely]] +#endif +    throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence"); +  } else if ((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))}; +   accu >>= 24; +   bytes_in_accu -= 3; +   if (is_valid_unicode<16>(value)) +    append_utf<16>(result, value); +   else +#if __cplusplus >= 202002L +    [[unlikely]] +#endif +    throw std::invalid_argument("Invalid Unicode character in 3 
byte UTF-8 sequence"); +  } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence +   char32_t value {static_cast<char32_t>(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))}; +   accu >>= 32; +   bytes_in_accu -= 4; +   if (is_valid_unicode<21>(value)) +    append_utf(result, value); +   else +#if __cplusplus >= 202002L +    [[unlikely]] +#endif +    throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence"); +  } else +#if __cplusplus >= 202002L +   [[unlikely]] +#endif +   throw std::invalid_argument("Invalid UTF-8 byte sequence"); + } + + // Little Endian optimized version + template<typename From, typename To, std::enable_if_t<is_encoding_v<From> && is_encoding_v<To>, bool> = true> + typename To::string_type convert_optimized_utf(const typename From::string_type& s) + { +  typename To::string_type result; +  uint64_t accu{}; +  int bytes_in_accu{}; + +  size_t s_index{}; +  size_t s_size{s.size()}; +  while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { +   // read input +   // assume: bytes_in_accu < 8 +   accu |= (*reinterpret_cast<const uint64_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +   s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); +   bytes_in_accu = 8; + +   while (bytes_in_accu >= 4) { +    append_accu<typename From::value_type, typename To::value_type, true>(result, accu, bytes_in_accu); +   } +  } + +  // 0..3 bytes left in accu +  // 0..7 bytes left in s + +  while (s_index < s_size || bytes_in_accu > 0) { +   while (s_index < s_size && bytes_in_accu < 8) { +    accu |= static_cast<uint64_t>(*reinterpret_cast<const uint8_t*>(&(s.data()[s_index]))) << (bytes_in_accu * 8); +    ++s_index; +    bytes_in_accu += sizeof(typename From::value_type); +   } + +   append_accu<typename From::value_type, typename To::value_type, false>(result, accu, bytes_in_accu); +  } +  return result; + } + +} 
// namespace unicode + diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h index 82031d1..f0a003d 100644 --- a/include/unicode/predicate.h +++ b/include/unicode/predicate.h @@ -1,5 +1,7 @@  // -// Unicode library - predicates for Unicode characters +// Reichwein.IT Unicode Library +// +// Predicates for Unicode characters  //  #pragma once diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index 63c7d69..47789b9 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -1,3 +1,9 @@ +// +// Reichwein.IT Unicode Library +// +// Type traits +// +  #pragma once  #include "utf_fwd.h" @@ -32,7 +38,7 @@ namespace unicode {   template<typename T>   struct is_char   { -  static const bool value{std::is_trivial_v<T> && std::is_scalar_v<T> && !std::is_empty_v<T>}; +  static const bool value{std::is_trivial_v<T> && std::is_scalar_v<T>};   };   template<typename T> diff --git a/include/unicode/types.h b/include/unicode/types.h index a4461d7..6eac5f7 100644 --- a/include/unicode/types.h +++ b/include/unicode/types.h @@ -1,9 +1,20 @@ +// +// Reichwein.IT Unicode Library +// +// Basic types +// +  #pragma once +// Definition of utf8_t as abstraction from char and char8_t, when available +// +// Be aware of char being signed on common architectures, while char8_t is +// unsigned.  
#ifdef __cpp_char8_t -// char8_t available + // char8_t available in C++20   typedef char8_t utf8_t;  #else + // fallback to char   typedef char utf8_t;  #endif  typedef char iso_t; diff --git a/include/unicode/utf.h b/include/unicode/utf.h index 1d2f28e..5db9cac 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -1,3 +1,13 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for support of UTF encodings +// +// Implementation of utf_iterator and utf_back_insert_iterator templates for +// validation and conversion via STL compatible iteration over standard +// containers +// +  #pragma once  #include "utf_fwd.h" diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h index c42dea1..7fd9329 100644 --- a/include/unicode/utf_fwd.h +++ b/include/unicode/utf_fwd.h @@ -1,6 +1,11 @@ -#pragma once +// +// Reichwein.IT Unicode Library +// +// Forward declarations for utf.h - Functions for reading and writing UTF +// encodings +// -// Forward declarations +#pragma once  #include "types.h" diff --git a/include/unicode/validation.h b/include/unicode/validation.h new file mode 100644 index 0000000..b5060c4 --- /dev/null +++ b/include/unicode/validation.h @@ -0,0 +1,78 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for validation of UTF (Unicode Transformation Format) encodings +// + +#pragma once + +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/optimization.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include <algorithm> +#include <array> +#include <cstdint> +#include <iterator> +#include <memory> +#include <stdexcept> +#include <string> +#include <type_traits> +#include <utility> + +namespace unicode { + + // First variant of is_valid_utf(): Specification of encoding explicitly + // + // e.g. 
+ // unicode::UTF_8 + // unicode::UTF_16 + // unicode::UTF_32 + // + // see also type_traits.h and utf.h + template<typename Encoding, std::enable_if_t<is_encoding_v<Encoding>, bool> = true> + bool is_valid_utf(const typename Encoding::string_type& s) + { +  return validate_utf<typename Encoding::value_type>(s); + } + + // Second variant of is_valid_utf(): Specification of encoding via character type + // + // see also type_traits.h for is_char + template<typename T, +  typename Container=std::basic_string<T>, +  std::enable_if_t<is_char_v<T>, bool> = true> + bool is_valid_utf(const Container& s) + { +  typedef UTF<utf_iterator<T>, utf_back_insert_iterator<T>> UTF_Trait; +   +  try { +   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); +  } catch (const std::invalid_argument&) { +   return false; +  } +  return true; + } + + // Third variant of is_valid_utf(): Specification of encoding via container type + // + // see also type_traits.h for is_container + template<typename Container, std::enable_if_t<is_container_v<Container>, bool> = true> + bool is_valid_utf(const Container& s) + { +  typedef UTF<utf_iterator<typename Container::value_type, Container>, utf_back_insert_iterator<typename Container::value_type, Container>> UTF_Trait; +   +  try { +   std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); +  } catch (const std::invalid_argument&) { +   return false; +  } +  return true; + } + +} // namespace unicode + | 
