From d234c1ca09af512e9a13579a6fff8d5834d7b36c Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Mon, 3 Jan 2022 16:08:38 +0100 Subject: Separated out remaining functions from unicode.h, documentation --- Makefile | 3 + README.txt | 9 + include/unicode.h | 402 +---------------------------------------- include/unicode/conversion.h | 113 ++++++++++++ include/unicode/endian.h | 8 + include/unicode/iso.h | 13 ++ include/unicode/optimization.h | 325 +++++++++++++++++++++++++++++++++ include/unicode/predicate.h | 4 +- include/unicode/type_traits.h | 8 +- include/unicode/types.h | 13 +- include/unicode/utf.h | 10 + include/unicode/utf_fwd.h | 9 +- include/unicode/validation.h | 78 ++++++++ 13 files changed, 593 insertions(+), 402 deletions(-) create mode 100644 include/unicode/conversion.h create mode 100644 include/unicode/optimization.h create mode 100644 include/unicode/validation.h diff --git a/Makefile b/Makefile index 6c89182..75e9bc5 100644 --- a/Makefile +++ b/Makefile @@ -153,13 +153,16 @@ DISTFILES= \ Makefile \ README.txt \ include/unicode.h \ + include/unicode/conversion.h \ include/unicode/endian.h \ include/unicode/iso.h \ + include/unicode/optimization.h \ include/unicode/predicate.h \ include/unicode/types.h \ include/unicode/type_traits.h \ include/unicode/utf.h \ include/unicode/utf_fwd.h \ + include/unicode/validation.h \ debian/control \ debian/compat \ debian/copyright \ diff --git a/README.txt b/README.txt index 9544b49..fc4265c 100644 --- a/README.txt +++ b/README.txt @@ -5,6 +5,15 @@ This software package contains a C++ library for Unicode encoding conversion and command line tools which apply those functions in example runtime programs: recode and validate. 
+Properties +---------- + +* Supports C++17 and C++20 +* Locale independent validation and conversion +* Supports UTF-8, UTF-16, UTF-32, ISO-8859-1 and ISO-8859-15 +* Supports Linux and Windows +* Supports current compilers (clang++-11, clang++-13, g++-11, msvc-19.28.29337) + C++ interface (package libunicode-dev) -------------------------------------- diff --git a/include/unicode.h b/include/unicode.h index d033f63..6102a21 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,5 +1,7 @@ // libunicode // +// Reichwein.IT Unicode Library +// // Author: Roland Reichwein // // Available under the conditions of CC0 1.0 Universal @@ -10,407 +12,13 @@ #pragma once +#include "unicode/conversion.h" #include "unicode/endian.h" #include "unicode/iso.h" +#include "unicode/optimization.h" #include "unicode/predicate.h" #include "unicode/types.h" #include "unicode/type_traits.h" #include "unicode/utf.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace unicode { - - // Helper function: Item distance of specified iterators - // std::distance doesn't work here: it is based on "output" distance of iterators - template - inline size_t input_distance(const Iterator& it1, const Iterator& it2) - { - return it2 - it1; - } - - template - inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2) - { - return input_distance(it1, it2) * sizeof(typename Iterator::value_type); - } - - // Optimizations following: - static const size_t accu_size {sizeof(size_t)}; - - template - struct ConvertInputOptimizer {}; - - template<> struct ConvertInputOptimizer<1> - { - static const uint32_t ascii_mask { 0x80808080 }; - }; - - template<> struct ConvertInputOptimizer<2> - { - static const uint32_t ascii_mask { 0xFF80FF80 }; - }; - - template<> struct ConvertInputOptimizer<4> - { - static const uint32_t ascii_mask { 0xFFFFFF80 }; - }; - - template - struct ArchitectureOptimizer {}; - - template - struct 
ArchitectureOptimizer<4, ConvertInputOptimizer> - { - typedef ConvertInputOptimizer input_optimizer; - typedef uint32_t accu_type; - static const accu_type addr_mask {accu_size - 1}; - static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; - static const accu_type ascii_value { 0ULL }; - - template - inline static void append(const input_value_type* addr, output_string_type& s) - { - if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { - s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(is_utf_8_v) { - s.append({static_cast(addr[0]), - static_cast(addr[1]), - static_cast(addr[2]), - static_cast(addr[3])}); - } else if constexpr(is_utf_16_v) { - s.append({static_cast(addr[0]), - static_cast(addr[1])}); - } else if constexpr(is_utf_32_v) { - s.append({static_cast(addr[0])}); - } - } - }; - - template - struct ArchitectureOptimizer<8, ConvertInputOptimizer> - { - typedef ConvertInputOptimizer input_optimizer; - typedef uint64_t accu_type; - static const accu_type addr_mask {accu_size - 1}; - static const accu_type ascii_mask { ((accu_type)input_optimizer::ascii_mask) << 32 | (accu_type)input_optimizer::ascii_mask }; - static const accu_type ascii_value { 0ULL }; - - template - inline static void append(const input_value_type* addr, output_string_type& s) - { - if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { - s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); - } else if constexpr(is_utf_8_v) { - s.append({static_cast(addr[0]), - static_cast(addr[1]), - static_cast(addr[2]), - static_cast(addr[3]), - static_cast(addr[4]), - static_cast(addr[5]), - static_cast(addr[6]), - static_cast(addr[7])}); - } else if constexpr(is_utf_16_v) { - s.append({static_cast(addr[0]), - static_cast(addr[1]), - static_cast(addr[2]), - static_cast(addr[3])}); - } else if constexpr(is_utf_32_v) { - 
s.append({static_cast(addr[0]), - static_cast(addr[1])}); - } - } - - }; // class ArchitectureOptimizer - - // Optimize for the case of all ASCII (7-bit) data in a accu size row - // From and To are Encodings - template && is_encoding_v, bool> = true> - typename To::string_type convert_optimized(const typename From::string_type& s) - { - typename To::string_type result; - typedef ConvertInputOptimizer input_optimizer; - typedef ArchitectureOptimizer arch_optimizer; - - auto begin{From::begin(s)}; - auto end{From::end(s)}; - auto back_inserter{To::back_inserter(result)}; - auto addr{reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])}; - while (input_distance_bytes(begin, end) >= accu_size) { - if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { - while (input_distance_bytes(begin, end) >= accu_size) { - typename arch_optimizer::accu_type data{*addr}; - if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) -#if __cplusplus >= 202002L - [[likely]] -#endif - { - arch_optimizer::template append(reinterpret_cast(addr), result); - begin += accu_size / sizeof(typename From::value_type); - ++addr; - } else { - // just advance one code unit for now and break to trigger unoptimized - // version until next accu boundary - back_inserter = *begin; - ++begin; - break; - } - } - } - - // keep up after unaligned Non-ASCII code points - while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { - back_inserter = *begin; - ++begin; - } - } - - // remainder < 8 bytes - while (begin != end) { - back_inserter = *begin; - ++begin; - } - - return result; - } - - template, bool> = true> - inline void append_utf(std::basic_string& result, const char32_t& value) - { - using From = char32_t; - if (bits_to_compare <= 7 || value < 0x80) { // 1 byte - result.push_back(static_cast(value)); - } else if (bits_to_compare <= 11 || value < 0x800) { // 2 bytes - 
result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)}); - } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes - result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)}); - } else { // 4 bytes - // expect value to be already valid Unicode values - result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)}); - } - } - - template, bool> = true> - inline void append_utf(std::basic_string& result, const char32_t& value) - { - if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values - result.push_back(static_cast(value)); - } else { - char32_t value_reduced{value - 0x10000}; - result.append({static_cast((value_reduced >> 10) + 0xD800), static_cast((value_reduced & 0x3FF) + 0xDC00)}); - } - } - - template, bool> = true> - inline void append_utf(std::basic_string& result, const char32_t& value) - { - // expect value to be already valid Unicode values (checked in input iterator) - result.push_back(static_cast(value)); - } - - // Little Endian optimized version for UTF-8 - // In block_mode, at least 4 bytes are in accu. On first call, even 8. 
- // otherwise, at least one code unit is in accu - template, bool> = true> - inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) - { - if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) -#if __cplusplus >= 202002L - [[likely]] -#endif - { - result.append({ - static_cast(accu & 0x7F), - static_cast((accu >> 8) & 0x7F), - static_cast((accu >> 16) & 0x7F), - static_cast((accu >> 24) & 0x7F), - static_cast((accu >> 32) & 0x7F), - static_cast((accu >> 40) & 0x7F), - static_cast((accu >> 48) & 0x7F), - static_cast((accu >> 56) & 0x7F), - }); - accu = 0; - bytes_in_accu = 0; - } else if ((accu & 0x80) == 0) { // 1 byte sequence - append_utf<7>(result, static_cast(accu & 0x7F)); - accu >>= 8; - bytes_in_accu -= 1; - } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence - char32_t value {static_cast(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))}; - accu >>= 16; - bytes_in_accu -= 2; - if (is_valid_unicode<11>(value)) - append_utf<11>(result, value); - else -#if __cplusplus >= 202002L - [[unlikely]] -#endif - throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence"); - } else if ((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence - char32_t value {static_cast(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))}; - accu >>= 24; - bytes_in_accu -= 3; - if (is_valid_unicode<16>(value)) - append_utf<16>(result, value); - else -#if __cplusplus >= 202002L - [[unlikely]] -#endif - throw std::invalid_argument("Invalid Unicode character in 3 byte UTF-8 sequence"); - } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence - char32_t value {static_cast(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))}; - accu >>= 32; - bytes_in_accu -= 4; - if (is_valid_unicode<21>(value)) - append_utf(result, value); 
- else -#if __cplusplus >= 202002L - [[unlikely]] -#endif - throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence"); - } else -#if __cplusplus >= 202002L - [[unlikely]] -#endif - throw std::invalid_argument("Invalid UTF-8 byte sequence"); - } - - // Little Endian optimized version - template && is_encoding_v, bool> = true> - typename To::string_type convert_optimized_utf(const typename From::string_type& s) - { - typename To::string_type result; - uint64_t accu{}; - int bytes_in_accu{}; - - size_t s_index{}; - size_t s_size{s.size()}; - while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { - // read input - // assume: bytes_in_accu < 8 - accu |= (*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); - s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); - bytes_in_accu = 8; - - while (bytes_in_accu >= 4) { - append_accu(result, accu, bytes_in_accu); - } - } - - // 0..3 bytes left in accu - // 0..7 bytes left in s - - while (s_index < s_size || bytes_in_accu > 0) { - while (s_index < s_size && bytes_in_accu < 8) { - accu |= static_cast(*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); - ++s_index; - bytes_in_accu += sizeof(typename From::value_type); - } - - append_accu(result, accu, bytes_in_accu); - } - return result; - } - - // From and To are Encodings - template && is_encoding_v, bool> = true> - typename To::string_type convert(const typename From::string_type& s) - { - // if input type == output type, only validate and return input, if appropriate - if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && - is_utf_encoding_v && is_utf_encoding_v) { - if (validate_utf(s)) { - return s; - } else { - throw std::invalid_argument("Invalid UTF input"); - } - } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v && - is_utf_encoding_v && is_utf_encoding_v) { // endian specific optimization - return convert_optimized_utf(s); - } else if 
constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input - return convert_optimized(s); - } else { - typename To::string_type result; - std::copy(From::begin(s), From::end(s), To::back_inserter(result)); - return result; - } - } - - // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t - template, - typename ToContainer=std::basic_string, - std::enable_if_t && is_char_v, bool> = true> - ToContainer convert(const FromContainer& s) - { - typedef UTF, utf_back_insert_iterator> UTF_Trait; - - ToContainer result; - - std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); - - return result; - } - - // From and To are containers - template && is_container_v, bool> = true - > - ToContainer convert(const FromContainer& s) - { - typedef UTF, utf_back_insert_iterator> UTF_Trait; - - ToContainer result; - - std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); - - return result; - } - - // Container version - template, bool> = true> - bool is_valid_utf(const Container& s) - { - typedef UTF, utf_back_insert_iterator> UTF_Trait; - - try { - std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); - } catch (const std::invalid_argument&) { - return false; - } - return true; - } - - // basic type version - template, - std::enable_if_t, bool> = true> - bool is_valid_utf(const Container& s) - { - typedef UTF, utf_back_insert_iterator> UTF_Trait; - - try { - std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); - } catch (const std::invalid_argument&) { - return false; - } - return true; - } - - // Encoding version - template, bool> = true> - bool is_valid_utf(const typename Encoding::string_type& s) - { - return validate_utf(s); - } - -} // namespace unicode +#include "unicode/validation.h" diff --git a/include/unicode/conversion.h 
b/include/unicode/conversion.h new file mode 100644 index 0000000..dc57084 --- /dev/null +++ b/include/unicode/conversion.h @@ -0,0 +1,113 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for conversion between UTF and ISO encodings +// + +#pragma once + +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/optimization.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace unicode { + + // First variant of convert(): Specification of encodings explicitly + // + // e.g. + // unicode::UTF_8 + // unicode::UTF_16 + // unicode::UTF_32 + // unicode::ISO_8859_1 + // unicode::ISO_8859_15 + // + // see also utf.h and iso.h + // + // From and To are Encodings + // + // throws std::invalid_argument on conversion error + template && is_encoding_v, bool> = true> + typename To::string_type convert(const typename From::string_type& s) + { + // At compile time, decide which optimization to use, with fallback to + // iterating with std::copy() + + // if input type == output type, only validate and return input, if appropriate + if constexpr(sizeof(typename From::value_type) == sizeof(typename To::value_type) && + is_utf_encoding_v && is_utf_encoding_v) { + if (validate_utf(s)) { + return s; + } else { + throw std::invalid_argument("Invalid UTF input"); + } + } else if constexpr(accu_size == 8 && is_little_endian() && is_utf_8_v && + is_utf_encoding_v && is_utf_encoding_v) { // endian specific optimization + return convert_optimized_utf(s); + } else if constexpr(accu_size == 4 || accu_size == 8) { // accu size specific optimization with speedup for 7bit input + return convert_optimized(s); + } else { + typename To::string_type result; + std::copy(From::begin(s), From::end(s), To::back_inserter(result)); + return result; + } + } + + // Second variant of convert(): Specification of 
encodings via character type + // + // see also type_traits.h for is_char + // + // From and To are from: utf8_t (i.e. char or char8_t (C++20)), char16_t and char32_t, char, wchar_t, uint8_t, uint16_t, uint32_t + // + // throws std::invalid_argument on conversion error + template, + typename ToContainer=std::basic_string, + std::enable_if_t && is_char_v, bool> = true> + ToContainer convert(const FromContainer& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + ToContainer result; + + std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); + + return result; + } + + // Third variant of convert(): Specification of encodings via container type + // + // see also type_traits.h for is_container + // + // From and To are containers + // + // throws std::invalid_argument on conversion error + template && is_container_v, bool> = true + > + ToContainer convert(const FromContainer& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + ToContainer result; + + std::copy(UTF_Trait::begin(s), UTF_Trait::end(s), UTF_Trait::back_inserter(result)); + + return result; + } + +} // namespace unicode + diff --git a/include/unicode/endian.h b/include/unicode/endian.h index 1230f06..d933a2b 100644 --- a/include/unicode/endian.h +++ b/include/unicode/endian.h @@ -1,3 +1,11 @@ +// +// Reichwein.IT Unicode Library +// +// Endian handling functions +// +// In C++17, endian support is not yet available. +// + #pragma once #if __cplusplus >= 202002L diff --git a/include/unicode/iso.h b/include/unicode/iso.h index 1f5f007..24e3dd1 100644 --- a/include/unicode/iso.h +++ b/include/unicode/iso.h @@ -1,3 +1,16 @@ +// +// Reichwein.IT Unicode Library +// +// ISO 8859 (-1 and -15) handling functions (i.e. Latin-1 and Latin-9) +// +// Implementation of iso_iterator for reading individual Unicode code points +// from a string or container input, and an iso_back_insert_iterator for +// writing them to the destination. 
+// +// The design is made to be compatible with the respective iterators in utf.h +// to make it easy to combine them. +// + #pragma once #include "types.h" diff --git a/include/unicode/optimization.h b/include/unicode/optimization.h new file mode 100644 index 0000000..d7b054d --- /dev/null +++ b/include/unicode/optimization.h @@ -0,0 +1,325 @@ +// +// Reichwein.IT Unicode Library +// +// Optimized conversion functions for UTF input and output +// + +#pragma once + +#include "unicode/endian.h" +#include "unicode/iso.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace unicode { + + // Helper function: Item distance of specified iterators + // std::distance doesn't work here: it is based on "output" distance of iterators + template + inline size_t input_distance(const Iterator& it1, const Iterator& it2) + { + return it2 - it1; + } + + // Helper function: Distance of specified iterator content data in bytes + template + inline size_t input_distance_bytes(const Iterator& it1, const Iterator& it2) + { + return input_distance(it1, it2) * sizeof(typename Iterator::value_type); + } + + // Optimizations following: + static const size_t accu_size {sizeof(size_t)}; + + template + struct ConvertInputOptimizer {}; + + template<> struct ConvertInputOptimizer<1> + { + static const uint32_t ascii_mask { 0x80808080 }; + }; + + template<> struct ConvertInputOptimizer<2> + { + static const uint32_t ascii_mask { 0xFF80FF80 }; + }; + + template<> struct ConvertInputOptimizer<4> + { + static const uint32_t ascii_mask { 0xFFFFFF80 }; + }; + + template + struct ArchitectureOptimizer {}; + + // On 32 bit architecture, calculate with 32 bit accumulator value + // (hoping the compiler will put it into a 32 bit register) + template + struct ArchitectureOptimizer<4, ConvertInputOptimizer> + { + typedef 
ConvertInputOptimizer input_optimizer; + typedef uint32_t accu_type; + static const accu_type addr_mask {accu_size - 1}; + static const accu_type ascii_mask { (accu_type)input_optimizer::ascii_mask }; + static const accu_type ascii_value { 0ULL }; + + template + inline static void append(const input_value_type* addr, output_string_type& s) + { + if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { + s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); + } else if constexpr(is_utf_8_v) { + s.append({static_cast(addr[0]), + static_cast(addr[1]), + static_cast(addr[2]), + static_cast(addr[3])}); + } else if constexpr(is_utf_16_v) { + s.append({static_cast(addr[0]), + static_cast(addr[1])}); + } else if constexpr(is_utf_32_v) { + s.append({static_cast(addr[0])}); + } + } + }; + + // On 64 bit architecture, calculate with 64 bit accumulator value + // (hoping the compiler will put it into a 64 bit register) + template + struct ArchitectureOptimizer<8, ConvertInputOptimizer> + { + typedef ConvertInputOptimizer input_optimizer; + typedef uint64_t accu_type; + static const accu_type addr_mask {accu_size - 1}; + static const accu_type ascii_mask { ((accu_type)input_optimizer::ascii_mask) << 32 | (accu_type)input_optimizer::ascii_mask }; + static const accu_type ascii_value { 0ULL }; + + template + inline static void append(const input_value_type* addr, output_string_type& s) + { + if constexpr(sizeof(input_value_type) == sizeof(typename output_string_type::value_type)) { + s.append(reinterpret_cast(addr), accu_size / sizeof(input_value_type)); + } else if constexpr(is_utf_8_v) { + s.append({static_cast(addr[0]), + static_cast(addr[1]), + static_cast(addr[2]), + static_cast(addr[3]), + static_cast(addr[4]), + static_cast(addr[5]), + static_cast(addr[6]), + static_cast(addr[7])}); + } else if constexpr(is_utf_16_v) { + s.append({static_cast(addr[0]), + static_cast(addr[1]), + static_cast(addr[2]), + 
static_cast(addr[3])}); + } else if constexpr(is_utf_32_v) { + s.append({static_cast(addr[0]), + static_cast(addr[1])}); + } + } + + }; // class ArchitectureOptimizer + + // Optimize for the case of all ASCII (7-bit) data in an accu-size row + // From and To are Encodings + template && is_encoding_v, bool> = true> + typename To::string_type convert_optimized(const typename From::string_type& s) + { + typename To::string_type result; + typedef ConvertInputOptimizer input_optimizer; + typedef ArchitectureOptimizer arch_optimizer; + + auto begin{From::begin(s)}; + auto end{From::end(s)}; + auto back_inserter{To::back_inserter(result)}; + auto addr{reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])}; + while (input_distance_bytes(begin, end) >= accu_size) { + if (((uintptr_t)(void*)addr & arch_optimizer::addr_mask) == 0) { + while (input_distance_bytes(begin, end) >= accu_size) { + typename arch_optimizer::accu_type data{*addr}; + if ((data & arch_optimizer::ascii_mask) == arch_optimizer::ascii_value) +#if __cplusplus >= 202002L + [[likely]] +#endif + { + arch_optimizer::template append(reinterpret_cast(addr), result); + begin += accu_size / sizeof(typename From::value_type); + ++addr; + } else { + // just advance one code unit for now and break to trigger unoptimized + // version until next accu boundary + back_inserter = *begin; + ++begin; + break; + } + } + } + + // keep up after unaligned Non-ASCII code points + while (begin != end && (uintptr_t)(void*)(addr = reinterpret_cast(&s.data()[s.size() - input_distance(begin, end)])) & arch_optimizer::addr_mask) { + back_inserter = *begin; + ++begin; + } + } + + // remainder < accu_size bytes + while (begin != end) { + back_inserter = *begin; + ++begin; + } + + return result; + } + + template, bool> = true> + inline void append_utf(std::basic_string& result, const char32_t& value) + { + using From = char32_t; + if (bits_to_compare <= 7 || value < 0x80) { // 1 byte + result.push_back(static_cast(value)); + } else if 
(bits_to_compare <= 11 || value < 0x800) { // 2 bytes + result.append({utf8_byte_n_of_m<0,2,From,To>(value), utf8_byte_n_of_m<1,2,From,To>(value)}); + } else if (bits_to_compare <= 16 || value < 0x10000) { // 3 bytes + result.append({utf8_byte_n_of_m<0,3,From,To>(value), utf8_byte_n_of_m<1,3,From,To>(value), utf8_byte_n_of_m<2,3,From,To>(value)}); + } else { // 4 bytes + // expect value to be already valid Unicode values + result.append({utf8_byte_n_of_m<0,4,From,To>(value), utf8_byte_n_of_m<1,4,From,To>(value), utf8_byte_n_of_m<2,4,From,To>(value), utf8_byte_n_of_m<3,4,From,To>(value)}); + } + } + + template, bool> = true> + inline void append_utf(std::basic_string& result, const char32_t& value) + { + if (bits_to_compare <= 16 || value <= 0xFFFF) { // expect value to be already valid Unicode values + result.push_back(static_cast(value)); + } else { + char32_t value_reduced{value - 0x10000}; + result.append({static_cast((value_reduced >> 10) + 0xD800), static_cast((value_reduced & 0x3FF) + 0xDC00)}); + } + } + + template, bool> = true> + inline void append_utf(std::basic_string& result, const char32_t& value) + { + // expect value to be already valid Unicode values (checked in input iterator) + result.push_back(static_cast(value)); + } + + // Little Endian optimized version for UTF-8 + // In block_mode, at least 4 bytes are in accu. On first call, even 8. 
+ // otherwise, at least one code unit is in accu + template, bool> = true> + inline static void append_accu(std::basic_string& result, uint64_t& accu, int& bytes_in_accu) + { + if (block_mode && bytes_in_accu == 8 && (accu & 0x8080808080808080) == 0) +#if __cplusplus >= 202002L + [[likely]] +#endif + { + result.append({ + static_cast(accu & 0x7F), + static_cast((accu >> 8) & 0x7F), + static_cast((accu >> 16) & 0x7F), + static_cast((accu >> 24) & 0x7F), + static_cast((accu >> 32) & 0x7F), + static_cast((accu >> 40) & 0x7F), + static_cast((accu >> 48) & 0x7F), + static_cast((accu >> 56) & 0x7F), + }); + accu = 0; + bytes_in_accu = 0; + } else if ((accu & 0x80) == 0) { // 1 byte sequence + append_utf<7>(result, static_cast(accu & 0x7F)); + accu >>= 8; + bytes_in_accu -= 1; + } else if ((block_mode || bytes_in_accu >= 2) && (accu & 0xC0E0) == 0x80C0) { // 2 byte sequence + char32_t value {static_cast(((accu & 0x1F) << 6) | ((accu >> 8) & 0x3f))}; + accu >>= 16; + bytes_in_accu -= 2; + if (is_valid_unicode<11>(value)) + append_utf<11>(result, value); + else +#if __cplusplus >= 202002L + [[unlikely]] +#endif + throw std::invalid_argument("Invalid Unicode character in 2 byte UTF-8 sequence"); + } else if ((block_mode || bytes_in_accu >= 3) && (accu & 0xC0C0F0) == 0x8080E0) { // 3 byte sequence + char32_t value {static_cast(((accu & 0x0F) << 12) | ((accu >> 2) & 0x0FC0) | ((accu >> 16) & 0x3f))}; + accu >>= 24; + bytes_in_accu -= 3; + if (is_valid_unicode<16>(value)) + append_utf<16>(result, value); + else +#if __cplusplus >= 202002L + [[unlikely]] +#endif + throw std::invalid_argument("Invalid Unicode character in 3 byte UTF-8 sequence"); + } else if ((block_mode || bytes_in_accu >= 4) && (accu & 0xC0C0C0F8) == 0x808080F0) { // 4 byte sequence + char32_t value {static_cast(((accu & 0x07) << 18) | ((accu << 4) & 0x3f000) | ((accu >> 10) & 0xFC0) | ((accu >> 24) & 0x3f))}; + accu >>= 32; + bytes_in_accu -= 4; + if (is_valid_unicode<21>(value)) + append_utf(result, value); 
+ else +#if __cplusplus >= 202002L + [[unlikely]] +#endif + throw std::invalid_argument("Invalid Unicode character in 4 byte UTF-8 sequence"); + } else +#if __cplusplus >= 202002L + [[unlikely]] +#endif + throw std::invalid_argument("Invalid UTF-8 byte sequence"); + } + + // Little Endian optimized version + template && is_encoding_v, bool> = true> + typename To::string_type convert_optimized_utf(const typename From::string_type& s) + { + typename To::string_type result; + uint64_t accu{}; + int bytes_in_accu{}; + + size_t s_index{}; + size_t s_size{s.size()}; + while (s_index + 8 / sizeof(typename From::value_type) <= s_size) { + // read input + // assume: bytes_in_accu < 8 + accu |= (*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); + s_index += (8 - bytes_in_accu) / sizeof(typename From::value_type); + bytes_in_accu = 8; + + while (bytes_in_accu >= 4) { + append_accu(result, accu, bytes_in_accu); + } + } + + // 0..3 bytes left in accu + // 0..7 bytes left in s + + while (s_index < s_size || bytes_in_accu > 0) { + while (s_index < s_size && bytes_in_accu < 8) { + accu |= static_cast(*reinterpret_cast(&(s.data()[s_index]))) << (bytes_in_accu * 8); + ++s_index; + bytes_in_accu += sizeof(typename From::value_type); + } + + append_accu(result, accu, bytes_in_accu); + } + return result; + } + +} // namespace unicode + diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h index 82031d1..f0a003d 100644 --- a/include/unicode/predicate.h +++ b/include/unicode/predicate.h @@ -1,5 +1,7 @@ // -// Unicode library - predicates for Unicode characters +// Reichwein.IT Unicode Library +// +// Predicates for Unicode characters // #pragma once diff --git a/include/unicode/type_traits.h b/include/unicode/type_traits.h index 63c7d69..47789b9 100644 --- a/include/unicode/type_traits.h +++ b/include/unicode/type_traits.h @@ -1,3 +1,9 @@ +// +// Reichwein.IT Unicode Library +// +// Type traits +// + #pragma once #include "utf_fwd.h" @@ -32,7 +38,7 @@ 
namespace unicode { template struct is_char { - static const bool value{std::is_trivial_v && std::is_scalar_v && !std::is_empty_v}; + static const bool value{std::is_trivial_v && std::is_scalar_v}; }; template diff --git a/include/unicode/types.h b/include/unicode/types.h index a4461d7..6eac5f7 100644 --- a/include/unicode/types.h +++ b/include/unicode/types.h @@ -1,9 +1,20 @@ +// +// Reichwein.IT Unicode Library +// +// Basic types +// + #pragma once +// Definition of utf8_t as abstraction from char and char8_t, when available +// +// Be aware of char being signed on common architectures, while char8_t is +// unsigned. #ifdef __cpp_char8_t -// char8_t available + // char8_t available in C++20 typedef char8_t utf8_t; #else + // fallback to char typedef char utf8_t; #endif typedef char iso_t; diff --git a/include/unicode/utf.h b/include/unicode/utf.h index 1d2f28e..5db9cac 100644 --- a/include/unicode/utf.h +++ b/include/unicode/utf.h @@ -1,3 +1,13 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for support of UTF encodings +// +// Implementation of utf_iterator and utf_back_insert_iterator templates for +// validation and conversion via STL compatible iteration over standard +// containers +// + #pragma once #include "utf_fwd.h" diff --git a/include/unicode/utf_fwd.h b/include/unicode/utf_fwd.h index c42dea1..7fd9329 100644 --- a/include/unicode/utf_fwd.h +++ b/include/unicode/utf_fwd.h @@ -1,6 +1,11 @@ -#pragma once +// +// Reichwein.IT Unicode Library +// +// Forward declarations for utf.h - Functions for reading and writing UTF +// encodings +// -// Forward declarations +#pragma once #include "types.h" diff --git a/include/unicode/validation.h b/include/unicode/validation.h new file mode 100644 index 0000000..b5060c4 --- /dev/null +++ b/include/unicode/validation.h @@ -0,0 +1,78 @@ +// +// Reichwein.IT Unicode Library +// +// Functions for validation of UTF (Unicode Transformation Format) encodings +// + +#pragma once + +#include "unicode/endian.h" 
+#include "unicode/iso.h" +#include "unicode/optimization.h" +#include "unicode/predicate.h" +#include "unicode/types.h" +#include "unicode/type_traits.h" +#include "unicode/utf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace unicode { + + // First variant of is_valid_utf(): Specification of encoding explicitly + // + // e.g. + // unicode::UTF_8 + // unicode::UTF_16 + // unicode::UTF_32 + // + // see also type_traits.h and utf.h + template, bool> = true> + bool is_valid_utf(const typename Encoding::string_type& s) + { + return validate_utf(s); + } + + // Second variant of is_valid_utf(): Specification of encoding via character type + // + // see also type_traits.h for is_char + template, + std::enable_if_t, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + + // Third variant of is_valid_utf(): Specification of encoding via container type + // + // see also type_traits.h for is_container + template, bool> = true> + bool is_valid_utf(const Container& s) + { + typedef UTF, utf_back_insert_iterator> UTF_Trait; + + try { + std::for_each(UTF_Trait::begin(s), UTF_Trait::end(s), [](const char32_t& c){}); + } catch (const std::invalid_argument&) { + return false; + } + return true; + } + +} // namespace unicode + -- cgit v1.2.3