diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-01-27 22:21:04 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-01-27 22:21:04 +0100 |
commit | cd4fad54c0be9fb7fca57e8e03228b8b649b5b51 (patch) | |
tree | 6b688a27597791bfea60d533f985061f1e6f9e06 /include | |
parent | fad8b697dff7c7b47f034124ea6eef25e74bd7af (diff) |
Bugfixes, tests
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 40 |
1 files changed, 21 insertions, 19 deletions
diff --git a/include/unicode.h b/include/unicode.h index a55eac3..f539e6b 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -1,5 +1,4 @@ // libunicode -// Copyright (C) 2021 Roland Reichwein #pragma once @@ -7,11 +6,20 @@ #include <stdexcept> #include <string> -#ifdef __has_cpp_attribute -#if __has_cpp_attribute(__cpp_char8_t) +#ifdef __cpp_char8_t // char8_t available #endif -#endif + +namespace unicode { + + // usually, char32_t, uint32_t etc. + template<typename T> + static inline bool is_valid_unicode(const T& value) + { + return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000); + } + +} namespace { @@ -50,6 +58,8 @@ namespace { template<typename T1> void calculate_value() { + static_assert(sizeof(T1) == 4); + size_t remaining{remaining_code_units()}; if (!remaining) @@ -57,7 +67,7 @@ namespace { value = get_code_unit<0>(); - if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000)) + if (!unicode::is_valid_unicode(value)) throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value))); sequence_length = 1; @@ -88,7 +98,7 @@ namespace { template<typename... Targs> inline static char32_t continuation_value(T b, Targs... Fargs) { - return continuation_value(b) << 6 | continuation_value(Fargs...); + return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...); } template<size_t n> @@ -159,7 +169,7 @@ namespace { if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00) throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates"); - value = static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF); + value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000; sequence_length = 2; } } @@ -185,7 +195,7 @@ namespace { typename string_type::const_iterator iterator; typename string_type::const_iterator end_iterator; - value_type value{}; + char32_t value{}; // always save complete unicode code point at this point size_t sequence_length{}; }; @@ -276,8 +286,9 @@ namespace { if (value <= 0xFFFF) { // expect value to be already valid Unicode values s.push_back(value); } else { - s.push_back((value >> 10) + 0xD800); - s.push_back((value & 0x3FF) + 0xDC00); + char32_t value_reduced{value - 0x10000}; + s.push_back((value_reduced >> 10) + 0xD800); + s.push_back((value_reduced & 0x3FF) + 0xDC00); } return *this; } @@ -317,14 +328,5 @@ std::basic_string<To> utf_to_utf(const std::basic_string<From>& s) return result; } -//std::u8string utf16_to_utf8(const std::u16string& s) -//{ -// std::u8string result; -// -// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result)); -// -// return result; -//} - } // namespace unicode |