diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-12-21 20:11:06 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-12-21 20:11:06 +0100 |
commit | 65c8f3fa5856b216816a25c2d3dc9d87bd3896ee (patch) | |
tree | 9d6b1c66021301ab1e83c9866196e47b2ad2405c /include | |
parent | 3ca9f389084a2defe1fff2046dd3450e0b242e58 (diff) |
Optimize on UTF input and output
Diffstat (limited to 'include')
-rw-r--r-- | include/unicode.h | 69 |
1 files changed, 33 insertions, 36 deletions
```diff
diff --git a/include/unicode.h b/include/unicode.h
index 7965a6e..43dc44e 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -115,16 +115,10 @@ namespace unicode::detail {
   template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
   inline value_type calculate_value()
   {
-   size_t remaining{remaining_code_units()};
-
-   if (!remaining)
-    return {};
-
-   value_type value{};
-
    utf8_t byte0 {static_cast<utf8_t>(get_code_unit<0>())};
    if (byte0 & 0x80) { // 2-4 bytes
-    if (remaining >= 2) {
+    value_type value{};
+    if (size_t remaining{remaining_code_units()}; remaining >= 2) {
      utf8_t byte1 {static_cast<utf8_t>(get_code_unit<1>())};
      if (is_byte0_of<2>(byte0) && is_continuation_byte(byte1)) { // 2 bytes
       value = value_byte0_of<2>(byte0) | continuation_value(byte1);
@@ -152,29 +146,23 @@ namespace unicode::detail {
 
     if (!unicode::is_valid_unicode(value))
      throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
+    return value;
    } else { // 1 byte: 7 bit ASCII
-    value = byte0;
     std::advance(iterator, 1);
+    return byte0;
    }
-
-   return value;
   }
 
   template<class X = T, typename std::enable_if<(sizeof(X) == 2), bool>::type = true>
   inline value_type calculate_value()
   {
-   size_t remaining{remaining_code_units()};
-
-   if (!remaining)
-    return {};
-
    char16_t unit0 {static_cast<char16_t>(get_code_unit<0>())};
 
    if (unit0 <= 0xD7FF || unit0 >= 0xE000) { // 1 unit (BMP Basic Multilingual Plane)
     std::advance(iterator, 1);
     return unit0;
    } else {
-    if (remaining < 2)
+    if (remaining_code_units() < 2)
      throw std::invalid_argument("Bad input: Continuation of first UTF-16 unit missing");
 
     char16_t unit1 {static_cast<char16_t>(get_code_unit<1>())};
@@ -189,11 +177,6 @@ namespace unicode::detail {
   template<class X = T, typename std::enable_if<(sizeof(X) == 4), bool>::type = true>
   inline value_type calculate_value()
   {
-   size_t remaining{remaining_code_units()};
-
-   if (!remaining)
-    return {};
-
    value_type result {static_cast<char32_t>(get_code_unit<0>())};
 
    if (!unicode::is_valid_unicode(result))
@@ -284,23 +267,38 @@ namespace unicode::detail {
    return trailing_byte<m - n - 1>(value);
   }
 
+  template<typename Arg>
+  inline void append(Arg&& arg)
+  {
+   if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+    s.append({arg});
+   } else {
+    s.emplace_back(arg);
+   }
+  }
+
+  template<typename Arg, typename... Args>
+  inline void append(Arg&& arg, Args&&... args)
+  {
+   if constexpr (std::is_same<Container, typename std::basic_string<T>>::value) {
+    s.append({arg, args...});
+   } else {
+    s.emplace_back(arg);
+    append(args...);
+   }
+  }
+
   template<class X = T, typename std::enable_if<(sizeof(X) == 1), bool>::type = true>
   inline void append_utf(const char32_t& value)
   {
    if (value < 0x80) { // 1 byte
-    s.push_back(static_cast<value_type>(value));
+    append(static_cast<value_type>(value));
    } else if (value < 0x800) { // 2 bytes
-    s.push_back(byte_n_of_m<0,2>(value));
-    s.push_back(byte_n_of_m<1,2>(value));
+    append(byte_n_of_m<0,2>(value), byte_n_of_m<1,2>(value));
    } else if (value < 0x10000) { // 3 bytes
-    s.push_back(byte_n_of_m<0,3>(value));
-    s.push_back(byte_n_of_m<1,3>(value));
-    s.push_back(byte_n_of_m<2,3>(value));
+    append(byte_n_of_m<0,3>(value), byte_n_of_m<1,3>(value), byte_n_of_m<2,3>(value));
    } else if (value < 0x110000) { // 4 bytes
-    s.push_back(byte_n_of_m<0,4>(value));
-    s.push_back(byte_n_of_m<1,4>(value));
-    s.push_back(byte_n_of_m<2,4>(value));
-    s.push_back(byte_n_of_m<3,4>(value));
+    append(byte_n_of_m<0,4>(value), byte_n_of_m<1,4>(value), byte_n_of_m<2,4>(value), byte_n_of_m<3,4>(value));
    } else
     throw std::runtime_error("Invalid internal Unicode value: "s + std::to_string(static_cast<uint32_t>(value)));
   }
@@ -309,11 +307,10 @@ namespace unicode::detail {
   inline void append_utf(const char32_t& value)
   {
    if (value <= 0xFFFF) { // expect value to be already valid Unicode values (checked in input iterator)
-    s.push_back(static_cast<value_type>(value));
+    append(static_cast<value_type>(value));
    } else {
     char32_t value_reduced{value - 0x10000};
-    s.push_back((value_reduced >> 10) + 0xD800);
-    s.push_back((value_reduced & 0x3FF) + 0xDC00);
+    append(static_cast<T>((value_reduced >> 10) + 0xD800), static_cast<T>((value_reduced & 0x3FF) + 0xDC00));
    }
   }
 
@@ -321,7 +318,7 @@ namespace unicode::detail {
   inline void append_utf(const char32_t& value)
   {
    // expect value to be already valid Unicode values (checked in input iterator)
-   s.push_back(value);
+   append(static_cast<value_type>(value));
   }
 
   reference operator=(const char32_t& value)
```