diff options
| -rw-r--r-- | include/unicode.h | 73 | ||||
| -rw-r--r-- | src/test-unicode.cpp | 15 | 
2 files changed, 69 insertions, 19 deletions
| diff --git a/include/unicode.h b/include/unicode.h index b90ed15..512891a 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -4,8 +4,15 @@  #pragma once  #include <algorithm> +#include <stdexcept>  #include <string> +#ifdef __has_cpp_attribute +#if __has_cpp_attribute(__cpp_char8_t) +// char8_t available +#endif +#endif +  namespace {   struct utf8_iterator @@ -13,22 +20,57 @@ namespace {    typedef char32_t value_type;    typedef char32_t& reference; -  void get_value() +  utf8_iterator(const std::u8string::const_iterator& cbegin, const std::u8string::const_iterator& cend): +   iterator(cbegin), end_iterator(cend)    { -   // TODO: set value to current data in *iterator ... -   value = 'X'; +   calculate_value();    } -  size_t get_number_of_utf8_bytes() +  utf8_iterator(const utf8_iterator& other) = default; +  utf8_iterator& operator=(const utf8_iterator& other) = default; + +  // set value member +  void calculate_value()    { -   // TODO: how many bytes -   return 1; +   if (iterator == end_iterator) +    return; + +   char8_t first_byte {*iterator}; +   if (first_byte & 0x80) { // 2-4 bytes +    if (iterator + 1 != end_iterator) { +     char8_t second_byte {*(iterator + 1)}; +     if ((first_byte & 0b11100000) == 0b11000000 && (second_byte & 0b11000000) == 0b10000000) { // 2 bytes +      value = char32_t(first_byte & 0b11111) << 6 | (second_byte & 0b111111); +      sequence_length = 2; +     } else if (iterator + 2 != end_iterator) { +      char8_t third_byte {*(iterator + 2)}; +      if ((first_byte & 0b11110000) == 0b11100000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000) { // 3 bytes +       value = char32_t(first_byte & 0b1111) << 12 | char32_t(second_byte & 0b111111) << 6 | (third_byte & 0b111111); +       sequence_length = 3; +      } else if (iterator + 3 != end_iterator) { +       char8_t fourth_byte {*(iterator + 3)}; +       if ((first_byte & 0b11111000) == 0b11110000 && (second_byte & 0b11000000) == 0b10000000 && (third_byte & 0b11000000) == 0b10000000 && (fourth_byte & 0b11000000) == 0b10000000) { // 4 bytes +        value = char32_t(first_byte & 0b111) << 18 | char32_t(second_byte & 0b111111) << 12 | char32_t(third_byte & 0b111111) << 6 | (fourth_byte & 0b111111); +        sequence_length = 4; +       } else +        throw std::invalid_argument("bad input: invalid 4 byte sequence"); +      } else +       throw std::invalid_argument("bad input: invalid 3 byte sequence"); +     } else +      throw std::invalid_argument("bad input: invalid 2 byte sequence"); +    } else +     throw std::invalid_argument("bad input: byte 2 expected, none found"); +   } else { // 1 byte: 7 bit ASCII +    value = first_byte; +    sequence_length = 1; +   }    }    // pre-increment    utf8_iterator& operator++()    { -   iterator += get_number_of_utf8_bytes(); +   iterator += sequence_length; +   calculate_value();     return *this;    } @@ -39,14 +81,14 @@ namespace {    reference operator*()    { -   get_value();     return value;    }    std::u8string::const_iterator iterator; -    std::u8string::const_iterator end_iterator; +    value_type value{}; +  size_t sequence_length{};   };   struct utf16_back_insert_iterator @@ -70,7 +112,12 @@ namespace {    // append utf-16 word sequence    reference operator=(const char32_t& value)    { -   s.push_back(0); // TODO +   if (value <= 0xFFFF) { // expect value to be already valid Unicode values, TODO: validate char32_t! +    s.push_back(value); +   } else { +    s.push_back((value >> 10) + 0xD800); +    s.push_back((value & 0x3FF) + 0xDC00); +   }     return *this;    } @@ -96,12 +143,6 @@ namespace {  namespace unicode { -// returns number of bytes in UTF-8 byte sequence of first found code point, -// if found. 0 if none found or sequence empty. -//size_t utf8_start() -//{ -//} -  std::u16string utf8_to_utf16(const std::u8string& s)  {   std::u16string result; diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp index 4576d06..41fcd20 100644 --- a/src/test-unicode.cpp +++ b/src/test-unicode.cpp @@ -4,14 +4,23 @@  #include <string> -//#include <unicode.h> +#include <unicode.h>  BOOST_AUTO_TEST_CASE(utf8_to_utf16)  {   std::u8string u8{u8"ascii string1"}; - //std::u16string u16{unicode::utf8_to_utf16(u8)}; + std::u16string u16{unicode::utf8_to_utf16(u8)}; - //BOOST_CHECK_EQUAL(u16, u"ascii string1"); + BOOST_CHECK(u16 == u"ascii string1");  } +// TODO: +//  invalid bytes +//  an unexpected continuation byte +//  a non-continuation byte before the end of the character +//  the string ending before the end of the character (which can happen in simple string truncation) +//  an overlong encoding +//  a sequence that decodes to an invalid code point +// +//  high and low surrogate halves used by UTF-16 (U+D800 through U+DFFF) and code points not encodable by UTF-16 (those after U+10FFFF) | 
