summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
-rw-r--r--include/unicode.h40
-rw-r--r--src/test-unicode.cpp129
2 files changed, 136 insertions, 33 deletions
diff --git a/include/unicode.h b/include/unicode.h
index a55eac3..f539e6b 100644
--- a/include/unicode.h
+++ b/include/unicode.h
@@ -1,5 +1,4 @@
// libunicode
-// Copyright (C) 2021 Roland Reichwein
#pragma once
@@ -7,11 +6,20 @@
#include <stdexcept>
#include <string>
-#ifdef __has_cpp_attribute
-#if __has_cpp_attribute(__cpp_char8_t)
+#ifdef __cpp_char8_t
// char8_t available
#endif
-#endif
+
+namespace unicode {
+
+ // T is usually char32_t, uint32_t etc.; true if value is at most 0x10FFFF and outside the surrogate range 0xD800..0xDFFF
+ template<typename T>
+ static inline bool is_valid_unicode(const T& value)
+ {
+ return value <= 0x10FFFF && (value <= 0xD7FF || value >= 0xE000);
+ }
+
+}
namespace {
@@ -50,6 +58,8 @@ namespace {
template<typename T1>
void calculate_value()
{
+ static_assert(sizeof(T1) == 4);
+
size_t remaining{remaining_code_units()};
if (!remaining)
@@ -57,7 +67,7 @@ namespace {
value = get_code_unit<0>();
- if (value > 0x10FFFF || (value > 0xD7FF && value < 0xE000))
+ if (!unicode::is_valid_unicode(value))
throw std::invalid_argument("Invalid Unicode character: "s + std::to_string(static_cast<uint32_t>(value)));
sequence_length = 1;
@@ -88,7 +98,7 @@ namespace {
template<typename... Targs>
inline static char32_t continuation_value(T b, Targs... Fargs)
{
- return continuation_value(b) << 6 | continuation_value(Fargs...);
+ return continuation_value(b) << (6 * sizeof...(Targs)) | continuation_value(Fargs...);
}
template<size_t n>
@@ -159,7 +169,7 @@ namespace {
if ((unit0 & 0xFC00) != 0xD800 || (unit1 & 0xFC00) != 0xDC00)
throw std::invalid_argument("Bad input: 2 malformed UTF-16 surrogates");
- value = static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF);
+ value = (static_cast<char32_t>(unit0 & 0x03FF) << 10 | (unit1 & 0x03FF)) + 0x10000;
sequence_length = 2;
}
}
@@ -185,7 +195,7 @@ namespace {
typename string_type::const_iterator iterator;
typename string_type::const_iterator end_iterator;
- value_type value{};
+ char32_t value{}; // always stores the complete Unicode code point
size_t sequence_length{};
};
@@ -276,8 +286,9 @@ namespace {
if (value <= 0xFFFF) { // expect value to be already valid Unicode values
s.push_back(value);
} else {
- s.push_back((value >> 10) + 0xD800);
- s.push_back((value & 0x3FF) + 0xDC00);
+ char32_t value_reduced{value - 0x10000};
+ s.push_back((value_reduced >> 10) + 0xD800);
+ s.push_back((value_reduced & 0x3FF) + 0xDC00);
}
return *this;
}
@@ -317,14 +328,5 @@ std::basic_string<To> utf_to_utf(const std::basic_string<From>& s)
return result;
}
-//std::u8string utf16_to_utf8(const std::u16string& s)
-//{
-// std::u8string result;
-//
-// std::transform(utf16_begin(s), utf16_end(s), std::back_inserter(result));
-//
-// return result;
-//}
-
} // namespace unicode
diff --git a/src/test-unicode.cpp b/src/test-unicode.cpp
index 0560c1b..2cc8393 100644
--- a/src/test-unicode.cpp
+++ b/src/test-unicode.cpp
@@ -1,17 +1,83 @@
#define BOOST_TEST_MODULE unicode_test
#include <boost/test/included/unit_test.hpp>
+#include <boost/test/data/dataset.hpp>
+#include <boost/test/data/monomorphic.hpp>
+#include <boost/test/data/test_case.hpp>
+#include <exception>
#include <string>
#include <tuple>
#include <type_traits>
+#include <vector>
#include <unicode.h>
-std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> t {
- u8"Täst", u"Täst", U"Täst"
+typedef std::tuple<std::basic_string<char8_t>, std::basic_string<char16_t>, std::basic_string<char32_t>> types_collection_type;
+
+// create tuple of the same string, in UTF-8, UTF-16 and UTF-32
+#define SUCCESS_TUPLE(x) {u8 ## x, u ## x, U ## x}
+
+// Success cases: convert string to all other types, respectively
+std::vector<types_collection_type> success_sets {
+ SUCCESS_TUPLE(""),
+ SUCCESS_TUPLE("ASCII string1"),
+ SUCCESS_TUPLE("Täst just looks like German"),
+ SUCCESS_TUPLE("\u732b is chinese for cat"),
+ SUCCESS_TUPLE("\U0001F63A"),
+ SUCCESS_TUPLE("\U0001F63A is a smiling cat"),
+};
+
+// Error cases: converting these strings to any target type is expected to throw
+std::vector<std::basic_string<char8_t>> failure_strings_char8_t {
+ u8"\x80",
+ u8"\x81"
+};
+
+std::vector<std::basic_string<char16_t>> failure_strings_char16_t {
+ u"\xD801",
+};
+
+std::vector<std::basic_string<char32_t>> failure_strings_char32_t {
+ U"\xD801",
+ U"\x10000000",
};
+// output operators must be in same namespace as the type itself
+namespace std {
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char8_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+ os << " " << std::to_string(static_cast<uint8_t>(c));
+ os << "]";
+
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char16_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+ os << " " << std::to_string(static_cast<uint16_t>(c));
+ os << "]";
+
+ return os;
+}
+
+std::ostream& operator<<(std::ostream& os, std::basic_string<char32_t> const& s)
+{
+ os << "[";
+ for (auto& c: s)
+ os << " " << std::to_string(static_cast<uint32_t>(c));
+ os << "]";
+
+ return os;
+}
+
+}
+
template<size_t i = 0, size_t j = 0, typename... Ts>
void test_utf_to_utf(std::tuple<Ts...>& t)
{
@@ -21,7 +87,7 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
// test
To result { unicode::utf_to_utf<typename From::value_type, typename To::value_type>(std::get<i>(t)) };
- BOOST_CHECK(std::get<j>(t) == result);
+ BOOST_CHECK_MESSAGE(std::get<j>(t) == result, "From " << typeid(From).name() << "(" << i << ", " << std::get<i>(t) << ") to " << typeid(To).name() << "(" << j << ", " << std::get<j>(t) << "), got " << result);
//std::cout << std::to_string(std::tuple_size<typename std::remove_reference<decltype(t)>::type>::value) << "," << std::to_string(i) << "," << std::to_string(j) << std::endl;
@@ -32,27 +98,62 @@ void test_utf_to_utf(std::tuple<Ts...>& t)
test_utf_to_utf<0, j + 1>(t);
}
-BOOST_AUTO_TEST_CASE(utf_to_utf)
+// We don't use BOOST_DATA_TEST_CASE here because boost::test tries to assign
+// a new variable to each tuple element which we don't want
+// https://lists.boost.org/boost-bugs/2016/05/45214.php
+
+BOOST_AUTO_TEST_CASE(utf_to_utf_success)
{
- test_utf_to_utf(t);
+ for (auto& t: success_sets)
+ test_utf_to_utf(t);
}
-BOOST_AUTO_TEST_CASE(utf8_to_utf16)
+// iterate over the element types of the std::tuple type Collection, using each one as conversion target
+template<typename From, typename Collection, size_t index = 0>
+void test_utf_to_utf_failure(std::basic_string<From>& s)
{
- std::u8string u8{u8"ascii string1"};
-
- std::u16string u16{unicode::utf_to_utf<char8_t, char16_t>(u8)};
+ typedef typename std::tuple_element<index, Collection>::type::value_type To;
- BOOST_CHECK(u16 == u"ascii string1");
+ try {
+ unicode::utf_to_utf<From,To>(s);
+ BOOST_FAIL("Expected exception at index: " << index << ", " << typeid(From).name() << " -> " << typeid(To).name());
+ } catch (...) {
+ // OK
+ };
+
+ // iterate over remaining types
+ if constexpr (index + 1 < std::tuple_size<Collection>::value)
+ test_utf_to_utf_failure<From, Collection, index + 1>(s);
}
-BOOST_AUTO_TEST_CASE(utf16_to_utf8)
+BOOST_AUTO_TEST_CASE(utf_to_utf_failure)
{
- std::u16string u16{u"ascii string1"};
+ for (auto& s: failure_strings_char8_t)
+ test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
- std::u8string u8{unicode::utf_to_utf<char16_t, char8_t>(u16)};
+ for (auto& s: failure_strings_char16_t)
+ test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
+
+ for (auto& s: failure_strings_char32_t)
+ test_utf_to_utf_failure<typename std::remove_reference<decltype(s)>::type::value_type, types_collection_type>(s);
+}
+
+BOOST_AUTO_TEST_CASE(is_valid_unicode)
+{
+ BOOST_CHECK(unicode::is_valid_unicode('\0'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'a'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'ä'));
+ BOOST_CHECK(unicode::is_valid_unicode(U'\u732b')); // cat chinese
+ BOOST_CHECK(unicode::is_valid_unicode(U'\U0001F63A')); // cat smiley
+ BOOST_CHECK(unicode::is_valid_unicode(0x0001F63A)); // cat smiley
- BOOST_CHECK(u8 == u8"ascii string1");
+ BOOST_CHECK(!unicode::is_valid_unicode(0x00110000));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xFFFFFFFF)); // U"\UFFFFFFFF" is invalid C++
+ BOOST_CHECK(!unicode::is_valid_unicode(0x01234567));
+ BOOST_CHECK(!unicode::is_valid_unicode(0x12345678));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xD800));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xD987));
+ BOOST_CHECK(!unicode::is_valid_unicode(0xDFFF));
}
// TODO: