diff options
| -rw-r--r-- | Makefile | 10 | ||||
| -rw-r--r-- | README.txt | 163 | ||||
| -rw-r--r-- | debian/README.Debian | 96 | ||||
| -rw-r--r-- | debian/copyright | 2 | ||||
| -rw-r--r-- | debian/libunicode-dev.docs | 1 | ||||
| -rw-r--r-- | debian/unicode-tools.docs | 1 | ||||
| -rw-r--r-- | include/unicode.h | 4 | ||||
| -rw-r--r-- | include/unicode/predicate.h | 6 | 
8 files changed, 186 insertions, 97 deletions
| @@ -89,7 +89,10 @@ SRC=\  all: src/recode src/test-unicode src/test-performance src/validate -test: src/test-unicode src/test-performance +test: src/test-unicode +	src/test-unicode + +tests: src/test-unicode src/test-performance  	src/test-unicode  	src/test-performance @@ -148,6 +151,7 @@ DISTFILES= \  	   src/file.h \  	   src/test-helper.h \  	   Makefile \ +	   README.txt \  	   include/unicode.h \  	   include/unicode/endian.h \  	   include/unicode/iso.h \ @@ -164,13 +168,17 @@ DISTFILES= \             debian/changelog \             debian/README.Debian \             debian/rules \ +           debian/unicode-tools.docs \             debian/unicode-tools.install \ +           debian/libunicode-dev.docs \             debian/libunicode-dev.install \  	   msbuild/compiler.props \  	   msbuild/libunicode.vcxproj \  	   msbuild/libunicode.vcxproj.filters \  	   msbuild/recode.vcxproj \  	   msbuild/recode.vcxproj.filters \ +	   msbuild/test-performance.vcxproj \ +	   msbuild/test-performance.vcxproj.filters \  	   msbuild/test-unicode.vcxproj \  	   msbuild/test-unicode.vcxproj.filters \  	   msbuild/unicode.sln \ diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..9544b49 --- /dev/null +++ b/README.txt @@ -0,0 +1,163 @@ +Reichwein.IT unicode library +============================ + +This software package contains a C++ library for Unicode encoding conversion +and command line tools which apply those functions in example runtime programs: +recode and validate. + + +C++ interface (package libunicode-dev) +-------------------------------------- + +This library includes multiple encoding specification concepts to choose from: +While explicit specification of source and destination encodings is possible, +implicit specification of encoding of Unicode UTF encodings is also implemented +via the respective C++ types: For char8_t, char16_t and char32_t, the +respective UTF-8, UTF-16 and UTF-32 encoding is automatically used. 
In case of +C++17 where char8_t is not implemented, char is used instead. The same applies +for the std::basic_string<> specializations std::u8string (or std::string on +C++17), std::u16string and std::u32string. + +The main purpose of this library is conversion (and validation) between Unicode +encodings. However, Latin-1 (i.e. ISO 8859-1) and Latin-9 (i.e. ISO 8859-15) +are also implemented for practical reasons. Since the Latin character sets are +also encoded in char and std::string (at least necessarily on C++17), the Latin +encodings must be specified explicitly for disambiguation where Unicode is used +by default otherwise. I.e. UTF-8 is the default for all 8 bit character types, +UTF-16 is the default for 16 bit character types and UTF-32 is the default for +32 bit character types. + +Besides support for different character and string types from the STL, common +container types like std::vector, std::deque, std::list and std::array (the +latter only as source) are supported. + +The basic convention for the conversion interface is: + +    to = unicode::convert<FromType, ToType>(from); + +where FromType and ToType can be one of: + +(1) Character type like char, char8_t, char16_t and char32_t +(2) Container type like std::string, std::list<char>, std::deque<char32_t> +(3) Explicit encoding like unicode::UTF_8, unicode::UTF_16, unicode::UTF_32, +    unicode::ISO_8859_1 or unicode::ISO_8859_15 + +For the validation interface, the same principle applies: + +    bool flag = unicode::is_valid_utf<FromType>(from); + +There is also a Unicode character validation function which operates on Unicode +character values directly, i.e. no specific encoding is used but 32 bit (or +less) values are evaluated for a valid Unicode character: + +    bool flag = unicode::is_valid_unicode(character_value); + +While this validates a Unicode value in general, it doesn't tell if the +specified value is actually designated in an actual Unicode version. E.g. 
as of +2022, in current Unicode version 14.0, the character 0x1FABA "NEST WITH EGGS" +is designated, but not 0x1FABB. Both of them would be detected as "valid" by +unicode::is_valid_unicode(). See also: + +https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + + +Examples: + +#include <unicode.h> +... + +C++17 conversion of a UTF-8 string to UTF-16: + +  std::string utf8_value {u8"äöü"}; +  std::u16string utf16_value{unicode::convert<char, char16_t>(utf8_value)}; + +C++20 conversion of a UTF-8 string to UTF-16: + +  std::u8string utf8_value {u8"äöü"}; +  std::u16string utf16_value{unicode::convert<char8_t, char16_t>(utf8_value)}; + +The following encodings are implicitly deduced from types: +  * char resp. char8_t (C++20): UTF-8 +  * char16_t: UTF-16 +  * char32_t: UTF-32 + +Specification via container types: +   +  std::deque<char> utf8_value {...}; +  std::list<wchar_t> utf16_value{unicode::convert<std::deque<char>, std::list<wchar_t>>(utf8_value)}; + +Explicit encoding specification: + +  std::string value {"äöü"}; +  std::u32string utf32_value{unicode::convert<unicode::ISO_8859_1, unicode::UTF_32>(value)}; + +Supported encodings are: + +  * unicode::UTF_8 +  * unicode::UTF_16 +  * unicode::UTF_32 +  * unicode::ISO_8859_1 +  * unicode::ISO_8859_15 + +Supported basic types for source and target characters: +  * char +  * char8_t (C++20) +  * wchar_t (UTF-16 on Windows, UTF-32 on Linux) +  * char16_t +  * char32_t +  * uint8_t, int8_t +  * uint16_t, int16_t +  * uint32_t, int32_t +  * basically, all basic 8-bit, 16-bit and 32-bit types that can encode +    UTF-8, UTF-16 and UTF-32, respectively. 
+ +Supported container types: +  * All std container types that can be iterated (vector, list, deque, array) +  * Source and target containers can be different container types + +Validation can be done like this: + +  bool valid{unicode::is_valid_utf<char16_t>(utf16_value)}; + +Or via explicit encoding specification: + +  bool valid{unicode::is_valid_utf<unicode::UTF_8>(utf8_value)}; + + +CLI interface (package unicode-tools) +------------------------------------- + +* unicode-recode + +  Usage: recode <from-format> <from-file> <to-format> <to-file> +  Format: +      UTF-8       UTF-8 +      UTF-16      UTF-16, native endian +      UTF-16LE    UTF-16, little endian +      UTF-16BE    UTF-16, big endian +      UTF-32      UTF-32, native endian +      UTF-32LE    UTF-32, little endian +      UTF-32BE    UTF-32, big endian +      ISO-8859-1  ISO-8859-1 (Latin-1) +      ISO-8859-15 ISO-8859-15 (Latin-9) +  Exit code: 0 if valid, 1 otherwise. + +* unicode-validate + +  Usage: validate <format> <file> +  Format: +      UTF-8     UTF-8 +      UTF-16    UTF-16, big or little endian +      UTF-16LE  UTF-16, little endian +      UTF-16BE  UTF-16, big endian +      UTF-32    UTF-32, big or little endian +      UTF-32LE  UTF-32, little endian +      UTF-32BE  UTF-32, big endian +  Exit code: 0 if valid, 1 otherwise. + + +Contact +------- + +Reichwein IT <mail@reichwein.it> + diff --git a/debian/README.Debian b/debian/README.Debian index 0a47d0a..29bd4b9 100644 --- a/debian/README.Debian +++ b/debian/README.Debian @@ -3,101 +3,7 @@ unicode for Debian  This package is the Debian version of unicode, a C++ library for Unicode encoding. 
- -CLI interface (package unicode-tools) -------------------------------------- - -* unicode-recode - -  Usage: recode <from-format> <from-file> <to-format> <to-file> -  Format: -      UTF-8       UTF-8 -      UTF-16      UTF-16, native endian -      UTF-16LE    UTF-16, little endian -      UTF-16BE    UTF-16, big endian -      UTF-32      UTF-32, native endian -      UTF-32LE    UTF-32, little endian -      UTF-32BE    UTF-32, big endian -      ISO-8859-1  ISO-8859-1 (Latin-1) -      ISO-8859-15 ISO-8859-15 (Latin-9) -  Exit code: 0 if valid, 1 otherwise. - -* unicode-validate - -  Usage: validate <format> <file> -  Format: -      UTF-8     UTF-8 -      UTF-16    UTF-16, big or little endian -      UTF-16LE  UTF-16, little endian -      UTF-16BE  UTF-16, big endian -      UTF-32    UTF-32, big or little endian -      UTF-32LE  UTF-32, little endian -      UTF-32BE  UTF-32, big endian -  Exit code: 0 if valid, 1 otherwise. - - -C++ interface (package libunicode-dev) --------------------------------------- - -Example: - -#include <unicode.h> -... - -  std::string utf8_value {u8"äöü"}; -  std::u16string utf16_value{unicode::convert<char, char16_t>(utf8_value)}; - -And for C++20: - -  std::u8string utf8_value {u8"äöü"}; -  std::u16string utf16_value{unicode::convert<char8_t, char16_t>(utf8_value)}; - -The following encodings are implicitly deducted from types: -  * char resp. 
char8_t (C++20): UTF-8 -  * char16_t: UTF-16 -  * char32_t: UTF-32 - -You can specify different container types directly: -   -  std::deque<char> utf8_value {...}; -  std::list<wchar_t> utf16_value{unicode::convert<std::deque<char>, std::list<wchar_t>>(utf8_value)}; - -Explicit encoding specification is also possible: - -  std::string value {"äöü"}; -  std::u32string utf32_value{unicode::convert<unicode::ISO_8859_1, unicode::UTF_32>(value)}; - -Supported encodings are: - -  * unicode::UTF_8 -  * unicode::UTF_16 -  * unicode::UTF_32 -  * unicode::ISO_8859_1 -  * unicode::ISO_8859_15 - -Supported basic types: -  * char -  * char8_t (C++20) -  * wchar_t (UTF-16 on Windows, UTF-32 on Linux) -  * char16_t -  * char32_t -  * uint8_t, int8_t -  * uint16_t, int16_t -  * uint32_t, int32_t -  * basically, all basic 8-bit, 16-bit and 32-bit that can encode -    UTF-8, UTF-16 and UTF-32, respectively. - -Supported container types: -  * All std container types that can be iterated (vector, list, deque, array) -  * Source and target containers can be different container types - -Validation can be done like this: - -  bool valid{unicode::is_valid_utf<char16_t>(utf16_value)}; - -Or via explicit encoding specification: - -  bool valid{unicode::is_valid_utf<unicode::UTF_8>(utf8_value)}; +See README.txt for usage.  
Contact diff --git a/debian/copyright b/debian/copyright index 850d014..b82bee0 100644 --- a/debian/copyright +++ b/debian/copyright @@ -1,4 +1,4 @@ -Author: Roland Reichwein <mail@reichwein.it>, 2021 +Author: Roland Reichwein <mail@reichwein.it>, 2021, 2022  Both upstream source code and Debian packaging is available  under the conditions of CC0 1.0 Universal diff --git a/debian/libunicode-dev.docs b/debian/libunicode-dev.docs new file mode 100644 index 0000000..71dfd5b --- /dev/null +++ b/debian/libunicode-dev.docs @@ -0,0 +1 @@ +README.txt diff --git a/debian/unicode-tools.docs b/debian/unicode-tools.docs new file mode 100644 index 0000000..71dfd5b --- /dev/null +++ b/debian/unicode-tools.docs @@ -0,0 +1 @@ +README.txt diff --git a/include/unicode.h b/include/unicode.h index feb012a..d033f63 100644 --- a/include/unicode.h +++ b/include/unicode.h @@ -4,6 +4,9 @@  //  // Available under the conditions of CC0 1.0 Universal  // https://creativecommons.org/publicdomain/zero/1.0/ +// +// This is the main library header, including all other headers of this library. +//  #pragma once @@ -26,6 +29,7 @@  namespace unicode { + // Helper function: Item distance of specified iterators   // std::distance doesn't work here: it is based on "output" distance of iterators   template<class Iterator>   inline size_t input_distance(const Iterator& it1, const Iterator& it2) diff --git a/include/unicode/predicate.h b/include/unicode/predicate.h index e326529..82031d1 100644 --- a/include/unicode/predicate.h +++ b/include/unicode/predicate.h @@ -6,6 +6,12 @@  namespace unicode { + // Detection of a valid Unicode code point value. Independent of encoding. + // + // Note: This doesn't tell if the specified value is actually allocated in an + // existing Unicode version, but rather just detects if the value is inside + // allocatable range. + //    // bits_to_compare: limit bits to consider even further than defined by T   // T: usually, char32_t, uint32_t etc.   
template<size_t bits_to_compare = 32, typename T> | 
