diff options
| -rw-r--r-- | Makefile | 14 | ||||
| -rw-r--r-- | debian/libunicode-dev.install | 1 | ||||
| -rw-r--r-- | debian/unicode-tools.install | 2 | ||||
| -rw-r--r-- | src/file.cpp | 36 | ||||
| -rw-r--r-- | src/file.h | 13 | ||||
| -rw-r--r-- | src/recode.cpp | 173 | ||||
| -rw-r--r-- | src/validate.cpp | 151 | 
7 files changed, 385 insertions, 5 deletions
@@ -59,6 +59,7 @@ LIBS+= \
 endif
 
 SRC=\
+    src/file.cpp \
     src/recode.cpp \
     src/validate.cpp \
     src/test-unicode.cpp
@@ -68,13 +69,13 @@ all: src/recode src/test-unicode src/validate
 test: src/test-unicode
 	src/test-unicode
 
-src/recode: src/recode.o dep
-	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/recode: src/recode.o src/file.o dep
+	$(CXX) $(LDFLAGS) src/recode.o src/file.o $(LDLIBS) $(LIBS) -o $@
 
-src/validate: src/validate.o dep
-	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
+src/validate: src/validate.o src/file.o dep
+	$(CXX) $(LDFLAGS) src/validate.o src/file.o $(LDLIBS) $(LIBS) -o $@
 
-src/test-unicode: src/test-unicode.o dep
+src/test-unicode: src/test-unicode.o src/file.o dep
 	$(CXX) $(LDFLAGS) $< $(LDLIBS) $(LIBS) -o $@
 
 dep: $(SRC:.cpp=.d)
@@ -93,6 +94,9 @@ clean:
 install:
 	mkdir -p $(DESTDIR)/usr/include
 	cp include/unicode.h $(DESTDIR)/usr/include
+	mkdir -p $(DESTDIR)/usr/bin
+	cp src/recode $(DESTDIR)/usr/bin/unicode-recode
+	cp src/validate $(DESTDIR)/usr/bin/unicode-validate
 
 deb:
 	# build binary deb package
diff --git a/debian/libunicode-dev.install b/debian/libunicode-dev.install
new file mode 100644
index 0000000..92c3336
--- /dev/null
+++ b/debian/libunicode-dev.install
@@ -0,0 +1 @@
+usr/include/unicode.h
diff --git a/debian/unicode-tools.install b/debian/unicode-tools.install
new file mode 100644
index 0000000..11d89bb
--- /dev/null
+++ b/debian/unicode-tools.install
@@ -0,0 +1,2 @@
+usr/bin/unicode-recode
+usr/bin/unicode-validate
diff --git a/src/file.cpp b/src/file.cpp
new file mode 100644
index 0000000..571a9f8
--- /dev/null
+++ b/src/file.cpp
@@ -0,0 +1,36 @@
+#include "file.h"
+
+#include <fstream>
+
+namespace fs = std::filesystem;
+
+using namespace std::string_literals;
+// Return the complete contents of filename (read in binary mode); throws std::runtime_error if it cannot be opened.
+std::string unicode::File::getFile(const fs::path& filename)
+{
+ std::ifstream file(filename.string(), std::ios::in | std::ios::binary | std::ios::ate);
+
+ if (file.is_open()) {
+  std::ifstream::pos_type fileSize { file.tellg() }; // opened with std::ios::ate, so the position is the file size
+  file.seekg(0, std::ios::beg);
+
+  std::string bytes(fileSize, '\0');
+  file.read(reinterpret_cast<char*>(bytes.data()), fileSize);
+
+  return bytes;
+
+ } else {
+  throw std::runtime_error("Opening "s + filename.string() + " for reading");
+ }
+}
+// Write s to filename in binary mode; throws std::runtime_error if the file cannot be opened.
+void unicode::File::setFile(const fs::path& filename, const std::string& s)
+{
+ std::ofstream file(filename.string(), std::ios::out | std::ios::binary);
+ if (file.is_open()) {
+  file.write(s.data(), s.size());
+ } else {
+  throw std::runtime_error("Opening "s + filename.string() + " for writing");
+ }
+}
+
diff --git a/src/file.h b/src/file.h
new file mode 100644
index 0000000..d2e396b
--- /dev/null
+++ b/src/file.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <string>
+#include <vector>
+
+namespace unicode::File {
+ // Whole-file helpers: getFile() returns the file's bytes, setFile() writes s; both throw std::runtime_error on open failure.
+ std::string getFile(const std::filesystem::path& filename);
+ void setFile(const std::filesystem::path& filename, const std::string& s);
+
+}
diff --git a/src/recode.cpp b/src/recode.cpp
index 8927fe4..b8ada69 100644
--- a/src/recode.cpp
+++ b/src/recode.cpp
@@ -1,4 +1,177 @@
+#include "file.h"
+
+#include "unicode.h"
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/endian.hpp>
+
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+
+namespace fs = std::filesystem;
+
+using namespace std::string_literals;
+
+namespace {
+// Print command line help to stdout.
+void usage()
+{
+ std::cout << "Usage: recode <from-format> <from-file> <to-format> <to-file>" << std::endl;
+ std::cout << "Format:" << std::endl;
+ std::cout << "    UTF-8       UTF-8" << std::endl;
+ std::cout << "    UTF-16      UTF-16, native endian" << std::endl;
+ std::cout << "    UTF-16LE    UTF-16, little endian" << std::endl;
+ std::cout << "    UTF-16BE    UTF-16, big endian" << std::endl;
+ std::cout << "    UTF-32      UTF-32, native endian" << std::endl;
+ std::cout << "    UTF-32LE    UTF-32, little endian" << std::endl;
+ std::cout << "    UTF-32BE    UTF-32, big endian" << std::endl;
+ std::cout << "    ISO-8859-1  ISO-8859-1 (Latin-1)" << std::endl;
+ std::cout << "    ISO-8859-15 ISO-8859-15 (Latin-9)" << std::endl;
+ std::cout << "Exit code: 0 on success, 1 on error." << std::endl;
+}
+// Maps a command line format name to the typeid name of the encoding type used for it (LE/BE variants share one type).
+std::unordered_map<std::string, std::string> typeid_name_map
+{
+ { "UTF-8", typeid(unicode::UTF_8).name() },
+ { "UTF-16", typeid(unicode::UTF_16).name() },
+ { "UTF-16LE", typeid(unicode::UTF_16).name() },
+ { "UTF-16BE", typeid(unicode::UTF_16).name() },
+ { "UTF-32", typeid(unicode::UTF_32).name() },
+ { "UTF-32LE", typeid(unicode::UTF_32).name() },
+ { "UTF-32BE", typeid(unicode::UTF_32).name() },
+ { "ISO-8859-1", typeid(unicode::ISO_8859_1).name() },
+ { "ISO-8859-15", typeid(unicode::ISO_8859_15).name() },
+};
+// Build the convert_map key "<from>,<to>" from two typeid names.
+std::string get_id(const std::string& from, const std::string& to)
+{
+ return from + "," + to;
+}
+// Build the convert_map key for the encoding types From and To.
+template<typename From, typename To>
+std::string get_id()
+{
+ return get_id(std::string{typeid(From).name()}, typeid(To).name());
+}
+// Byte-swap every element of s in place.
+template<typename T>
+void reverse_endian(std::basic_string<T>& s)
+{
+ std::for_each(s.begin(), s.end(), [](T& c){boost::endian::endian_reverse_inplace(c);});
+}
+// Conversion functions (raw bytes in, raw bytes out), keyed by get_id(); filled by register_convert().
+std::unordered_map<std::string, std::function<std::string(const std::string&, bool, bool)>> convert_map {};
+// Store the From -> To conversion in convert_map; the two bool arguments request a byte swap of input / output code units.
+template<typename From, typename To>
+void register_convert()
+{
+ std::string id{ get_id<From, To>() };
+
+ std::function<std::string(const std::string&, bool, bool)> f([](const std::string& s, bool swap_from_endian, bool swap_to_endian) -> std::string
+  {
+   if (s.size() % sizeof(typename From::value_type) != 0)
+    throw std::invalid_argument("Bad number of input bytes. Need multiple of "s + std::to_string(sizeof(typename From::value_type)) + ", got " + std::to_string(s.size()));
+   // s.size() counts bytes: allocate one element per code unit, otherwise a zero-filled tail would be converted as well
+   std::basic_string<typename From::value_type> from_data(s.size() / sizeof(typename From::value_type), static_cast<typename From::value_type>(0));
+
+   std::memcpy(from_data.data(), s.data(), s.size());
+
+   if (swap_from_endian) {
+    reverse_endian(from_data);
+   }
+
+   std::basic_string<typename To::value_type> to_data {unicode::convert<From, To>(from_data)};
+
+   if (swap_to_endian) {
+    reverse_endian(to_data);
+   }
+
+   std::string result(to_data.size() * sizeof(typename To::value_type), '\0');
+
+   std::memcpy(result.data(), to_data.data(), to_data.size() * sizeof(typename To::value_type));
+
+   return result;
+  });
+
+ convert_map[id] = f;
+}
+// N-th type of the parameter pack Ts.
+template<int N, typename... Ts> using NthTypeOf =
+ typename std::tuple_element<N, std::tuple<Ts...>>::type;
+// Register the conversion Ts[i] -> Ts[j], then recurse: first over all i, then over the next j.
+template<size_t i, size_t j, typename ...Ts>
+void iterate_over()
+{
+ register_convert<NthTypeOf<i,Ts...>, NthTypeOf<j,Ts...>>();
+
+ if constexpr (i + 1 < sizeof...(Ts)) {
+  iterate_over<i + 1, j, Ts...>();
+ } else if constexpr (j + 1 < sizeof...(Ts)) {
+  iterate_over<0, j + 1, Ts...>();
+ }
+}
+// Register conversions between every ordered pair of the given encoding types (including identical ones).
+template<typename...Ts>
+void build_map()
+{
+ iterate_over<0, 0, Ts...>();
+}
+
+}
+// Recode argv[2] (format argv[1]) into argv[4] (format argv[3]); returns 0 on success, 1 on any error.
 int main(int argc, char* argv[])
 {
+ if (argc != 5) {
+  usage();
+  return 1;
+ }
+
+ try {
+  build_map<unicode::UTF_8, unicode::UTF_16, unicode::UTF_32, unicode::ISO_8859_1, unicode::ISO_8859_15>();
+
+  std::string from_format {argv[1]};
+  fs::path from_path {argv[2]};
+  std::string to_format {argv[3]};
+  fs::path to_path {argv[4]};
+
+  std::string data{unicode::File::getFile(from_path)};
+
+  auto it_from{typeid_name_map.find(from_format)};
+  if (it_from == typeid_name_map.end())
+   throw std::invalid_argument("Bad input format: "s + from_format);
+
+  auto it_to{typeid_name_map.find(to_format)};
+  if (it_to == typeid_name_map.end())
+   throw std::invalid_argument("Bad output format: "s + to_format);
+
+  std::string id{get_id(it_from->second, it_to->second)};
+
+  // The ID is built from typeid names, so e.g. UTF-16, UTF-16LE and UTF-16BE select the same conversion;
+  // the requested byte order is applied separately via the swap flags computed below.
+  auto it { convert_map.find(id) };
+  if (it == convert_map.end()) {
+   std::cerr << "Error: Conversion ID " << id << " not supported." << std::endl;
+   return 1;
+  }
+
+  bool swap_from_endian{(boost::algorithm::ends_with(from_format, "LE") && boost::endian::order::native != boost::endian::order::little) ||
+                        (boost::algorithm::ends_with(from_format, "BE") && boost::endian::order::native != boost::endian::order::big)};
+  bool swap_to_endian{(boost::algorithm::ends_with(to_format, "LE") && boost::endian::order::native != boost::endian::order::little) ||
+                      (boost::algorithm::ends_with(to_format, "BE") && boost::endian::order::native != boost::endian::order::big)};
+
+  // actual conversion
+  std::string to_data{it->second(data, swap_from_endian, swap_to_endian)};
+
+  unicode::File::setFile(to_path, to_data);
+
+ } catch (const std::exception& ex) {
+  std::cerr << "Error: " << ex.what() << std::endl;
+  return 1;
+ }
  return 0;
 }
+
diff --git a/src/validate.cpp b/src/validate.cpp
index 8927fe4..78e6175 100644
--- a/src/validate.cpp
+++ b/src/validate.cpp
@@ -1,4 +1,155 @@
+#include "file.h"
+
+#include "unicode.h"
+
+#include <boost/endian.hpp>
+
+#include <filesystem>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <unordered_map>
+
+namespace fs = std::filesystem;
+
+namespace {
+// Print command line help to stdout.
+void usage()
+{
+ std::cout << "Usage: validate <format> <file>" << std::endl;
+ std::cout << "Format:" << std::endl;
+ std::cout << "    UTF-8     UTF-8" << std::endl;
+ std::cout << "    UTF-16    UTF-16, big or little endian" << std::endl;
+ std::cout << "    UTF-16LE  UTF-16, little endian" << std::endl;
+ std::cout << "    UTF-16BE  UTF-16, big endian" << std::endl;
+ std::cout << "    UTF-32    UTF-32, big or little endian" << std::endl;
+ std::cout << "    UTF-32LE  UTF-32, little endian" << std::endl;
+ std::cout << "    UTF-32BE  UTF-32, big endian" << std::endl;
+ std::cout << "Exit code: 0 if valid, 1 otherwise." << std::endl;
+}
+// Validators keyed by format name: each takes the raw file bytes; "UTF-16"/"UTF-32" without suffix accept either byte order.
+std::unordered_map<std::string, std::function<bool(const std::string&)>> validate_map
+{
+ { "UTF-8", [](const std::string& s) -> bool { return unicode::is_valid_utf(s); }},
+ { "UTF-16", [](const std::string& s) -> bool
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (unicode::is_valid_utf(data))
+    return true;
+
+   // maybe reverse endianness
+   std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::endian_reverse_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-16LE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::little)
+    std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::little_to_native_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-16BE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 1) // need even number of bytes
+    return false;
+
+   std::u16string data(s.size() / 2, u'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::big)
+    std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::big_to_native_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32", [](const std::string& s) -> bool
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (unicode::is_valid_utf(data))
+    return true;
+
+   // maybe reverse endianness
+   std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::endian_reverse_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32LE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::little)
+    std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::little_to_native_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+ { "UTF-32BE", [](const std::string& s) -> bool
+  {
+   if (s.size() & 3) // need multiple of 4 bytes
+    return false;
+
+   std::u32string data(s.size() / 4, U'\0');
+   std::memcpy(data.data(), s.data(), s.size());
+
+   if (boost::endian::order::native != boost::endian::order::big)
+    std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::big_to_native_inplace(c);});
+
+   return unicode::is_valid_utf(data);
+  }
+ },
+};
+
+}
+// Validate argv[2] against the format argv[1]; returns 0 if valid, 1 if invalid or on any error.
 int main(int argc, char* argv[])
 {
+ if (argc != 3) {
+  usage();
+  return 1;
+ }
+
+ try {
+  std::string format {argv[1]};
+  fs::path path {argv[2]};
+
+  std::string data{unicode::File::getFile(path)};
+
+  auto it { validate_map.find(format) };
+  if (it == validate_map.end()) {
+   std::cerr << "Error: Encoding " << format << " not supported." << std::endl;
+   return 1;
+  }
+
+  return it->second(data) ? 0 : 1;
+
+ } catch (const std::exception& ex) {
+  std::cerr << "Error: " << ex.what() << std::endl;
+  return 1;
+ }
  return 0;
 }
+
