From f34a0aa3a2d46d349a41c0b28939176791c2efbe Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Mon, 1 Feb 2021 16:45:18 +0100 Subject: Implemented recode and validate tools --- src/validate.cpp | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) (limited to 'src/validate.cpp') diff --git a/src/validate.cpp b/src/validate.cpp index 8927fe4..78e6175 100644 --- a/src/validate.cpp +++ b/src/validate.cpp @@ -1,4 +1,155 @@ +#include "file.h" + +#include "unicode.h" + +#include + +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +namespace { + +void usage() +{ + std::cout << "Usage: validate " << std::endl; + std::cout << "Format:" << std::endl; + std::cout << " UTF-8 UTF-8" << std::endl; + std::cout << " UTF-16 UTF-16, big or little endian" << std::endl; + std::cout << " UTF-16LE UTF-16, little endian" << std::endl; + std::cout << " UTF-16BE UTF-16, big endian" << std::endl; + std::cout << " UTF-32 UTF-32, big or little endian" << std::endl; + std::cout << " UTF-32LE UTF-32, little endian" << std::endl; + std::cout << " UTF-32BE UTF-32, big endian" << std::endl; + std::cout << "Exit code: 0 if valid, 1 otherwise." << std::endl; +} + +std::unordered_map> validate_map +{ + { "UTF-8", [](const std::string& s) -> bool { return unicode::is_valid_utf(s); }}, + { "UTF-16", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (unicode::is_valid_utf(data)) + return true; + + // maybe reverse endianess + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::endian_reverse_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-16LE", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::little) + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_little_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-16BE", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::big) + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_big_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32", [](const std::string& s) -> bool + { + if (s.size() & 3) // need even number of bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (unicode::is_valid_utf(data)) + return true; + + // maybe reverse endianess + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::endian_reverse_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32LE", [](const std::string& s) -> bool + { + if (s.size() & 3) // need multiple of 4 bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::little) + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_little_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32BE", [](const std::string& s) -> bool + { + if (s.size() & 3) // need multiple of 4 bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::big) + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_big_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, +}; + +} + int main(int argc, char* argv[]) { + if (argc != 3) { + usage(); + return 1; + } + + try { + std::string format {argv[1]}; + fs::path path {argv[2]}; + + std::string data{unicode::File::getFile(path)}; + + auto it { validate_map.find(format) }; + if (it == validate_map.end()) { + std::cerr << "Error: Encoding " << format << " not supported." << std::endl; + return 1; + } + + return it->second(data) ? 0 : 1; + + } catch (const std::exception& ex) { + std::cerr << "Error: " << ex.what() << std::endl; + return 1; + } return 0; } + -- cgit v1.2.3