diff options
author | Roland Reichwein <mail@reichwein.it> | 2021-02-01 16:45:18 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2021-02-01 16:45:18 +0100 |
commit | f34a0aa3a2d46d349a41c0b28939176791c2efbe (patch) | |
tree | 663e5d5fd02cbb9b8f44cc502083f85b5b4d5c17 /src/validate.cpp | |
parent | 611601ec36a5603bc9c94cdac9a307c4bb07c929 (diff) |
Implemented recode and validate tools
Diffstat (limited to 'src/validate.cpp')
-rw-r--r-- | src/validate.cpp | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/src/validate.cpp b/src/validate.cpp index 8927fe4..78e6175 100644 --- a/src/validate.cpp +++ b/src/validate.cpp @@ -1,4 +1,155 @@ +#include "file.h" + +#include "unicode.h" + +#include <boost/endian.hpp> + +#include <filesystem> +#include <functional> +#include <iostream> +#include <string> +#include <unordered_map> + +namespace fs = std::filesystem; + +namespace { + +void usage() +{ + std::cout << "Usage: validate <format> <file>" << std::endl; + std::cout << "Format:" << std::endl; + std::cout << " UTF-8 UTF-8" << std::endl; + std::cout << " UTF-16 UTF-16, big or little endian" << std::endl; + std::cout << " UTF-16LE UTF-16, little endian" << std::endl; + std::cout << " UTF-16BE UTF-16, big endian" << std::endl; + std::cout << " UTF-32 UTF-32, big or little endian" << std::endl; + std::cout << " UTF-32LE UTF-32, little endian" << std::endl; + std::cout << " UTF-32BE UTF-32, big endian" << std::endl; + std::cout << "Exit code: 0 if valid, 1 otherwise." << std::endl; +} + +std::unordered_map<std::string, std::function<bool(const std::string&)>> validate_map +{ + { "UTF-8", [](const std::string& s) -> bool { return unicode::is_valid_utf(s); }}, + { "UTF-16", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (unicode::is_valid_utf(data)) + return true; + + // maybe reverse endianess + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::endian_reverse_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-16LE", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::little) + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_little_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-16BE", [](const std::string& s) -> bool + { + if (s.size() & 1) // need even number of bytes + return false; + + std::u16string data(s.size() / 2, u'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::big) + std::for_each(data.begin(), data.end(), [](char16_t& c){boost::endian::native_to_big_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32", [](const std::string& s) -> bool + { + if (s.size() & 3) // need even number of bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (unicode::is_valid_utf(data)) + return true; + + // maybe reverse endianess + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::endian_reverse_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32LE", [](const std::string& s) -> bool + { + if (s.size() & 3) // need multiple of 4 bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::little) + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_little_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, + { "UTF-32BE", [](const std::string& s) -> bool + { + if (s.size() & 3) // need multiple of 4 bytes + return false; + + std::u32string data(s.size() / 4, U'\0'); + std::memcpy(data.data(), s.data(), s.size()); + + if (boost::endian::order::native != boost::endian::order::big) + std::for_each(data.begin(), data.end(), [](char32_t& c){boost::endian::native_to_big_inplace(c);}); + + return unicode::is_valid_utf(data); + } + }, +}; + +} + int main(int argc, char* argv[]) { + if (argc != 3) { + usage(); + return 1; + } + + try { + std::string format {argv[1]}; + fs::path path {argv[2]}; + + std::string data{unicode::File::getFile(path)}; + + auto it { validate_map.find(format) }; + if (it == validate_map.end()) { + std::cerr << "Error: Encoding " << format << " not supported." << std::endl; + return 1; + } + + return it->second(data) ? 0 : 1; + + } catch (const std::exception& ex) { + std::cerr << "Error: " << ex.what() << std::endl; + return 1; + } return 0; } + |