#include "file.h"
#include "unicode.h"

#include <boost/endian/conversion.hpp>

#include <cstring>
#include <exception>
#include <filesystem>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

namespace fs = std::filesystem;

namespace {

/// Print the command-line synopsis, the supported format names and the
/// exit-code convention to stdout.
void usage()
{
    std::cout << "Usage: validate <format> <file>" << '\n'
              << "Format:" << '\n'
              << " UTF-8 UTF-8" << '\n'
              << " UTF-16 UTF-16, big or little endian" << '\n'
              << " UTF-16LE UTF-16, little endian" << '\n'
              << " UTF-16BE UTF-16, big endian" << '\n'
              << " UTF-32 UTF-32, big or little endian" << '\n'
              << " UTF-32LE UTF-32, little endian" << '\n'
              << " UTF-32BE UTF-32, big endian" << '\n'
              << "Exit code: 0 if valid, 1 otherwise." << '\n';
}

/// Copy a raw byte buffer into a string of fixed-width code units, keeping the
/// bytes of each unit exactly as they appear in the buffer.
///
/// @tparam CharT code unit type (char16_t or char32_t).
/// @param bytes  raw input; the caller must have checked that its size is a
///               multiple of sizeof(CharT).
/// @return the code units, not yet byte-order corrected.
template <typename CharT>
std::basic_string<CharT> to_code_units(const std::string& bytes)
{
    std::basic_string<CharT> units(bytes.size() / sizeof(CharT), CharT{});
    // memcpy rather than a pointer cast: no alignment or aliasing assumptions.
    std::memcpy(units.data(), bytes.data(), bytes.size());
    return units;
}

/// Reverse the byte order of every code unit in place.
template <typename CharT>
void reverse_byte_order(std::basic_string<CharT>& units)
{
    for (CharT& unit : units)
        boost::endian::endian_reverse_inplace(unit);
}

/// Validate a byte buffer as UTF-16/UTF-32 whose byte order is unknown.
///
/// The buffer is first checked as-is (native byte order); if that fails, all
/// code units are byte-swapped and the check is repeated.
///
/// @tparam CharT code unit type (char16_t or char32_t).
/// @param bytes  raw file contents.
/// @return true if the buffer is valid in either byte order; false otherwise,
///         including when the size is not a multiple of sizeof(CharT).
template <typename CharT>
bool is_valid_any_order(const std::string& bytes)
{
    if (bytes.size() % sizeof(CharT) != 0) // need whole code units
        return false;

    auto units = to_code_units<CharT>(bytes);
    if (unicode::is_valid_utf(units))
        return true;

    // Not valid as read: retry with the opposite endianness.
    reverse_byte_order(units);
    return unicode::is_valid_utf(units);
}

/// Validate a byte buffer as UTF-16/UTF-32 stored in a specific byte order.
///
/// @tparam CharT     code unit type (char16_t or char32_t).
/// @param bytes      raw file contents.
/// @param file_order byte order the buffer is declared to use.
/// @return true if the buffer is valid in that byte order; false otherwise,
///         including when the size is not a multiple of sizeof(CharT).
template <typename CharT>
bool is_valid_in_order(const std::string& bytes, boost::endian::order file_order)
{
    if (bytes.size() % sizeof(CharT) != 0) // need whole code units
        return false;

    auto units = to_code_units<CharT>(bytes);
    // The validator is handed native-order code units, so swap only when the
    // declared order differs from the host's.
    if (boost::endian::order::native != file_order)
        reverse_byte_order(units);
    return unicode::is_valid_utf(units);
}

/// Maps each format name accepted on the command line to its validator.
/// Every validator takes the raw file contents and returns true if valid.
const std::unordered_map<std::string, std::function<bool(const std::string&)>> validate_map
{
    { "UTF-8", [](const std::string& s) -> bool {
        return unicode::is_valid_utf(s);
    }},
    { "UTF-16", [](const std::string& s) -> bool {
        return is_valid_any_order<char16_t>(s);
    }},
    { "UTF-16LE", [](const std::string& s) -> bool {
        return is_valid_in_order<char16_t>(s, boost::endian::order::little);
    }},
    { "UTF-16BE", [](const std::string& s) -> bool {
        return is_valid_in_order<char16_t>(s, boost::endian::order::big);
    }},
    { "UTF-32", [](const std::string& s) -> bool {
        return is_valid_any_order<char32_t>(s);
    }},
    { "UTF-32LE", [](const std::string& s) -> bool {
        return is_valid_in_order<char32_t>(s, boost::endian::order::little);
    }},
    { "UTF-32BE", [](const std::string& s) -> bool {
        return is_valid_in_order<char32_t>(s, boost::endian::order::big);
    }},
};

} // namespace

/// Entry point: validate <format> <file>.
///
/// @return 0 if the file is valid in the requested format; 1 if it is invalid,
///         the arguments are wrong, the format is unknown, or a
///         std::exception is thrown while processing.
int main(int argc, char* argv[])
{
    if (argc != 3) {
        usage();
        return 1;
    }

    try {
        // Resolve the format before touching the file, so an unsupported
        // format is reported without doing any file I/O.
        const std::string format{argv[1]};
        const auto it = validate_map.find(format);
        if (it == validate_map.end()) {
            std::cerr << "Error: Encoding " << format << " not supported." << std::endl;
            return 1;
        }

        const fs::path path{argv[2]};
        // NOTE(review): getFile presumably returns the file's raw bytes and
        // throws on I/O failure - confirm against file.h.
        const std::string data{unicode::File::getFile(path)};

        return it->second(data) ? 0 : 1;
    } catch (const std::exception& ex) {
        std::cerr << "Error: " << ex.what() << std::endl;
        return 1;
    }
}