#include "parse.h"

#include "asm/assembler.h"

// NOTE(review): the targets of the angle-bracket includes were lost when this
// file was extracted; the list below is rebuilt from the names this
// translation unit visibly uses. Confirm against the original include set.
#include <boost/algorithm/string.hpp>

#include <any>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>

using namespace std::string_literals;

namespace {

// Register names accepted by the operand parsers, grouped by width.
// Lookups are done on lower-cased input, so the entries are lower case.
// NOTE(review): the element type was lost in extraction; std::string is
// assumed because the sets are queried with std::string keys.
std::unordered_set<std::string> reg8 {
  "al", "ah", "bl", "bh", "cl", "ch", "dl", "dh",
};

// No parser in the visible part of this file consults reg16.
std::unordered_set<std::string> reg16 {
  "ax", "sp", "bx", "bp", "cx", "si", "dx", "di",
};

std::unordered_set<std::string> reg32 {
  "eax", "esp", "ebx", "ebp", "ecx", "esi", "edx", "edi",
};

std::unordered_set<std::string> reg64 {
  "rax", "rsp", "rbx", "rbp", "rcx", "rsi", "rdx", "rdi",
};

// Runs `re` against asm_code starting exactly at `pos` (the match may not
// begin later). Returns false without touching the string when `pos` lies
// beyond the end, so the iterator arithmetic below is always valid.
bool matchAt(const std::string& asm_code, size_t pos, const std::regex& re, std::smatch& match)
{
  if (pos > asm_code.size())
    return false;

  return std::regex_search(asm_code.cbegin() + static_cast<std::string::difference_type>(pos),
                           asm_code.cend(), match, re,
                           std::regex_constants::match_continuous);
}

// skip optional whitespace (blanks and tabs only; line ends are left for parseEol)
void parseWhitespace(const std::string& asm_code, size_t& pos)
{
  static const std::regex re_whitespace("[ \\t]+", std::regex_constants::ECMAScript);

  std::smatch match;
  if (matchAt(asm_code, pos, re_whitespace, match))
    pos += match[0].length();
}

// Pattern for identifiers: labels, mnemonics and register names.
std::string reg_re{"[[:alpha:]][[:alnum:]]*"};

// parse optional label ("identifier:")
// On success stores the identifier (without the colon) in `result`, advances
// `pos` past the colon and returns true. Leading whitespace is consumed
// either way.
bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result)
{
  // Compiled once; building a std::regex on every call is expensive.
  static const std::regex re_label("("s + reg_re + "):"s, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_label, match))
    return false;

  pos += match[0].length();
  result = match[1].str();
  return true;
}

// parse optional mnemonic
// return true iff mnemonic found; `result` receives it in lower case
bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result)
{
  static const std::regex re_mnemonic(reg_re, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_mnemonic, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[0].str())};
  pos += name.size();
  result = name;
  return true;
}

// Parse an 8 bit register name (case-insensitive).
// On success stores an Asm::Args::Register8 in `result`, sets size_hint to 8
// and advances `pos` past the name. `pos` only skips whitespace otherwise.
bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name(reg_re, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[0].str())};
  if (!reg8.contains(name))
    return false;

  pos += name.size();
  result = Asm::Args::Register8(name);
  size_hint = 8;
  return true;
}

// Parse a 32 bit register name (case-insensitive).
// On success stores an Asm::Args::Register32 in `result`, sets size_hint to 32
// and advances `pos` past the name.
bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name(reg_re, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[0].str())};
  if (!reg32.contains(name))
    return false;

  pos += name.size();
  result = Asm::Args::Register32(name);
  size_hint = 32;
  return true;
}

// Parse a 64 bit register name (case-insensitive).
// On success stores an Asm::Args::Register64 in `result`, sets size_hint to 64
// and advances `pos` past the name.
bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name(reg_re, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[0].str())};
  if (!reg64.contains(name))
    return false;

  pos += name.size();
  result = Asm::Args::Register64(name);
  size_hint = 64;
  return true;
}

// Parse "byte ptr [reg64]".
// The "byte ptr" keyword is matched case-sensitively; the register name is not.
// On success stores an Asm::Args::Mem8Ptr64 and sets size_hint to 8.
bool parseMem8Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name("byte ptr *\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[1].str())};
  if (!reg64.contains(name))
    return false;

  pos += match[0].length();
  result = Asm::Args::Mem8Ptr64{name};
  size_hint = 8;
  return true;
}

// Parse "[reg64]" with an optional "dword ptr" prefix, i.e. a bare memory
// reference is treated as a 32 bit access.
// On success stores an Asm::Args::Mem32Ptr64 and sets size_hint to 32.
bool parseMem32Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name("(dword ptr *)?\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  // Group 1 is the optional prefix, group 2 the register name.
  std::string name {boost::algorithm::to_lower_copy(match[2].str())};
  if (!reg64.contains(name))
    return false;

  pos += match[0].length();
  result = Asm::Args::Mem32Ptr64(name);
  size_hint = 32;
  return true;
}

// Parse "qword ptr [reg64]".
// On success stores an Asm::Args::Mem64Ptr64 and sets size_hint to 64.
bool parseMem64Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  static const std::regex re_name("qword ptr *\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, re_name, match))
    return false;

  std::string name {boost::algorithm::to_lower_copy(match[1].str())};
  if (!reg64.contains(name))
    return false;

  pos += match[0].length();
  result = Asm::Args::Mem64Ptr64(name);
  size_hint = 64;
  return true;
}

// Pattern shared by the immediate parsers.
// The hexadecimal alternative must come first: ECMAScript alternation takes
// the first branch that matches, so with the decimal branch in front the
// input "0x1F" would match only its leading "0".
const std::regex& immediateRegex()
{
  static const std::regex re_immediate("0[xX][[:xdigit:]]+|[[:digit:]]+", std::regex_constants::ECMAScript);
  return re_immediate;
}

// Converts text matched by immediateRegex() to its value.
// Throws std::out_of_range (from std::stoull) if it does not fit in 64 bits.
std::uint64_t immediateValue(const std::string& text)
{
  const bool is_hex { text.size() > 2 && text[0] == '0' && (text[1] == 'x' || text[1] == 'X') };
  // The base is chosen explicitly so that a decimal literal with leading
  // zeros is not interpreted as octal.
  return std::stoull(text, nullptr, is_hex ? 16 : 10);
}

// Parse an 8 bit immediate (decimal or 0x-prefixed hexadecimal).
// Only attempted when a previous operand set size_hint to 8.
// Throws std::runtime_error if the literal does not fit in 8 bits.
bool parseImmediate8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  if (size_hint != 8)
    return false;

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, immediateRegex(), match))
    return false;

  const std::string text {match[0].str()};

  // Kept at full width until after the range check, so oversized literals
  // are rejected instead of being silently truncated.
  std::uint64_t value{};
  try {
    value = immediateValue(text);
  } catch (...) {
    throw std::runtime_error("Assembler parse error: Bad immediate: "s + text);
  }

  if (value > 255)
    throw std::runtime_error("Assembler parse error: Bad 8 bit immediate: "s + text);

  pos += text.size();
  // NOTE(review): the cast's target type was lost in extraction; uint8_t is
  // assumed from the accepted range. Confirm against Asm::Args::Immediate8.
  result = Asm::Args::Immediate8(static_cast<std::uint8_t>(value));
  return true;
}

// Parse a 32 bit immediate (decimal or 0x-prefixed hexadecimal).
// Attempted when size_hint is 32 or when there is no hint (0).
// Throws std::runtime_error if the literal does not fit in 32 bits.
bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  if (size_hint != 32 && size_hint != 0)
    return false;

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, immediateRegex(), match))
    return false;

  const std::string text {match[0].str()};

  std::uint64_t value{};
  try {
    value = immediateValue(text);
  } catch (...) {
    throw std::runtime_error("Assembler parse error: Bad Immediate: "s + text);
  }

  if (value > 0xFFFFFFFFull)
    throw std::runtime_error("Assembler parse error: Bad 32 bit immediate: "s + text);

  pos += text.size();
  // NOTE(review): cast target assumed to be uint32_t - confirm against
  // Asm::Args::Immediate32.
  result = Asm::Args::Immediate32(static_cast<std::uint32_t>(value));
  return true;
}

// Parse a 64 bit immediate (decimal or 0x-prefixed hexadecimal).
// Only attempted when a previous operand set size_hint to 64.
// Throws std::runtime_error if the literal does not fit in 64 bits.
bool parseImmediate64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  if (size_hint != 64)
    return false;

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (!matchAt(asm_code, pos, immediateRegex(), match))
    return false;

  const std::string text {match[0].str()};

  std::uint64_t value{};
  try {
    value = immediateValue(text);
  } catch (...) {
    throw std::runtime_error("Assembler parse error: Bad immediate: "s + text);
  }

  pos += text.size();
  // NOTE(review): cast target assumed to be uint64_t - confirm against
  // Asm::Args::Immediate64.
  result = Asm::Args::Immediate64(static_cast<std::uint64_t>(value));
  return true;
}

// parse optional single operand
// The alternatives are tried in a fixed order and the first one that matches
// wins. The immediates come last because they depend on the size_hint left
// behind by an earlier register or memory operand.
bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint)
{
  using OperandParser = bool (*)(const std::string&, size_t&, std::any&, size_t&);
  static constexpr OperandParser parsers[] {
    parseRegister8,
    parseRegister32,
    parseRegister64,
    parseMem8Ptr64,
    parseMem32Ptr64,
    parseMem64Ptr64,
    parseImmediate8,
    parseImmediate32,
    parseImmediate64,
  };

  parseWhitespace(asm_code, pos);

  for (const OperandParser parser : parsers) {
    if (parser(asm_code, pos, result, size_hint))
      return true;
  }
  return false;
}

// parse optional multiple operands, separated by commas
// Appends each operand to `result`. Throws std::runtime_error when a comma is
// not followed by an operand.
void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result)
{
  std::any operand;
  size_t size_hint{0}; // in bits, 0=no hint

  if (!parseOperand(asm_code, pos, operand, size_hint))
    return;

  result.push_back(operand);
  parseWhitespace(asm_code, pos);

  while (pos < asm_code.size() && asm_code[pos] == ',') {
    ++pos;
    if (!parseOperand(asm_code, pos, operand, size_hint))
      throw std::runtime_error("Assembler error: expected operand after comma");

    result.push_back(operand);
    parseWhitespace(asm_code, pos);
  }
}

// parse optional comment (introduced by ';' or "//", running to the end of the line)
void parseComment(const std::string& asm_code, size_t& pos)
{
  static const std::regex re_comment("(;|//).*", std::regex_constants::ECMAScript);

  parseWhitespace(asm_code, pos);

  std::smatch match;
  if (matchAt(asm_code, pos, re_comment, match))
    pos += match[0].length();
}

// parse end of line (or whole code)
// Returns true at the end of the input or after consuming a run of CR/LF
// characters; returns false if anything else follows the optional whitespace.
bool parseEol(const std::string& asm_code, size_t& pos)
{
  parseWhitespace(asm_code, pos);

  if (pos < asm_code.size() && asm_code[pos] != 0x0a && asm_code[pos] != 0x0d)
    return false; // this is the only case where parseEol() doesn't work

  // Swallow the whole run of line-end characters, which also skips blank lines.
  while (pos < asm_code.size() && (asm_code[pos] == 0x0a || asm_code[pos] == 0x0d))
    ++pos;

  return true;
}

// parse single line
// NOTE(review): this definition continues beyond the reviewed excerpt; its
// tokens are reproduced exactly as found (including template arguments that
// were lost in extraction) and only re-wrapped.
void parseLine(const std::string& asm_code, size_t& pos, std::vector>& result)
{
  // all optional:
  // label: mnemonic operands... ;comment
  std::string label;
  std::function label_fn {[](){}};
  if (parseLabel(asm_code, pos, label))
    label_fn = [&]() { result.emplace_back(std::make_shared