From 5c0611b998e039c8547cfa3841da3567e13446a8 Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Thu, 19 Nov 2020 22:31:33 +0100 Subject: Add assembler parser (WIP) --- asm/parse.cpp | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 249 insertions(+), 2 deletions(-) (limited to 'asm/parse.cpp') diff --git a/asm/parse.cpp b/asm/parse.cpp index 350d86e..3b6e6be 100644 --- a/asm/parse.cpp +++ b/asm/parse.cpp @@ -2,7 +2,254 @@ #include "asm/assembler.h" -std::shared_ptr parseAsm(const std::string& line) +#include + +#include +#include +#include + +using namespace std::string_literals; + +namespace { + + std::unordered_set reg8 { + "al", "ah", + "bl", "bh", + "cl", "ch", + "dl", "dh", + }; + + std::unordered_set reg32 { + "eax", "esp", + "ebx", "ebp", + "ecx", "esi", + "edx", "edi", + }; + + std::unordered_set reg64 { + "rax", "rsp", + "rbx", "rbp", + "rcx", "rsi", + "rdx", "rdi", + }; + + // skip optional whitespace + void parseWhitespace(const std::string& asm_code, size_t& pos) { + std::regex re_whitespace("( \\t)+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_whitespace, std::regex_constants::match_continuous)) { + pos += match[0].length(); + } + } + + // parse optional label + bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) { + parseWhitespace(asm_code, pos); + + std::regex re_label("([[:alpha:]]([[:alnum:]])+):", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_label, std::regex_constants::match_continuous)) { + pos += match[0].length(); + result = match[1]; + return true; + } + + return false; + } + + // parse optional mnemonic + // return true iff mnemonic found + bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) { + parseWhitespace(asm_code, pos); + + std::regex re_mnemonic("[[:alpha:]]([[:alnum:]])+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_mnemonic, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + pos += name.size(); + result = name; + return true; + } + + return false; + } + + bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result) { + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg8.contains(name)) { + pos += name.size(); + result = Asm::Args::Register8(name); + return true; + } + } + + return false; + } + + bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result) { + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg32.contains(name)) { + pos += name.size(); + result = Asm::Args::Register32(name); + return true; + } + } + + return false; + } + + bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result) { + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg64.contains(name)) { + pos += name.size(); + result = Asm::Args::Register64(name); + return true; + } + } + + return false; + } + + bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result) { + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:digit:]]+|0x[[:xdigit:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + int32_t value{}; + try { + value = stoll(match[0]); + } catch (...) { + throw std::runtime_error("Assembler parse error: Bad Immediate: "s + match[0].str()); + } + pos += match[0].length(); + result = Asm::Args::Immediate32(static_cast(value)); + return true; + } + + return false; + } + + // parse optional single operand + bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result) { + parseWhitespace(asm_code, pos); + + if (parseRegister8(asm_code, pos, result)) + return true; + if (parseRegister32(asm_code, pos, result)) + return true; + if (parseRegister64(asm_code, pos, result)) + return true; + + if (parseImmediate32(asm_code, pos, result)) + return true; + + return false; + } + + // parse optional multiple operands, separated by commas + void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) { + std::any operand; + if (parseOperand(asm_code, pos, operand)) { + result.push_back(operand); + parseWhitespace(asm_code, pos); + while (pos < asm_code.size() && asm_code[pos] == ',') { + pos++; + if (parseOperand(asm_code, pos, operand)) { + result.push_back(operand); + } else { + throw std::runtime_error("Assembler error: expected operand after comma"); + } + parseWhitespace(asm_code, pos); + } + } + } + + // parse optional comment + void parseComment(const std::string& asm_code, size_t& pos) { + parseWhitespace(asm_code, pos); + + std::regex re_comment("(#|//).*", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_comment, std::regex_constants::match_continuous)) { + pos += match[0].length(); + } + } + + // parse end of line (or whole code) + bool parseEol(const std::string& asm_code, size_t& pos) { + parseWhitespace(asm_code, pos); + + if (pos < asm_code.size() && asm_code[pos] != 0x0a && asm_code[pos] != 0x0d) + return false; // this is the only case where parseEol() doesn't work + + while (pos < asm_code.size()) { + char c { asm_code[pos] }; + if (c == 0x0a || c == 0x0d) { + pos++; + } else { + break; + } + } + + return true; + } + + // parse single line + void parseLine(const std::string& asm_code, size_t& pos, std::vector>& result) { + // all optional: + // label: mnemonic operands... ;comment + + std::string result_string; + if (parseLabel(asm_code, pos, result_string)) + result.emplace_back(std::make_shared