diff options
author | Roland Reichwein <mail@reichwein.it> | 2024-08-31 18:29:58 +0200 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2024-08-31 18:29:58 +0200 |
commit | 02153da4d5954261f6649e2980bc88f6d29e45a6 (patch) | |
tree | 4cc1ce6d1124edf791e48fcdb28b33a614d1a358 /asm/arm/parse.cpp | |
parent | f8c4fe1614cc79df9f97c8a7754cf2a5aaf5063d (diff) |
ARM assembler (WIP)
Diffstat (limited to 'asm/arm/parse.cpp')
-rw-r--r-- | asm/arm/parse.cpp | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/asm/arm/parse.cpp b/asm/arm/parse.cpp new file mode 100644 index 0000000..a3156c2 --- /dev/null +++ b/asm/arm/parse.cpp @@ -0,0 +1,441 @@ +#include "parse.h" + +#include "asm/assembler.h" + +#include <boost/algorithm/string.hpp> + +#include <exception> +#include <functional> +#include <regex> +#include <unordered_set> + +using namespace std::string_literals; + +namespace { + + std::unordered_set<std::string> reg8 { + "al", "ah", + "bl", "bh", + "cl", "ch", + "dl", "dh", + }; + + std::unordered_set<std::string> reg16 { + "ax", "sp", + "bx", "bp", + "cx", "si", + "dx", "di", + }; + + std::unordered_set<std::string> reg32 { + "eax", "esp", + "ebx", "ebp", + "ecx", "esi", + "edx", "edi", + }; + + std::unordered_set<std::string> reg64 { + "rax", "rsp", + "rbx", "rbp", + "rcx", "rsi", + "rdx", "rdi", + }; + + // skip optional whitespace + void parseWhitespace(const std::string& asm_code, size_t& pos) { + std::regex re_whitespace("[ \\t]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_whitespace, std::regex_constants::match_continuous)) { + pos += match[0].length(); + } + } + + std::string reg_re{"[[:alpha:]][[:alnum:]]*"}; + + // parse optional label + bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) { + parseWhitespace(asm_code, pos); + + std::regex re_label("("s + reg_re + "):"s, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_label, std::regex_constants::match_continuous)) { + pos += match[0].length(); + result = match[1]; + return true; + } + + return false; + } + + // parse optional mnemonic + // return true iff mnemonic found + bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) { + parseWhitespace(asm_code, pos); + + std::regex re_mnemonic(reg_re, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_mnemonic, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + pos += name.size(); + result = name; + return true; + } + + return false; + } + + bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name(reg_re, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg8.contains(name)) { + pos += name.size(); + result = Asm::Args::Register8(name); + size_hint = 8; + return true; + } + } + + return false; + } + + bool parseRegister16(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name(reg_re, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg16.contains(name)) { + pos += name.size(); + result = Asm::Args::Register16(name); + size_hint = 16; + return true; + } + } + + return false; + } + + bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name(reg_re, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg32.contains(name)) { + pos += name.size(); + result = Asm::Args::Register32(name); + size_hint = 32; + return true; + } + } + + return false; + } + + bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name(reg_re, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[0].str())}; + if (reg64.contains(name)) { + pos += name.size(); + result = Asm::Args::Register64(name); + size_hint = 64; + return true; + } + } + + return false; + } + + bool parseMem8Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name("byte ptr *\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[1].str())}; + if (reg64.contains(name)) { + pos += match[0].length(); + result = Asm::Args::Mem8Ptr64{name}; + size_hint = 8; + return true; + } + } + + return false; + } + + bool parseMem16Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name("(word ptr *)?\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[2].str())}; + if (reg16.contains(name)) { + pos += match[0].length(); + result = Asm::Args::Mem16Ptr64(name); + size_hint = 16; + return true; + } + } + + return false; + } + + bool parseMem32Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name("(dword ptr *)?\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[2].str())}; + if (reg64.contains(name)) { + pos += match[0].length(); + result = Asm::Args::Mem32Ptr64(name); + size_hint = 32; + return true; + } + } + + return false; + } + + bool parseMem64Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + std::regex re_name("qword ptr *\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + std::string name {boost::algorithm::to_lower_copy(match[1].str())}; + if (reg64.contains(name)) { + pos += match[0].length(); + result = Asm::Args::Mem64Ptr64(name); + size_hint = 64; + return true; + } + } + + return false; + } + + bool parseImmediate8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + if (size_hint != 8) + return false; + + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:digit:]]+|0x[[:xdigit:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + int32_t value{}; + try { + value = stoll(match[0]); + } catch (...) { + throw std::runtime_error("Assembler parse error: Bad immediate: "s + match[0].str()); + } + if (value < -128 || value > 255) + throw std::runtime_error("Assembler parse error: Bad 8 bit immediate: "s + match[0].str()); + + pos += match[0].length(); + result = Asm::Args::Immediate8(static_cast<uint8_t>(value)); + return true; + } + + return false; + } + + bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + if (size_hint != 32 && size_hint != 0) + return false; + + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:digit:]]+|0x[[:xdigit:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + int32_t value{}; + try { + value = stoll(match[0]); + } catch (...) { + throw std::runtime_error("Assembler parse error: Bad Immediate: "s + match[0].str()); + } + pos += match[0].length(); + result = Asm::Args::Immediate32(static_cast<uint32_t>(value)); + return true; + } + + return false; + } + + bool parseImmediate64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + if (size_hint != 64) + return false; + + parseWhitespace(asm_code, pos); + + std::regex re_name("[[:digit:]]+|0x[[:xdigit:]]+", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) { + int64_t value{}; + try { + value = stoll(match[0]); + } catch (...) { + throw std::runtime_error("Assembler parse error: Bad immediate: "s + match[0].str()); + } + + pos += match[0].length(); + result = Asm::Args::Immediate64(static_cast<uint64_t>(value)); + return true; + } + + return false; + } + + // parse optional single operand + bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) { + parseWhitespace(asm_code, pos); + + if (parseRegister8(asm_code, pos, result, size_hint)) + return true; + if (parseRegister16(asm_code, pos, result, size_hint)) + return true; + if (parseRegister32(asm_code, pos, result, size_hint)) + return true; + if (parseRegister64(asm_code, pos, result, size_hint)) + return true; + + if (parseMem8Ptr64(asm_code, pos, result, size_hint)) + return true; + if (parseMem16Ptr64(asm_code, pos, result, size_hint)) + return true; + if (parseMem32Ptr64(asm_code, pos, result, size_hint)) + return true; + if (parseMem64Ptr64(asm_code, pos, result, size_hint)) + return true; + + if (parseImmediate8(asm_code, pos, result, size_hint)) + return true; + if (parseImmediate32(asm_code, pos, result, size_hint)) + return true; + if (parseImmediate64(asm_code, pos, result, size_hint)) + return true; + + return false; + } + + // parse optional multiple operands, separated by commas + void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) { + std::any operand; + size_t size_hint{0}; // in bits, 0=no hint + if (parseOperand(asm_code, pos, operand, size_hint)) { + result.push_back(operand); + parseWhitespace(asm_code, pos); + while (pos < asm_code.size() && asm_code[pos] == ',') { + pos++; + if (parseOperand(asm_code, pos, operand, size_hint)) { + result.push_back(operand); + } else { + throw std::runtime_error("Assembler error: expected operand after comma"); + } + parseWhitespace(asm_code, pos); + } + } + } + + // parse optional comment + void parseComment(const std::string& asm_code, size_t& pos) { + parseWhitespace(asm_code, pos); + + std::regex re_comment("(;|//).*", std::regex_constants::ECMAScript); + + std::smatch match; + if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_comment, std::regex_constants::match_continuous)) { + pos += match[0].length(); + } + } + + // parse end of line (or whole code) + bool parseEol(const std::string& asm_code, size_t& pos) { + parseWhitespace(asm_code, pos); + + if (pos < asm_code.size() && asm_code[pos] != 0x0a && asm_code[pos] != 0x0d) + return false; // this is the only case where parseEol() doesn't work + + while (pos < asm_code.size()) { + char c { asm_code[pos] }; + if (c == 0x0a || c == 0x0d) { + pos++; + } else { + break; + } + } + + return true; + } + + // parse single line + void parseLine(const std::string& asm_code, size_t& pos, std::vector<std::shared_ptr<Chunk>>& result) { + // all optional: + // label: mnemonic operands... ;comment <eol> + + std::string label; + std::function<void()> label_fn {[](){}}; + if (parseLabel(asm_code, pos, label)) + label_fn = [&]() { result.emplace_back(std::make_shared<Label>(label)); }; // defer to successfully completed line + + std::string mnemonic; + Asm::Args args; + std::function<void()> mnemonic_fn {[](){}}; + if (parseMnemonic(asm_code, pos, mnemonic)) { + parseOperands(asm_code, pos, args); + mnemonic_fn = [&]() { result.emplace_back(makeOp(mnemonic, args)); }; // defer to successfully completed line + } + + parseComment(asm_code, pos); + + if (!parseEol(asm_code, pos)) + throw std::runtime_error("Assembler error at pos "s + std::to_string(pos)); + + // Append only if no error occured, to get the correct error + label_fn(); + mnemonic_fn(); + } + +} // namespace + +std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code) +{ + std::vector<std::shared_ptr<Chunk>> result; + size_t pos{0}; + + while (pos != asm_code.size()) { + parseLine(asm_code, pos, result); + } + + return result; +} + |