diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-11-19 22:31:33 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-11-19 22:31:33 +0100 |
commit | 5c0611b998e039c8547cfa3841da3567e13446a8 (patch) | |
tree | 3f1dc0a8371996426f99d395ad3f0fa9be503ea5 | |
parent | 1937e301b6cd185c8ce907b9184142e82e76fda4 (diff) |
Add assembler parser (WIP)
-rw-r--r-- | asm/intel64/encode.cpp | 12 | ||||
-rw-r--r-- | asm/parse.cpp | 251 | ||||
-rw-r--r-- | asm/parse.h | 3 | ||||
-rw-r--r-- | tests/test-asm.cpp | 112 | ||||
-rw-r--r-- | tests/test-cpp.cpp | 2 |
5 files changed, 370 insertions, 10 deletions
diff --git a/asm/intel64/encode.cpp b/asm/intel64/encode.cpp
index 21b6629..51ca7a0 100644
--- a/asm/intel64/encode.cpp
+++ b/asm/intel64/encode.cpp
@@ -133,18 +133,18 @@ void Asm::toMachineCode(const FlowGraph::Graph& graph, Segment& segment)
 
     if (op.type() == FlowGraph::UnaryOperationType::BitwiseNot) {
      segment.push_back(makeLoadValue(operands[1], graph));
-     segment.push_back(parseAsm("not eax"));
+     segment.append(parseAsm("not eax"));
      segment.push_back(makeStoreValue(operands[0], graph));
     } else if (op.type() == FlowGraph::UnaryOperationType::LogicalNot) {
      segment.push_back(makeLoadValue(operands[1], graph));
-     segment.push_back(parseAsm("bsr eax")); // ZF=1 iff eax=0
-     segment.push_back(parseAsm("lahf")); // ZF in AH bit 6
-     segment.push_back(parseAsm("shr eax, 14")); // ZF in eax bit 0
-     segment.push_back(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag
+     segment.append(parseAsm("bsr eax")); // ZF=1 iff eax=0
+     segment.append(parseAsm("lahf")); // ZF in AH bit 6
+     segment.append(parseAsm("shr eax, 14")); // ZF in eax bit 0
+     segment.append(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag
      segment.push_back(makeStoreValue(operands[0], graph));
     } else if (op.type() == FlowGraph::UnaryOperationType::Minus) {
      segment.push_back(makeLoadValue(operands[1], graph));
-     segment.push_back(parseAsm("neg eax"));
+     segment.append(parseAsm("neg eax"));
      segment.push_back(makeStoreValue(operands[0], graph));
     } else
      throw std::runtime_error("ICE: Asm: Unsupported unary operation type: "s + std::to_string(static_cast<int>(op.type())));
diff --git a/asm/parse.cpp b/asm/parse.cpp
index 350d86e..3b6e6be 100644
--- a/asm/parse.cpp
+++ b/asm/parse.cpp
@@ -2,7 +2,254 @@
 
 #include "asm/assembler.h"
 
-std::shared_ptr<Chunk> parseAsm(const std::string& line)
+#include <boost/algorithm/string.hpp>
+
+#include <exception>
+#include <regex>
+#include <unordered_set>
+
+using namespace std::string_literals;
+
+namespace {
+
+ std::unordered_set<std::string> reg8 {
+  "al", "ah",
+  "bl", "bh",
+  "cl", "ch",
+  "dl", "dh",
+ };
+
+ std::unordered_set<std::string> reg32 {
+  "eax", "esp",
+  "ebx", "ebp",
+  "ecx", "esi",
+  "edx", "edi",
+ };
+
+ std::unordered_set<std::string> reg64 {
+  "rax", "rsp",
+  "rbx", "rbp",
+  "rcx", "rsi",
+  "rdx", "rdi",
+ };
+
+ // skip optional whitespace (spaces and tabs only; line breaks are left for parseEol)
+ void parseWhitespace(const std::string& asm_code, size_t& pos) {
+  static const std::regex re_whitespace("[ \\t]+", std::regex_constants::ECMAScript); // character class: space OR tab
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_whitespace, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+  }
+ }
+
+ // parse optional label ("name:"); returns true and stores the name (without ':') in result iff found
+ bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) {
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_label("([[:alpha:]][[:alnum:]]*):", std::regex_constants::ECMAScript); // '*' so one-letter labels are accepted
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_label, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+   result = match[1];
+   return true;
+  }
+
+  return false;
+ }
+
+ // parse optional mnemonic (letter followed by at least one alphanumeric); stored lower-cased in result
+ // return true iff mnemonic found
+ bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) {
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_mnemonic("[[:alpha:]]([[:alnum:]])+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_mnemonic, std::regex_constants::match_continuous)) {
+   std::string name {boost::algorithm::to_lower_copy(match[0].str())};
+   pos += name.size();
+   result = name;
+   return true;
+  }
+
+  return false;
+ }
+
+ bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result) { // optional 8 bit register, looked up lower-cased in reg8
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) {
+   std::string name {boost::algorithm::to_lower_copy(match[0].str())};
+   if (reg8.contains(name)) { // pos only advances for a known register name
+    pos += name.size();
+    result = Asm::Args::Register8(name);
+    return true;
+   }
+  }
+
+  return false;
+ }
+
+ bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result) { // optional 32 bit register, looked up lower-cased in reg32
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) {
+   std::string name {boost::algorithm::to_lower_copy(match[0].str())};
+   if (reg32.contains(name)) { // pos only advances for a known register name
+    pos += name.size();
+    result = Asm::Args::Register32(name);
+    return true;
+   }
+  }
+
+  return false;
+ }
+
+ bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result) { // optional 64 bit register, looked up lower-cased in reg64
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_name("[[:alpha:]]+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) {
+   std::string name {boost::algorithm::to_lower_copy(match[0].str())};
+   if (reg64.contains(name)) { // pos only advances for a known register name
+    pos += name.size();
+    result = Asm::Args::Register64(name);
+    return true;
+   }
+  }
+
+  return false;
+ }
+
+ bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result) { // optional decimal or 0x-prefixed hex immediate
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_name("0x([[:xdigit:]]+)|[[:digit:]]+", std::regex_constants::ECMAScript); // hex first: alternation is ordered, "0x10" must not match as "0"
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous)) {
+   int32_t value{};
+   try {
+    value = static_cast<int32_t>(std::stoll(match[0].str(), nullptr, match[1].matched ? 16 : 10)); // group 1 only participates for hex
+   } catch (...) {
+    throw std::runtime_error("Assembler parse error: Bad Immediate: "s + match[0].str());
+   }
+   pos += match[0].length();
+   result = Asm::Args::Immediate32(static_cast<uint32_t>(value));
+   return true;
+  }
+
+  return false;
+ }
+
+ // parse optional single operand: 8/32/64 bit register or 32 bit immediate, tried in that order
+ bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  if (parseRegister8(asm_code, pos, result))
+   return true;
+  if (parseRegister32(asm_code, pos, result))
+   return true;
+  if (parseRegister64(asm_code, pos, result))
+   return true;
+
+  if (parseImmediate32(asm_code, pos, result))
+   return true;
+
+  return false;
+ }
+
+ // parse optional multiple operands, separated by commas; throws std::runtime_error if a comma is not followed by an operand
+ void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) {
+  std::any operand;
+  if (parseOperand(asm_code, pos, operand)) {
+   result.push_back(operand);
+   parseWhitespace(asm_code, pos);
+   while (pos < asm_code.size() && asm_code[pos] == ',') {
+    pos++;
+    if (parseOperand(asm_code, pos, operand)) {
+     result.push_back(operand);
+    } else {
+     throw std::runtime_error("Assembler error: expected operand after comma");
+    }
+    parseWhitespace(asm_code, pos);
+   }
+  }
+ }
+
+ // parse optional comment: starts with '#', ';' or "//" and extends to the end of the line
+ void parseComment(const std::string& asm_code, size_t& pos) {
+  parseWhitespace(asm_code, pos);
+
+  static const std::regex re_comment("(#|;|//).*", std::regex_constants::ECMAScript); // '.' does not match line terminators
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_comment, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+  }
+ }
+
+ // parse end of line (or whole code); consumes a run of LF/CR characters, so empty lines are skipped as well
+ bool parseEol(const std::string& asm_code, size_t& pos) {
+  parseWhitespace(asm_code, pos);
+
+  if (pos < asm_code.size() && asm_code[pos] != 0x0a && asm_code[pos] != 0x0d)
+   return false; // this is the only case where parseEol() doesn't work
+
+  while (pos < asm_code.size()) {
+   char c { asm_code[pos] };
+   if (c == 0x0a || c == 0x0d) {
+    pos++;
+   } else {
+    break;
+   }
+  }
+
+  return true;
+ }
+
+ // parse single line, appending a Label and/or an op chunk to result; throws std::runtime_error on unparsed rest
+ void parseLine(const std::string& asm_code, size_t& pos, std::vector<std::shared_ptr<Chunk>>& result) {
+  // all optional:
+  // label: mnemonic operands... ;comment <eol>
+
+  std::string result_string;
+  if (parseLabel(asm_code, pos, result_string))
+   result.emplace_back(std::make_shared<Label>(result_string));
+
+  if (parseMnemonic(asm_code, pos, result_string)) {
+   Asm::Args args;
+   parseOperands(asm_code, pos, args);
+   result.emplace_back(makeOp(result_string, args));
+  }
+
+  parseComment(asm_code, pos);
+
+  if (!parseEol(asm_code, pos))
+   throw std::runtime_error("Assembler error at pos "s + std::to_string(pos));
+ }
+
+} // namespace
+
+std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code)
 {
- return makeOp("lahf"); // TODO
+ std::vector<std::shared_ptr<Chunk>> result;
+ size_t pos{0};
+
+ while (pos != asm_code.size()) { // terminates: each parseLine() call either consumes input or throws
+  parseLine(asm_code, pos, result);
+ }
+
+ return result;
 }
+
diff --git a/asm/parse.h b/asm/parse.h
index 1e6a202..1b55f7f 100644
--- a/asm/parse.h
+++ b/asm/parse.h
@@ -5,4 +5,5 @@
 #include <memory>
 #include <string>
 
-std::shared_ptr<Chunk> parseAsm(const std::string& line);
+// asm_code: multiline asm code; returns the parsed chunks in source order, throws std::runtime_error on syntax errors
+std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code);
diff --git a/tests/test-asm.cpp b/tests/test-asm.cpp
index 2d3afa0..f4a1a2c 100644
--- a/tests/test-asm.cpp
+++ b/tests/test-asm.cpp
@@ -1,5 +1,6 @@
 #include "asm/chunk.h"
 #include "asm/assembler.h"
+#include "asm/parse.h"
 #include "asm/segment.h"
 
 #include "asm/intel64/all_ops.h"
@@ -37,6 +38,20 @@ protected:
  }
 };
 
+class AsmParseTest: public ::testing::Test
+{
+protected:
+ AsmParseTest() {
+  //debug = true;
+ }
+ ~AsmParseTest() {
+ }
+ void SetUp(){
+ }
+ void TearDown(){
+ }
+};
+
 TEST_F(AsmTest, Intel64_add) {
  Segment segment;
  Asm::Args args{{Asm::Args::Register32("eax"), Asm::Args::Immediate32(1)}};
@@ -130,3 +145,100 @@ TEST_F(AsmTest, Intel64_multiple) {
   0x01, 0x02, 0x03 // data
  }));
 }
+
+TEST_F(AsmParseTest, parse_empty) {
+ std::vector<std::shared_ptr<Chunk>> chunks0{parseAsm("")};
+ ASSERT_EQ(chunks0.size(), 0);
+
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("\n\n")};
+ ASSERT_EQ(chunks1.size(), 0);
+
+ std::vector<std::shared_ptr<Chunk>> chunks2{parseAsm("\n\n")};
+ ASSERT_EQ(chunks2.size(), 0);
+}
+
+TEST_F(AsmParseTest, parse_op_0) {
+ std::vector<std::shared_ptr<Chunk>> chunks0{parseAsm("nop")};
+ ASSERT_EQ(chunks0.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_1) {
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("neg edi")};
+ ASSERT_EQ(chunks1.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_2) {
+ std::vector<std::shared_ptr<Chunk>> chunks2{parseAsm("add eax, edx")};
+ ASSERT_EQ(chunks2.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_3) {
+ std::vector<std::shared_ptr<Chunk>> chunks3{parseAsm("add eax, 3")};
+ ASSERT_EQ(chunks3.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_4) { // NOTE: parseOperand() has no branch for memory operands ("[...]") yet (WIP)
+ std::vector<std::shared_ptr<Chunk>> chunks4{parseAsm("add [edi], 3")};
+ ASSERT_EQ(chunks4.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_5) {
+ std::vector<std::shared_ptr<Chunk>> chunks5{parseAsm("add byte ptr [edi], 3")};
+ ASSERT_EQ(chunks5.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_6) {
+ std::vector<std::shared_ptr<Chunk>> chunks6{parseAsm("add dword ptr[edi], 3")};
+ ASSERT_EQ(chunks6.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_op_7) {
+ std::vector<std::shared_ptr<Chunk>> chunks7{parseAsm("add qword ptr[edi], 3")};
+ ASSERT_EQ(chunks7.size(), 1);
+}
+
+TEST_F(AsmParseTest, parse_label) {
+ std::vector<std::shared_ptr<Chunk>> chunks0{parseAsm("label1:")};
+ ASSERT_EQ(chunks0.size(), 1);
+
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("label2: add eax, 3")};
+ ASSERT_EQ(chunks1.size(), 2);
+}
+
+TEST_F(AsmParseTest, parse_multiline) {
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("add eax, 3\n")};
+ ASSERT_EQ(chunks1.size(), 1);
+
+ std::vector<std::shared_ptr<Chunk>> chunks2{parseAsm("label2: add eax, 3\n mul rdx")};
+ ASSERT_EQ(chunks2.size(), 3);
+
+ std::vector<std::shared_ptr<Chunk>> chunks3{parseAsm("label2: add eax, 3\n mul rdx\n")};
+ ASSERT_EQ(chunks3.size(), 3);
+
+ std::vector<std::shared_ptr<Chunk>> chunks4{parseAsm("label2: add eax, 3\n\n\n mul rdx")};
+ ASSERT_EQ(chunks4.size(), 3);
+}
+
+TEST_F(AsmParseTest, parse_comment) {
+ std::vector<std::shared_ptr<Chunk>> chunks0{parseAsm(" ; Comment 1")};
+ ASSERT_EQ(chunks0.size(), 0);
+
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("label2: add eax, 3 ; A comment")};
+ ASSERT_EQ(chunks1.size(), 2);
+
+ std::vector<std::shared_ptr<Chunk>> chunks3{parseAsm("label2: add eax, 3 // Another comment")};
+ ASSERT_EQ(chunks3.size(), 2);
+
+ std::vector<std::shared_ptr<Chunk>> chunks4{parseAsm("label2: add eax, 3 // Another comment\nadd rax, rdi")};
+ ASSERT_EQ(chunks4.size(), 3);
+
+ std::vector<std::shared_ptr<Chunk>> chunks5{parseAsm("label2: add eax, 3 // Another comment\n \t add rax, rdi")};
+ ASSERT_EQ(chunks5.size(), 3);
+}
+
+TEST_F(AsmParseTest, parse_error) {
+ ASSERT_THROW(parseAsm(" add label1:"), std::runtime_error);
+
+ ASSERT_THROW(parseAsm(" add eax , "), std::runtime_error);
+ ASSERT_THROW(parseAsm(" add eax,"), std::runtime_error);
+}
diff --git a/tests/test-cpp.cpp b/tests/test-cpp.cpp
index 95d271d..2e1b29e 100644
--- a/tests/test-cpp.cpp
+++ b/tests/test-cpp.cpp
@@ -24,7 +24,7 @@ class CppTest: public ::testing::Test
 {
 protected:
  CppTest() {
-  debug = true;
+  //debug = true;
  }
  ~CppTest() {
  }