summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-11-19 22:31:33 +0100
committerRoland Reichwein <mail@reichwein.it>2020-11-19 22:31:33 +0100
commit5c0611b998e039c8547cfa3841da3567e13446a8 (patch)
tree3f1dc0a8371996426f99d395ad3f0fa9be503ea5
parent1937e301b6cd185c8ce907b9184142e82e76fda4 (diff)
Add assembler parser (WIP)
-rw-r--r--asm/intel64/encode.cpp12
-rw-r--r--asm/parse.cpp251
-rw-r--r--asm/parse.h3
-rw-r--r--tests/test-asm.cpp112
-rw-r--r--tests/test-cpp.cpp2
5 files changed, 370 insertions, 10 deletions
diff --git a/asm/intel64/encode.cpp b/asm/intel64/encode.cpp
index 21b6629..51ca7a0 100644
--- a/asm/intel64/encode.cpp
+++ b/asm/intel64/encode.cpp
@@ -133,18 +133,18 @@ void Asm::toMachineCode(const FlowGraph::Graph& graph, Segment& segment)
if (op.type() == FlowGraph::UnaryOperationType::BitwiseNot) {
segment.push_back(makeLoadValue(operands[1], graph));
- segment.push_back(parseAsm("not eax"));
+ segment.append(parseAsm("not eax"));
segment.push_back(makeStoreValue(operands[0], graph));
} else if (op.type() == FlowGraph::UnaryOperationType::LogicalNot) {
segment.push_back(makeLoadValue(operands[1], graph));
- segment.push_back(parseAsm("bsr eax")); // ZF=1 iff eax=0
- segment.push_back(parseAsm("lahf")); // ZF in AH bit 6
- segment.push_back(parseAsm("shr eax, 14")); // ZF in eax bit 0
- segment.push_back(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag
+ segment.append(parseAsm("bsr eax")); // ZF=1 iff eax=0
+ segment.append(parseAsm("lahf")); // ZF in AH bit 6
+ segment.append(parseAsm("shr eax, 14")); // ZF in eax bit 0
+ segment.append(parseAsm("and eax, 1")); // now, 0 or 1 is in eax, negated because of zero flag
segment.push_back(makeStoreValue(operands[0], graph));
} else if (op.type() == FlowGraph::UnaryOperationType::Minus) {
segment.push_back(makeLoadValue(operands[1], graph));
- segment.push_back(parseAsm("neg eax"));
+ segment.append(parseAsm("neg eax"));
segment.push_back(makeStoreValue(operands[0], graph));
} else
throw std::runtime_error("ICE: Asm: Unsupported unary operation type: "s + std::to_string(static_cast<int>(op.type())));
diff --git a/asm/parse.cpp b/asm/parse.cpp
index 350d86e..3b6e6be 100644
--- a/asm/parse.cpp
+++ b/asm/parse.cpp
@@ -2,7 +2,254 @@
#include "asm/assembler.h"
-std::shared_ptr<Chunk> parseAsm(const std::string& line)
+#include <boost/algorithm/string.hpp>
+
+#include <exception>
+#include <regex>
+#include <unordered_set>
+
+using namespace std::string_literals;
+
+namespace {
+
+ // Lower-case names of the 8 bit registers accepted as operands.
+ // const: pure lookup table, never modified after construction.
+ const std::unordered_set<std::string> reg8 {
+  "al", "ah",
+  "bl", "bh",
+  "cl", "ch",
+  "dl", "dh",
+ };
+
+ // Lower-case names of the 32 bit registers accepted as operands.
+ const std::unordered_set<std::string> reg32 {
+  "eax", "esp",
+  "ebx", "ebp",
+  "ecx", "esi",
+  "edx", "edi",
+ };
+
+ // Lower-case names of the 64 bit registers accepted as operands.
+ const std::unordered_set<std::string> reg64 {
+  "rax", "rsp",
+  "rbx", "rbp",
+  "rcx", "rsi",
+  "rdx", "rdi",
+ };
+
+ // Skip optional whitespace: advances pos past any run of spaces and tabs
+ // starting at pos. Line breaks are not consumed here.
+ // asm_code: text being parsed; pos: current offset into asm_code (in/out).
+ void parseWhitespace(const std::string& asm_code, size_t& pos) {
+  // Character class "space OR tab". The former pattern "( \\t)+" only matched
+  // a space immediately followed by a tab, so a single blank was never skipped.
+  static const std::regex re_whitespace("[ \\t]+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_whitespace, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+  }
+ }
+
+ // Parse optional label of the form "name:" (a letter followed by any number
+ // of letters or digits, then a colon).
+ // On success: stores the name without the colon in result, advances pos past
+ // the colon and returns true.
+ // Otherwise: returns false and leaves result untouched; leading whitespace
+ // may already have been consumed from pos.
+ bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) {
+  parseWhitespace(asm_code, pos);
+
+  // "*" instead of "+" after the first letter: one-character labels such as
+  // "a:" are valid, too.
+  static const std::regex re_label("([[:alpha:]][[:alnum:]]*):", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_label, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+   result = match[1];
+   return true;
+  }
+
+  return false;
+ }
+
+ // Try to read a mnemonic (a letter followed by at least one letter or digit)
+ // at pos, after skipping whitespace.
+ // Returns true iff a mnemonic was found; it is then stored lower-cased in
+ // result and pos is moved behind it.
+ bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) {
+  parseWhitespace(asm_code, pos);
+
+  const std::regex pattern("[[:alpha:]]([[:alnum:]])+", std::regex_constants::ECMAScript);
+  std::smatch found;
+  const bool matched = std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, pattern, std::regex_constants::match_continuous);
+  if (!matched)
+   return false;
+
+  // lower-casing keeps the length, so the result size equals the consumed size
+  result = boost::algorithm::to_lower_copy(found[0].str());
+  pos += result.size();
+  return true;
+ }
+
+ // Try to read the name of an 8 bit register (see reg8, case-insensitive) at
+ // pos, after skipping whitespace.
+ // Returns true iff found; result then holds an Asm::Args::Register8 and pos
+ // is moved behind the name. On failure only the whitespace is consumed.
+ bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  const std::regex pattern("[[:alpha:]]+", std::regex_constants::ECMAScript);
+  std::smatch found;
+  if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, pattern, std::regex_constants::match_continuous))
+   return false;
+
+  const std::string lowered {boost::algorithm::to_lower_copy(found[0].str())};
+  if (!reg8.contains(lowered))
+   return false;
+
+  pos += lowered.size();
+  result = Asm::Args::Register8(lowered);
+  return true;
+ }
+
+ // Try to read the name of a 32 bit register (see reg32, case-insensitive) at
+ // pos, after skipping whitespace.
+ // Returns true iff found; result then holds an Asm::Args::Register32 and pos
+ // is moved behind the name. On failure only the whitespace is consumed.
+ bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  const std::regex pattern("[[:alpha:]]+", std::regex_constants::ECMAScript);
+  std::smatch found;
+  if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, pattern, std::regex_constants::match_continuous))
+   return false;
+
+  const std::string lowered {boost::algorithm::to_lower_copy(found[0].str())};
+  if (!reg32.contains(lowered))
+   return false;
+
+  pos += lowered.size();
+  result = Asm::Args::Register32(lowered);
+  return true;
+ }
+
+ // Try to read the name of a 64 bit register (see reg64, case-insensitive) at
+ // pos, after skipping whitespace.
+ // Returns true iff found; result then holds an Asm::Args::Register64 and pos
+ // is moved behind the name. On failure only the whitespace is consumed.
+ bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  const std::regex pattern("[[:alpha:]]+", std::regex_constants::ECMAScript);
+  std::smatch found;
+  if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, pattern, std::regex_constants::match_continuous))
+   return false;
+
+  const std::string lowered {boost::algorithm::to_lower_copy(found[0].str())};
+  if (!reg64.contains(lowered))
+   return false;
+
+  pos += lowered.size();
+  result = Asm::Args::Register64(lowered);
+  return true;
+ }
+
+ // Parse optional 32 bit immediate at pos (after skipping whitespace), written
+ // either in decimal ("42") or in hexadecimal with "0x" prefix ("0x2a").
+ // Returns true iff a number was found; result then holds an
+ // Asm::Args::Immediate32 and pos is moved behind the number.
+ // Throws std::runtime_error if the number does not fit into 32 bits.
+ bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  // Hex alternative first: ECMAScript alternation is ordered, so with
+  // "[[:digit:]]+" in front only the leading "0" of "0x2a" would be matched.
+  static const std::regex re_number("0x[[:xdigit:]]+|[[:digit:]]+", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_number, std::regex_constants::match_continuous)) {
+   const std::string text {match[0].str()};
+   unsigned long long value{};
+   try {
+    // Base 16 accepts the "0x" prefix; everything else is decimal (a leading
+    // zero does not mean octal).
+    const bool is_hex {text.size() > 1 && text[1] == 'x'};
+    value = std::stoull(text, nullptr, is_hex ? 16 : 10);
+   } catch (const std::exception&) {
+    throw std::runtime_error("Assembler parse error: Bad Immediate: "s + text);
+   }
+   // Reject instead of silently truncating to the lower 32 bits
+   if (value > 0xFFFFFFFFull)
+    throw std::runtime_error("Assembler parse error: Bad Immediate: "s + text);
+
+   pos += match[0].length();
+   result = Asm::Args::Immediate32(static_cast<uint32_t>(value));
+   return true;
+  }
+
+  return false;
+ }
+
+ // Try to read one operand at pos: an 8, 32 or 64 bit register or a 32 bit
+ // immediate, tried in this order. The first parser that succeeds fills
+ // result and advances pos.
+ // Returns true iff an operand was found.
+ bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result) {
+  parseWhitespace(asm_code, pos);
+
+  // short-circuit evaluation stops at the first successful alternative
+  return parseRegister8(asm_code, pos, result)
+   || parseRegister32(asm_code, pos, result)
+   || parseRegister64(asm_code, pos, result)
+   || parseImmediate32(asm_code, pos, result);
+ }
+
+ // Read an optional, comma separated list of operands starting at pos and
+ // append each one to result.
+ // Throws std::runtime_error if a comma is not followed by an operand.
+ void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) {
+  std::any current;
+
+  // no first operand: nothing to do
+  if (!parseOperand(asm_code, pos, current))
+   return;
+  result.push_back(current);
+
+  // after every operand: skip blanks, then continue while a comma follows
+  for (parseWhitespace(asm_code, pos); pos < asm_code.size() && asm_code[pos] == ','; parseWhitespace(asm_code, pos)) {
+   ++pos; // consume the comma
+   if (!parseOperand(asm_code, pos, current))
+    throw std::runtime_error("Assembler error: expected operand after comma");
+   result.push_back(current);
+  }
+ }
+
+ // Parse optional comment: after skipping whitespace, consumes everything from
+ // a comment introducer (";", "#" or "//") up to, but not including, the line
+ // break. Does nothing if there is no comment at pos.
+ void parseComment(const std::string& asm_code, size_t& pos) {
+  parseWhitespace(asm_code, pos);
+
+  // ";" is the classic assembler comment character; "." does not match line
+  // terminators in ECMAScript, so the match stops at the end of the line.
+  static const std::regex re_comment("(;|#|//).*", std::regex_constants::ECMAScript);
+
+  std::smatch match;
+  if (std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_comment, std::regex_constants::match_continuous)) {
+   pos += match[0].length();
+  }
+ }
+
+ // Expect the end of the current line (or of the whole code) at pos.
+ // Skips blanks, then consumes every directly following LF / CR character, so
+ // empty lines are swallowed as well.
+ // Returns false, without consuming the offending character, iff something
+ // other than a line break or the end of asm_code follows.
+ bool parseEol(const std::string& asm_code, size_t& pos) {
+  const auto isLineBreak = [](char c) { return c == 0x0a || c == 0x0d; };
+
+  parseWhitespace(asm_code, pos);
+
+  const size_t length {asm_code.size()};
+  if (pos < length && !isLineBreak(asm_code[pos]))
+   return false; // unexpected content before the end of the line
+
+  while (pos < length && isLineBreak(asm_code[pos]))
+   ++pos;
+
+  return true;
+ }
+
+ // Parse one line of assembler code starting at pos. Every part is optional:
+ //   label: mnemonic operands... comment <eol>
+ // A label and an operation, if present, are appended to result in this order.
+ // Throws std::runtime_error if anything else precedes the end of the line.
+ void parseLine(const std::string& asm_code, size_t& pos, std::vector<std::shared_ptr<Chunk>>& result) {
+  std::string token;
+
+  if (parseLabel(asm_code, pos, token)) {
+   result.emplace_back(std::make_shared<Label>(token));
+  }
+
+  if (parseMnemonic(asm_code, pos, token)) {
+   Asm::Args operands;
+   parseOperands(asm_code, pos, operands);
+   result.emplace_back(makeOp(token, operands));
+  }
+
+  parseComment(asm_code, pos);
+
+  const bool at_line_end {parseEol(asm_code, pos)};
+  if (!at_line_end) {
+   throw std::runtime_error("Assembler error at pos "s + std::to_string(pos));
+  }
+ }
+
+} // namespace
+
+std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code) // parses asm_code line by line; errors surface as std::runtime_error from parseLine()
{
- return makeOp("lahf"); // TODO
+ std::vector<std::shared_ptr<Chunk>> result;
+ size_t pos{0};
+
+ while (pos != asm_code.size()) { // each parseLine() call advances pos, reaches the end or throws, so the loop terminates
+ parseLine(asm_code, pos, result);
+ }
+
+ return result;
}
+
diff --git a/asm/parse.h b/asm/parse.h
index 1e6a202..1b55f7f 100644
--- a/asm/parse.h
+++ b/asm/parse.h
@@ -5,4 +5,5 @@
#include <memory>
#include <string>
-std::shared_ptr<Chunk> parseAsm(const std::string& line);
+// asm_code: multiline asm code; returns the parsed chunks (labels and operations) in source order
+std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code);
diff --git a/tests/test-asm.cpp b/tests/test-asm.cpp
index 2d3afa0..f4a1a2c 100644
--- a/tests/test-asm.cpp
+++ b/tests/test-asm.cpp
@@ -1,5 +1,6 @@
#include "asm/chunk.h"
#include "asm/assembler.h"
+#include "asm/parse.h"
#include "asm/segment.h"
#include "asm/intel64/all_ops.h"
@@ -37,6 +38,20 @@ protected:
}
};
+// Fixture for the assembler parser tests; carries no state of its own.
+class AsmParseTest: public ::testing::Test
+{
+protected:
+ AsmParseTest() {
+  //debug = true;
+ }
+ ~AsmParseTest() override {}
+
+ // nothing to prepare or clean up per test
+ void SetUp() override {}
+ void TearDown() override {}
+};
+
TEST_F(AsmTest, Intel64_add) {
Segment segment;
Asm::Args args{{Asm::Args::Register32("eax"), Asm::Args::Immediate32(1)}};
@@ -130,3 +145,100 @@ TEST_F(AsmTest, Intel64_multiple) {
0x01, 0x02, 0x03 // data
}));
}
+
+// Empty input and input made of line breaks only must not produce chunks.
+TEST_F(AsmParseTest, parse_empty) {
+ std::vector<std::shared_ptr<Chunk>> chunks0{parseAsm("")};
+ ASSERT_EQ(chunks0.size(), 0);
+
+ std::vector<std::shared_ptr<Chunk>> chunks1{parseAsm("\n\n")};
+ ASSERT_EQ(chunks1.size(), 0);
+
+ // CR/LF line ends (this case used to be a verbatim copy of the one above)
+ std::vector<std::shared_ptr<Chunk>> chunks2{parseAsm("\r\n\r\n")};
+ ASSERT_EQ(chunks2.size(), 0);
+}
+
+// Operation without operands
+TEST_F(AsmParseTest, parse_op_0) {
+ ASSERT_EQ(parseAsm("nop").size(), 1u);
+}
+
+// Operation with a single register operand
+TEST_F(AsmParseTest, parse_op_1) {
+ ASSERT_EQ(parseAsm("neg edi").size(), 1u);
+}
+
+// Operation with two register operands
+TEST_F(AsmParseTest, parse_op_2) {
+ ASSERT_EQ(parseAsm("add eax, edx").size(), 1u);
+}
+
+// Operation with register and immediate operand
+TEST_F(AsmParseTest, parse_op_3) {
+ ASSERT_EQ(parseAsm("add eax, 3").size(), 1u);
+}
+
+// Operation with a bracketed register operand and an immediate
+TEST_F(AsmParseTest, parse_op_4) {
+ ASSERT_EQ(parseAsm("add [edi], 3").size(), 1u);
+}
+
+// Bracketed operand with "byte ptr" size prefix
+TEST_F(AsmParseTest, parse_op_5) {
+ ASSERT_EQ(parseAsm("add byte ptr [edi], 3").size(), 1u);
+}
+
+// Bracketed operand with "dword ptr" size prefix
+TEST_F(AsmParseTest, parse_op_6) {
+ ASSERT_EQ(parseAsm("add dword ptr[edi], 3").size(), 1u);
+}
+
+// Bracketed operand with "qword ptr" size prefix
+TEST_F(AsmParseTest, parse_op_7) {
+ ASSERT_EQ(parseAsm("add qword ptr[edi], 3").size(), 1u);
+}
+
+// A label counts as a chunk of its own, with or without an operation behind it
+TEST_F(AsmParseTest, parse_label) {
+ const auto chunkCount = [](const std::string& code) { return parseAsm(code).size(); };
+
+ ASSERT_EQ(chunkCount("label1:"), 1u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3"), 2u);
+}
+
+// Several lines, with and without trailing line break and with empty lines
+TEST_F(AsmParseTest, parse_multiline) {
+ const auto chunkCount = [](const std::string& code) { return parseAsm(code).size(); };
+
+ ASSERT_EQ(chunkCount("add eax, 3\n"), 1u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3\n mul rdx"), 3u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3\n mul rdx\n"), 3u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3\n\n\n mul rdx"), 3u);
+}
+
+// Comments never produce chunks, whether alone on a line or behind code
+TEST_F(AsmParseTest, parse_comment) {
+ const auto chunkCount = [](const std::string& code) { return parseAsm(code).size(); };
+
+ ASSERT_EQ(chunkCount(" ; Comment 1"), 0u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3 ; A comment"), 2u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3 // Another comment"), 2u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3 // Another comment\nadd rax, rdi"), 3u);
+ ASSERT_EQ(chunkCount("label2: add eax, 3 // Another comment\n \t add rax, rdi"), 3u);
+}
+
+// Malformed lines: label behind a mnemonic, dangling comma
+TEST_F(AsmParseTest, parse_error) {
+ for (const char* bad_code: {" add label1:", " add eax , ", " add eax,"}) {
+  ASSERT_THROW(parseAsm(bad_code), std::runtime_error);
+ }
+}
diff --git a/tests/test-cpp.cpp b/tests/test-cpp.cpp
index 95d271d..2e1b29e 100644
--- a/tests/test-cpp.cpp
+++ b/tests/test-cpp.cpp
@@ -24,7 +24,7 @@ class CppTest: public ::testing::Test
{
protected:
CppTest() {
- debug = true;
+ //debug = true;
}
~CppTest() {
}