summaryrefslogtreecommitdiffhomepage
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2024-08-31 18:29:58 +0200
committerRoland Reichwein <mail@reichwein.it>2024-08-31 18:29:58 +0200
commit02153da4d5954261f6649e2980bc88f6d29e45a6 (patch)
tree4cc1ce6d1124edf791e48fcdb28b33a614d1a358
parentf8c4fe1614cc79df9f97c8a7754cf2a5aaf5063d (diff)
ARM assembler (WIP)
-rw-r--r--asm/arm/README.md4
-rw-r--r--asm/arm/TODO1
-rw-r--r--asm/arm/YMakefile8
-rw-r--r--asm/arm/assembler.h38
-rw-r--r--asm/arm/instruction.h109
-rw-r--r--asm/arm/main.cpp41
-rw-r--r--asm/arm/parse.cpp441
-rw-r--r--asm/arm/parse.h7
8 files changed, 649 insertions, 0 deletions
diff --git a/asm/arm/README.md b/asm/arm/README.md
new file mode 100644
index 0000000..742c243
--- /dev/null
+++ b/asm/arm/README.md
@@ -0,0 +1,4 @@
+Assembler for ARMv6-M
+=====================
+
+Supports the Thumb instruction set.
diff --git a/asm/arm/TODO b/asm/arm/TODO
new file mode 100644
index 0000000..2d47ab0
--- /dev/null
+++ b/asm/arm/TODO
@@ -0,0 +1 @@
+mnemonic alias
diff --git a/asm/arm/YMakefile b/asm/arm/YMakefile
new file mode 100644
index 0000000..833bb88
--- /dev/null
+++ b/asm/arm/YMakefile
@@ -0,0 +1,8 @@
+<ymake>
+ <build>
+ <name>arm</name>
+ <source>main.cpp</source>
+ <linklib>fmt</linklib>
+ <linklib>reichwein</linklib>
+ </build>
+</ymake>
diff --git a/asm/arm/assembler.h b/asm/arm/assembler.h
new file mode 100644
index 0000000..13eca00
--- /dev/null
+++ b/asm/arm/assembler.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "instruction.h"
+#include "parse.h"
+
+// Common base class of everything derived from it in this header
+// (OpCode and Data).
+class Encoding
+{
+public:
+  // Encoding is used as a base class; a virtual destructor keeps destruction
+  // through a base pointer or reference well-defined.
+  virtual ~Encoding() = default;
+};
+
+// Encoding that holds an Instruction together with a list of argument strings.
+// Both members are private and there are no member functions yet.
+class OpCode : public Encoding
+{
+  Instruction _instruction;
+  std::vector<std::string> _arguments;
+};
+
+class Data: public Encoding // empty so far; presumably meant for raw data next to instructions - TODO confirm
+{
+};
+
+using Assembly = std::vector<Encoding>; // NOTE(review): elements are stored by value, so an OpCode/Data put in here is sliced to Encoding - confirm intent
+
+// Entry point of the assembler. Both operations are placeholders that
+// currently always throw std::runtime_error.
+class Assembler
+{
+public:
+  Assembler() = default;
+
+  // planned passes:
+  // 0. parse
+  // 1. assign sizes and offsets to codes (assign labels)
+  // 2. assemble to code sequence (use labels for offsets)
+
+  // source: assembly text. Not implemented: always throws std::runtime_error.
+  code_sequence encode(const std::string& source)
+  {
+    throw std::runtime_error("Assembler.encode unimplemented");
+  }
+
+  // Not implemented: always throws std::runtime_error.
+  std::string decode(code_sequence)
+  {
+    throw std::runtime_error("Assembler.decode unimplemented");
+  }
+};
diff --git a/asm/arm/instruction.h b/asm/arm/instruction.h
new file mode 100644
index 0000000..224cc6e
--- /dev/null
+++ b/asm/arm/instruction.h
@@ -0,0 +1,109 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+using namespace std::string_literals;
+
+// Sequence of bytes as produced by the encoders in this header.
+using code_sequence = std::vector<uint8_t>;
+
+// Split an integer into its bytes, least significant byte first.
+// Only the explicit specializations below (uint16_t, uint32_t) are defined.
+template<typename from_t>
+code_sequence to_code_sequence(from_t v);
+
+// 16 bit value -> 2 bytes, low byte first.
+// `inline` is required: a full specialization defined in a header is an
+// ordinary function and would otherwise be defined once per including
+// translation unit.
+template<>
+inline code_sequence to_code_sequence<uint16_t>(uint16_t v) {
+  return code_sequence{
+    static_cast<uint8_t>(v & 0xFF),
+    static_cast<uint8_t>(v >> 8)};
+}
+
+// 32 bit value -> 4 bytes, low byte first.
+template<>
+inline code_sequence to_code_sequence<uint32_t>(uint32_t v) {
+  return code_sequence{
+    static_cast<uint8_t>(v & 0xFF),
+    static_cast<uint8_t>(v >> 8),
+    static_cast<uint8_t>(v >> 16),
+    static_cast<uint8_t>(v >> 24)};
+}
+
+// Build a 32 bit mask with the `number` most significant bits set.
+// number: count of leading one bits; 0 yields 0, 32 or more yields 0xFFFFFFFF.
+// `inline` because the function is defined in a header.
+inline uint32_t high_bits(uint8_t number)
+{
+  // Shifting a 32 bit value by 32 (number == 0) or by a negative amount
+  // (number > 32) is undefined, so both ends are handled explicitly.
+  if (number == 0)
+    return 0;
+  if (number >= 32)
+    return 0xFFFFFFFFu;
+
+  return ~((static_cast<uint32_t>(1) << (32 - number)) - 1);
+}
+
+// Identify operator with leading bits.
+// Stores a bit pattern and a mask; the mask is kept but not read by any
+// member so far.
+class Pattern
+{
+public:
+  // bits: fixed bits of the pattern; mask: stored alongside, currently unused.
+  Pattern(uint32_t bits, uint32_t mask): _bits{bits}, _mask{mask} {}
+
+  // Return the stored bits converted to T (e.g. uint16_t or uint32_t).
+  // const, so it can also be called on a const Pattern.
+  template<typename T>
+  T encode() const
+  {
+    return static_cast<T>(_bits);
+  }
+
+private:
+  uint32_t _bits;
+  uint32_t _mask;
+};
+
+// Base class of instruction operands. Stores the position value handed in by
+// the derived class; only derived classes can construct it.
+class Operand
+{
+public:
+  // Operands are held through std::shared_ptr<Operand>; a virtual destructor
+  // keeps destruction through the base type well-defined in every case.
+  virtual ~Operand() = default;
+
+protected:
+  Operand(uint8_t pos): _pos(pos){}
+
+private:
+  uint8_t _pos;
+};
+
+// Operand kind for registers; only forwards the position to Operand.
+class Register : public Operand
+{
+public:
+  Register(uint8_t pos)
+    : Operand{pos}
+  {
+  }
+};
+
+// Operand kind for immediates: the position goes to Operand, the second
+// constructor argument is stored as `_bits`.
+class Immediate : public Operand
+{
+public:
+  Immediate(uint8_t pos, uint8_t bits)
+    : Operand{pos}
+    , _bits{bits}
+  {
+  }
+
+private:
+  uint8_t _bits;
+};
+
+// List of operands, shared so the table entries can be copied cheaply.
+using Operands = std::vector<std::shared_ptr<Operand>>;
+
+// Pattern [, Operand, ...]
+// One instruction definition: mnemonic, size in bytes, bit pattern and the
+// operands it takes.
+class Instruction
+{
+public:
+  Instruction(const std::string& mnemonic, uint8_t size, Pattern pattern, Operands operands)
+    : _mnemonic(mnemonic)
+    , _size(size)
+    , _pattern(pattern)
+    , _operands(operands)
+  {
+  }
+
+  // Encode the instruction into bytes.
+  // arguments: not used yet, only the fixed pattern is emitted.
+  // Throws std::runtime_error for sizes other than 2 and 4.
+  code_sequence encode(const std::vector<std::string>& arguments)
+  {
+    switch (_size) {
+    case 2: { // 16 bit thumb insn
+      const uint16_t halfword{_pattern.encode<uint16_t>()};
+      return to_code_sequence(halfword);
+    }
+    case 4: { // 32 bit thumb insn
+      const uint32_t word{_pattern.encode<uint32_t>()};
+      return to_code_sequence(word);
+    }
+    default:
+      throw std::runtime_error("Unsupported instruction size "s + std::to_string(_size));
+    }
+  }
+
+private:
+  std::string _mnemonic;
+  uint8_t _size;
+  Pattern _pattern;
+  Operands _operands;
+};
+
+namespace {
+  // factory functions: create a shared Immediate / Register operand
+  std::shared_ptr<Operand> imm(uint8_t pos, uint8_t size)
+  {
+    return std::make_shared<Immediate>(pos, size);
+  }
+
+  std::shared_ptr<Operand> reg(uint8_t pos)
+  {
+    return std::make_shared<Register>(pos);
+  }
+
+  // Table of known instructions:
+  // mnemonic, size in bytes, pattern (bits, mask), operands
+  std::vector<Instruction> insns{
+    {"adcs", 2, Pattern(0x4140, high_bits(10)), Operands{reg(0), reg(3)}},
+
+    {"lsls", 2, Pattern(0x0000, high_bits(5)), Operands{reg(0), reg(3), imm(6, 5)}}
+  };
+} // namespace
+
+
diff --git a/asm/arm/main.cpp b/asm/arm/main.cpp
new file mode 100644
index 0000000..20e67ab
--- /dev/null
+++ b/asm/arm/main.cpp
@@ -0,0 +1,41 @@
+#include <iostream>
+#include <stdexcept>
+
+#include <fmt/format.h>
+
+#include <libreichwein/file.h>
+
+#include "assembler.h"
+#include "instruction.h"
+
+// Print the bytes of c to stdout as two-digit hex values, separated by
+// blanks, 16 bytes per line, followed by a final line break.
+void dump(const code_sequence& c)
+{
+  size_t index{0};
+  for (const auto byte : c) {
+    const bool line_start{(index % 16) == 0};
+    if (!line_start) {
+      std::cout << " ";
+    } else if (index > 0) {
+      std::cout << std::endl; // start a new row after every 16 bytes
+    }
+    std::cout << fmt::format("{:02x}", byte);
+    ++index;
+  }
+  std::cout << std::endl;
+}
+
+// Assemble the file named by the single command line argument and dump the
+// resulting bytes to stdout.
+// Returns 0 on success, 1 if an exception was caught, 2 on wrong usage.
+int main(int argc, char* argv[])
+{
+  // Exactly one argument (the source file) is expected; anything else used
+  // to exit silently with status 0.
+  if (argc != 2) {
+    std::cerr << "Usage: " << (argc > 0 ? argv[0] : "arm") << " <source file>" << std::endl;
+    return 2;
+  }
+
+  try {
+    Assembler assembler;
+    std::string source {Reichwein::File::getFile(argv[1])};
+    code_sequence result {assembler.encode(source)};
+    dump(result);
+    return 0;
+  } catch (const std::exception& ex) {
+    std::cerr << "Error: " << ex.what() << std::endl;
+    return 1;
+  }
+}
+
diff --git a/asm/arm/parse.cpp b/asm/arm/parse.cpp
new file mode 100644
index 0000000..a3156c2
--- /dev/null
+++ b/asm/arm/parse.cpp
@@ -0,0 +1,441 @@
+#include "parse.h"
+
+#include "asm/assembler.h"
+
+#include <boost/algorithm/string.hpp>
+
+#include <exception>
+#include <functional>
+#include <regex>
+#include <unordered_set>
+
+using namespace std::string_literals;
+
+namespace {
+
+ std::unordered_set<std::string> reg8 { // names accepted by parseRegister8(); NOTE(review): these sets hold x86 register names although the README targets ARMv6-M - presumably carried over, confirm
+ "al", "ah",
+ "bl", "bh",
+ "cl", "ch",
+ "dl", "dh",
+ };
+
+ std::unordered_set<std::string> reg16 { // names accepted by parseRegister16() and parseMem16Ptr64()
+ "ax", "sp",
+ "bx", "bp",
+ "cx", "si",
+ "dx", "di",
+ };
+
+ std::unordered_set<std::string> reg32 { // names accepted by parseRegister32()
+ "eax", "esp",
+ "ebx", "ebp",
+ "ecx", "esi",
+ "edx", "edi",
+ };
+
+ std::unordered_set<std::string> reg64 { // names accepted by parseRegister64() and the Mem8/32/64Ptr64 parsers
+ "rax", "rsp",
+ "rbx", "rbp",
+ "rcx", "rsi",
+ "rdx", "rdi",
+ };
+
+ // Skip optional whitespace (blanks and tabs) starting at pos.
+ // asm_code: text being parsed
+ // pos: in/out offset into asm_code, advanced past the whitespace
+ // Line breaks are not skipped. A pos at or beyond the end is left unchanged.
+ void parseWhitespace(const std::string& asm_code, size_t& pos) {
+   // Plain scan: same result as matching "[ \t]+" at pos, without building a
+   // std::regex on every call.
+   while (pos < asm_code.size() && (asm_code[pos] == ' ' || asm_code[pos] == '\t'))
+     ++pos;
+ }
+
+ // Regex for an identifier: a letter followed by letters or digits.
+ // Used for labels, mnemonics and register names.
+ std::string reg_re{"[[:alpha:]][[:alnum:]]*"};
+
+ // Parse an optional label of the form "name:" at pos (after whitespace).
+ // On success stores the name (without colon) in result, advances pos past
+ // the colon and returns true; otherwise returns false.
+ bool parseLabel(const std::string& asm_code, size_t& pos, std::string& result) {
+   parseWhitespace(asm_code, pos);
+
+   const std::regex label_pattern("("s + reg_re + "):"s, std::regex_constants::ECMAScript);
+   std::smatch found;
+   const bool is_label{std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, label_pattern, std::regex_constants::match_continuous)};
+   if (!is_label)
+     return false;
+
+   result = found[1];
+   pos += found[0].length();
+   return true;
+ }
+
+ // Parse an optional mnemonic (an identifier) at pos, after whitespace.
+ // Returns true iff a mnemonic was found; result then holds it in lower case
+ // and pos is advanced past it.
+ bool parseMnemonic(const std::string& asm_code, size_t& pos, std::string& result) {
+   parseWhitespace(asm_code, pos);
+
+   const std::regex identifier(reg_re, std::regex_constants::ECMAScript);
+   std::smatch found;
+   if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, identifier, std::regex_constants::match_continuous))
+     return false;
+
+   const std::string lowered{boost::algorithm::to_lower_copy(found[0].str())};
+   pos += lowered.size();
+   result = lowered;
+   return true;
+ }
+
+ // Shared part of parseRegister8/16/32/64.
+ // Skips whitespace, reads an identifier at pos and checks its lower-case
+ // form against `names`. On success stores the lower-case name in `name`,
+ // advances pos past it and returns true. Otherwise returns false; pos then
+ // only has the leading whitespace skipped.
+ bool matchRegisterName(const std::string& asm_code, size_t& pos, const std::unordered_set<std::string>& names, std::string& name) {
+   parseWhitespace(asm_code, pos);
+
+   std::regex re_name(reg_re, std::regex_constants::ECMAScript);
+
+   std::smatch match;
+   if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous))
+     return false;
+
+   std::string lowered {boost::algorithm::to_lower_copy(match[0].str())};
+   if (!names.contains(lowered))
+     return false;
+
+   pos += lowered.size();
+   name = lowered;
+   return true;
+ }
+
+ // Parse a register named in reg8. On success result holds an
+ // Asm::Args::Register8, size_hint is set to 8 and true is returned.
+ bool parseRegister8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchRegisterName(asm_code, pos, reg8, name))
+     return false;
+
+   result = Asm::Args::Register8(name);
+   size_hint = 8;
+   return true;
+ }
+
+ // Parse a register named in reg16. On success result holds an
+ // Asm::Args::Register16, size_hint is set to 16 and true is returned.
+ bool parseRegister16(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchRegisterName(asm_code, pos, reg16, name))
+     return false;
+
+   result = Asm::Args::Register16(name);
+   size_hint = 16;
+   return true;
+ }
+
+ // Parse a register named in reg32. On success result holds an
+ // Asm::Args::Register32, size_hint is set to 32 and true is returned.
+ bool parseRegister32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchRegisterName(asm_code, pos, reg32, name))
+     return false;
+
+   result = Asm::Args::Register32(name);
+   size_hint = 32;
+   return true;
+ }
+
+ // Parse a register named in reg64. On success result holds an
+ // Asm::Args::Register64, size_hint is set to 64 and true is returned.
+ bool parseRegister64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchRegisterName(asm_code, pos, reg64, name))
+     return false;
+
+   result = Asm::Args::Register64(name);
+   size_hint = 64;
+   return true;
+ }
+
+ // Shared part of the parseMem*Ptr64 functions.
+ // Skips whitespace, then matches `prefix_re` followed by "[identifier]" at
+ // pos and checks the lower-case identifier against `names`.
+ // prefix_re: regex source for the size prefix; must not contain a capturing
+ //            group, because group 1 is the identifier.
+ // On success stores the lower-case identifier in `name`, advances pos past
+ // the closing bracket and returns true. Otherwise returns false; pos then
+ // only has the leading whitespace skipped.
+ bool matchMemoryOperand(const std::string& asm_code, size_t& pos, const std::string& prefix_re, const std::unordered_set<std::string>& names, std::string& name) {
+   parseWhitespace(asm_code, pos);
+
+   std::regex re_name(prefix_re + "\\[("s + reg_re + ")\\]"s, std::regex_constants::ECMAScript);
+
+   std::smatch match;
+   if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous))
+     return false;
+
+   std::string lowered {boost::algorithm::to_lower_copy(match[1].str())};
+   if (!names.contains(lowered))
+     return false;
+
+   pos += match[0].length();
+   name = lowered;
+   return true;
+ }
+
+ // "byte ptr [reg]" with reg from reg64; sets size_hint to 8.
+ bool parseMem8Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchMemoryOperand(asm_code, pos, "byte ptr *", reg64, name))
+     return false;
+
+   result = Asm::Args::Mem8Ptr64{name};
+   size_hint = 8;
+   return true;
+ }
+
+ // "[reg]" with optional "word ptr" prefix; sets size_hint to 16.
+ // NOTE(review): the name is checked against reg16 while the sibling parsers
+ // use reg64 - looks unintended, but switching to reg64 would make an
+ // unprefixed "[rax]" match here before parseMem32Ptr64; confirm intent.
+ bool parseMem16Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchMemoryOperand(asm_code, pos, "(?:word ptr *)?", reg16, name))
+     return false;
+
+   result = Asm::Args::Mem16Ptr64(name);
+   size_hint = 16;
+   return true;
+ }
+
+ // "[reg]" with optional "dword ptr" prefix, reg from reg64; sets size_hint to 32.
+ bool parseMem32Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchMemoryOperand(asm_code, pos, "(?:dword ptr *)?", reg64, name))
+     return false;
+
+   result = Asm::Args::Mem32Ptr64(name);
+   size_hint = 32;
+   return true;
+ }
+
+ // "qword ptr [reg]" with reg from reg64; sets size_hint to 64.
+ bool parseMem64Ptr64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   std::string name;
+   if (!matchMemoryOperand(asm_code, pos, "qword ptr *", reg64, name))
+     return false;
+
+   result = Asm::Args::Mem64Ptr64(name);
+   size_hint = 64;
+   return true;
+ }
+
+ // Shared part of parseImmediate8/32/64.
+ // Skips whitespace, then matches an unsigned literal at pos: "0x" followed
+ // by hex digits, or decimal digits. Returns false if none starts at pos.
+ // On success stores the literal in `text` and its value in `value`; pos is
+ // NOT advanced past the literal so the caller can still reject the value.
+ // Throws std::runtime_error(bad_message + literal) if the literal does not
+ // fit into an unsigned long long.
+ bool matchImmediate(const std::string& asm_code, size_t& pos, const std::string& bad_message, std::string& text, unsigned long long& value) {
+   parseWhitespace(asm_code, pos);
+
+   // The hex alternative has to come first: ECMAScript alternation takes the
+   // first alternative that matches, so a leading "[[:digit:]]+" would only
+   // consume the "0" of "0x1f".
+   std::regex re_name("0x[[:xdigit:]]+|[[:digit:]]+", std::regex_constants::ECMAScript);
+
+   std::smatch match;
+   if (!std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), match, re_name, std::regex_constants::match_continuous))
+     return false;
+
+   text = match[0].str();
+   const bool is_hex {text.compare(0, 2, "0x") == 0};
+   try {
+     // explicit base: base 16 accepts the "0x" prefix, base 10 keeps
+     // literals with leading zeros decimal
+     value = std::stoull(text, nullptr, is_hex ? 16 : 10);
+   } catch (...) {
+     throw std::runtime_error(bad_message + text);
+   }
+   return true;
+ }
+
+ // Parse an 8 bit immediate; only tried if size_hint is 8.
+ // Throws std::runtime_error if the value is above 255.
+ bool parseImmediate8(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   if (size_hint != 8)
+     return false;
+
+   std::string text;
+   unsigned long long value{};
+   if (!matchImmediate(asm_code, pos, "Assembler parse error: Bad immediate: "s, text, value))
+     return false;
+
+   // checked on the full value, before any narrowing
+   if (value > 255)
+     throw std::runtime_error("Assembler parse error: Bad 8 bit immediate: "s + text);
+
+   pos += text.size();
+   result = Asm::Args::Immediate8(static_cast<uint8_t>(value));
+   return true;
+ }
+
+ // Parse a 32 bit immediate; only tried if size_hint is 32 or 0 (no hint).
+ // Throws std::runtime_error if the value does not fit into 32 bits.
+ bool parseImmediate32(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   if (size_hint != 32 && size_hint != 0)
+     return false;
+
+   std::string text;
+   unsigned long long value{};
+   if (!matchImmediate(asm_code, pos, "Assembler parse error: Bad Immediate: "s, text, value))
+     return false;
+
+   // reject instead of silently truncating
+   if (value > 0xFFFFFFFFull)
+     throw std::runtime_error("Assembler parse error: Bad 32 bit immediate: "s + text);
+
+   pos += text.size();
+   result = Asm::Args::Immediate32(static_cast<uint32_t>(value));
+   return true;
+ }
+
+ // Parse a 64 bit immediate; only tried if size_hint is 64.
+ bool parseImmediate64(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   if (size_hint != 64)
+     return false;
+
+   std::string text;
+   unsigned long long value{};
+   if (!matchImmediate(asm_code, pos, "Assembler parse error: Bad immediate: "s, text, value))
+     return false;
+
+   pos += text.size();
+   result = Asm::Args::Immediate64(static_cast<uint64_t>(value));
+   return true;
+ }
+
+ // Parse an optional single operand at pos.
+ // The parsers are tried in a fixed order (registers, memory operands,
+ // immediates); the first one that succeeds fills result / size_hint and
+ // ends the search. Returns false if none matched.
+ bool parseOperand(const std::string& asm_code, size_t& pos, std::any& result, size_t& size_hint) {
+   parseWhitespace(asm_code, pos);
+
+   // || short-circuits, so later parsers only run if all earlier ones failed
+   return parseRegister8(asm_code, pos, result, size_hint)
+       || parseRegister16(asm_code, pos, result, size_hint)
+       || parseRegister32(asm_code, pos, result, size_hint)
+       || parseRegister64(asm_code, pos, result, size_hint)
+       || parseMem8Ptr64(asm_code, pos, result, size_hint)
+       || parseMem16Ptr64(asm_code, pos, result, size_hint)
+       || parseMem32Ptr64(asm_code, pos, result, size_hint)
+       || parseMem64Ptr64(asm_code, pos, result, size_hint)
+       || parseImmediate8(asm_code, pos, result, size_hint)
+       || parseImmediate32(asm_code, pos, result, size_hint)
+       || parseImmediate64(asm_code, pos, result, size_hint);
+ }
+
+ // Parse an optional comma separated operand list and append every operand
+ // to result. The size hint set by one operand is passed on to the following
+ // ones. Throws std::runtime_error if a comma is not followed by an operand.
+ void parseOperands(const std::string& asm_code, size_t& pos, Asm::Args& result) {
+   std::any current;
+   size_t hint{0}; // in bits, 0=no hint
+
+   if (!parseOperand(asm_code, pos, current, hint))
+     return; // no operands at all
+   result.push_back(current);
+
+   for (parseWhitespace(asm_code, pos);
+        pos < asm_code.size() && asm_code[pos] == ',';
+        parseWhitespace(asm_code, pos)) {
+     ++pos; // consume the comma
+     if (!parseOperand(asm_code, pos, current, hint))
+       throw std::runtime_error("Assembler error: expected operand after comma");
+     result.push_back(current);
+   }
+ }
+
+ // Parse an optional comment at pos (after whitespace): ";" or "//" and
+ // whatever the regex "." matches after it. pos is advanced past the comment.
+ void parseComment(const std::string& asm_code, size_t& pos) {
+   parseWhitespace(asm_code, pos);
+
+   const std::regex comment_pattern("(;|//).*", std::regex_constants::ECMAScript);
+   std::smatch found;
+   const bool has_comment{std::regex_search(asm_code.cbegin() + pos, asm_code.cend(), found, comment_pattern, std::regex_constants::match_continuous)};
+   if (has_comment)
+     pos += found[0].length();
+ }
+
+ // Parse end of line (or end of the whole code) at pos, after whitespace.
+ // Returns false if another character follows; otherwise consumes all
+ // directly following CR/LF characters and returns true.
+ bool parseEol(const std::string& asm_code, size_t& pos) {
+   parseWhitespace(asm_code, pos);
+
+   const auto is_line_break = [](char c) { return c == 0x0a || c == 0x0d; };
+
+   // anything but a line break or the end of input is not an end of line
+   if (pos < asm_code.size() && !is_line_break(asm_code[pos]))
+     return false;
+
+   while (pos < asm_code.size() && is_line_break(asm_code[pos]))
+     ++pos;
+
+   return true;
+ }
+
+ // Parse a single line starting at pos and append its chunks to result.
+ // All parts are optional:
+ //   label: mnemonic operands... ;comment <eol>
+ // Throws std::runtime_error if the line does not end where expected.
+ void parseLine(const std::string& asm_code, size_t& pos, std::vector<std::shared_ptr<Chunk>>& result) {
+   std::string label;
+   const bool has_label{parseLabel(asm_code, pos, label)};
+
+   std::string mnemonic;
+   Asm::Args args;
+   const bool has_mnemonic{parseMnemonic(asm_code, pos, mnemonic)};
+   if (has_mnemonic)
+     parseOperands(asm_code, pos, args);
+
+   parseComment(asm_code, pos);
+
+   if (!parseEol(asm_code, pos))
+     throw std::runtime_error("Assembler error at pos "s + std::to_string(pos));
+
+   // Append only after the whole line was parsed successfully, so that a
+   // malformed line reports the parse error above and adds nothing.
+   if (has_label)
+     result.emplace_back(std::make_shared<Label>(label));
+   if (has_mnemonic)
+     result.emplace_back(makeOp(mnemonic, args));
+ }
+
+} // namespace
+
+// Parse multiline asm code line by line and return the resulting chunks in
+// source order. Errors from parseLine() propagate as exceptions.
+std::vector<std::shared_ptr<Chunk>> parseAsm(const std::string& asm_code)
+{
+  std::vector<std::shared_ptr<Chunk>> chunks;
+
+  // parseLine() advances pos itself
+  for (size_t pos{0}; pos != asm_code.size(); )
+    parseLine(asm_code, pos, chunks);
+
+  return chunks;
+}
+
diff --git a/asm/arm/parse.h b/asm/arm/parse.h
new file mode 100644
index 0000000..4dce4b2
--- /dev/null
+++ b/asm/arm/parse.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+// asm_code: multiline asm code
+std::vector<std::vector<std::string>> parseAsm(const std::string& asm_code); // NOTE(review): parse.cpp defines parseAsm() returning std::vector<std::shared_ptr<Chunk>> - the return types differ, confirm which is intended