diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-10-11 14:25:23 +0200 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-10-11 14:25:23 +0200 |
commit | efbbad04bc093a133ca2aa5a462de0d37b04f929 (patch) | |
tree | 8dc39bee70cd40a03e844b47303528bc63ffbe24 /intel.cpp | |
parent | 0c5a5a94dab6d19192946f0a57d920628823ece1 (diff) |
Added COFF, Intel ASM
Diffstat (limited to 'intel.cpp')
-rw-r--r-- | intel.cpp | 503 |
1 files changed, 503 insertions, 0 deletions
diff --git a/intel.cpp b/intel.cpp new file mode 100644 index 0000000..dfcaa75 --- /dev/null +++ b/intel.cpp @@ -0,0 +1,503 @@ +// Intel assembly language + + +// segments: code, stack + +#include "minicc.h" + +#include <algorithm> +#include <array> +#include <deque> +#include <functional> +#include <stdexcept> +#include <functional> +#include <stdexcept> +#include <string> +#include <unordered_map> +#include <vector> + +using namespace std::string_literals; +using namespace std::placeholders; + +namespace { + + // binary code operators + std::vector<uint8_t> operator+(std::vector<uint8_t> a, const std::vector<uint8_t>& b) { + a.insert(a.end(), b.begin(), b.end()); + return a; + } + + std::vector<uint8_t> operator+(std::vector<uint8_t> a, const uint8_t& b) { + a.push_back(b); + return a; + } + + // REX prefix: 0b0100WRXB + std::vector<uint8_t> REX(std::string s) { + uint8_t result{0b01000000}; + if (s == "W") + result |= 0b00001000; + if (s == "R") + result |= 0b00000100; + if (s == "X") + result |= 0b00000010; + if (s == "B") + result |= 0b00000001; + + return { result }; + } + + std::vector<uint8_t> imm8(std::string s) { + long value{ std::stol(s) }; + uint8_t* bin = reinterpret_cast<uint8_t*>(&value); + return { uint8_t(*bin & 0xFF) }; + } + + std::vector<uint8_t> imm32(std::string s) { + long value{ std::stol(s) }; + uint32_t* bin = reinterpret_cast<uint32_t*>(&value); + return {uint8_t(*bin & 0xFF), uint8_t(*bin >> 8 & 0xFF), uint8_t(*bin >> 16 & 0xFF), uint8_t(*bin >> 24 & 0xFF) }; + } + + std::unordered_map<std::string, size_t> IndexOfRegister{ + {"al", 0}, {"ah", 4}, + {"bl", 3}, {"bh", 7}, + {"cl", 1}, {"ch", 5}, + {"dl", 2}, {"dh", 6}, + + {"ax", 0}, {"sp", 4}, + {"bx", 3}, {"bp", 7}, + {"cx", 1}, {"si", 5}, + {"dx", 2}, {"di", 6}, + + {"eax", 0}, {"esp", 4}, + {"ebx", 3}, {"ebp", 7}, + {"ecx", 1}, {"esi", 5}, + {"edx", 2}, {"edi", 6}, + }; + + // Manual, page 530 + // Reg + Reg/Memory + uint8_t ModRM(std::string reg, std::string rm) { + // TODO: extend + uint8_t result{0b11000000}; + + auto index1{ IndexOfRegister.find(reg) }; + if (index1 == IndexOfRegister.end()) + throw std::runtime_error("Unknown register for arg1: "s + reg); + + result |= (index1->second << 3); + + auto index2{ IndexOfRegister.find(rm) }; + if (index2 == IndexOfRegister.end()) + throw std::runtime_error("Unknown register for arg2: "s + rm); + + result |= index2->second; + + return result; + } + + enum class AddressType { + Relative8, + Relative16, + Relative32, + Absolute8, + Absolute16, + Absolute32, + }; + + struct Address + { + AddressType type; + size_t position; // relative to respective machine code, e.g. byte 1 in jump + std::string label; // where to jump to, as label + }; + + struct InstructionCode + { + std::vector<uint8_t> machine_code; + std::vector<Address> addresses; + }; + + // List of alternative codes + typedef std::deque<InstructionCode> InstructionCodeList; + + bool O1{ true }; // Optimization + + using OP_T = std::vector<uint8_t>; + + InstructionCodeList op_jmp(const std::vector<Token>& sl, std::vector<uint8_t> op_bytes_8, std::vector<uint8_t> op_bytes_32) + { + if (sl.size() == 2) { // JMP rel8 / rel32 + const std::string& label{ sl[1].value }; + InstructionCodeList result; + if (op_bytes_32.size() > 0) { + op_bytes_32.resize(op_bytes_32.size() + 4, 0x00); + result.push_back({ op_bytes_32, { {AddressType::Relative32, op_bytes_32.size() - 4, label} } } ); + } + if (op_bytes_8.size() > 0 && (O1 || op_bytes_32.size() == 0)) { + op_bytes_8.push_back(0x00); + result.push_back({ op_bytes_8, { {AddressType::Relative8, op_bytes_8.size() - 1, label} } }); + } + return result; + } + + // ... TODO + throw std::runtime_error("Unknown command: "s + sl[0].value); + } + + std::unordered_map<std::string, std::function<InstructionCodeList(const std::vector<Token>&)>> ops{ + + // Integer Addition + {"add", [](const std::vector<Token>& sl) -> InstructionCodeList { + if (sl.size() == 3) { + if (sl[1].value == "eax") { // ADD EAX, imm32 + return { { std::vector<uint8_t>{ 0x05 } +imm32(sl[2].value), {} } }; + } else if (sl[1].value == "rax") { // ADD RAX, imm32 + return { { REX("W") + std::vector<uint8_t>{ 0x05 } +imm32(sl[2].value), {} } }; + } + } + + // ... TODO + throw std::runtime_error("Unknown command: "s + sl[0].value); + }}, + + // Call Procedure + {"call", std::bind(op_jmp, _1, OP_T{}, OP_T{ 0xE8 })}, + + // Interrupt + {"int", [](const std::vector<Token>& sl) -> InstructionCodeList { + if (sl.size() == 2) { + if (sl[1].value == "0") { // INT 0 + return { { std::vector<uint8_t>{ 0xCE }} }; + } else if (sl[1].value == "1") { // INT 1 + return { { std::vector<uint8_t>{ 0xF1 }} }; + } else if (sl[1].value == "3") { // INT 3 + return { { std::vector<uint8_t>{ 0xCC }} }; + } else { // INT <...> + return { { std::vector<uint8_t>{ 0xCD } +imm8(sl[2].value) } }; + } + } + + // ... TODO + throw std::runtime_error("Unknown command: "s + sl[0].value); + }}, + + // Unconditional Jump + {"jmp", std::bind(op_jmp, _1, OP_T{ 0xEB }, OP_T{ 0xE9 })}, + + // Conditional Jumps + {"ja", std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, + {"jae", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, + {"jb", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, + {"jbe", std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, + {"jc", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, + {"jecxz", std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, + {"jrcxz", std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, + {"je", std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, + {"jg", std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, + {"jge", std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, + {"jl", std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, + {"jle", std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, + {"jna", std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, + {"jnae", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, + {"jnb", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, + {"jnbe", std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, + {"jnc", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, + {"jne", std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, + {"jng", std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, + {"jnge", std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, + {"jnl", std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, + {"jnle", std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, + {"jno", std::bind(op_jmp, _1, OP_T{ 0x71 }, OP_T{ 0x0F, 0x81 })}, + {"jnp", std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, + {"jns", std::bind(op_jmp, _1, OP_T{ 0x79 }, OP_T{ 0x0F, 0x89 })}, + {"jnz", std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, + {"jo", std::bind(op_jmp, _1, OP_T{ 0x70 }, OP_T{ 0x0F, 0x80 })}, + {"jp", std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, + {"jpe", std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, + {"jpo", std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, + {"js", std::bind(op_jmp, _1, OP_T{ 0x78 }, OP_T{ 0x0F, 0x88 })}, + {"jz", std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, + + // Memory Move + { "mov", [](const std::vector<Token>& sl) -> InstructionCodeList { + if (sl.size() == 3) { + return { { std::vector<uint8_t>{ 0x88 } + ModRM(sl[2].value, sl[1].value), {} } }; // r/m8, r8: ModRM:r/m (w), ModRM:reg (r) + } + + // ... TODO + throw std::runtime_error("Unknown command: "s + sl[0].value); + }}, + + // No Operation + { "nop", [](const std::vector<Token>& sl) -> InstructionCodeList { + return {{ std::vector<uint8_t>{ 0x90 }, {}}}; + }}, + + // Return from procedure + { "ret", [](const std::vector<Token>& sl) -> InstructionCodeList { + return {{ std::vector<uint8_t>{ 0xC3 }, {}}}; // near return; TODO: far return is 0xCB + }}, + + { "xor", [](const std::vector<Token>& sl) -> InstructionCodeList { + if (sl.size() == 3) { + return { { std::vector<uint8_t>{ 0x33 } + ModRM(sl[1].value, sl[2].value) } }; // r8, r/m8: ModRM:reg (w), ModRM:r/m (r) + } + + // ... TODO + throw std::runtime_error("Unknown command: "s + sl[0].value); + }}, + + }; + +#if 0 + prefixes{ + "lock", 0xf0, + + // branch hint + 0x2e, "branch not taken" + 0x3e, "branch taken" + + 0x66, "operand size override" // switch between 16 and 32 bit operands + 0x67, "address size override" // switch between 16 and 32 bit addresses + }; + }; +#endif + +#ifdef ASM_PARSER + BNF GetBNF() { + // TODO: + return { + { "assembler-unit", { + {} + }}, + { "immediate-32", { + {} + }}, + { "mnemonic", { + {} + }}, + { "register", { + {} + }}, + { "register-8", { + {} + }}, + { "register-16", { + {} + }}, + { "register-32", { + {} + }}, + { "register-64", { + {} + }}, + + }; + }; +#endif + + // Checks a 32 bit relative address if it's valid as 8 bit address + bool IsSmallAddress(const InstructionCode& insn) { + if (insn.addresses.size() != 1) + throw std::runtime_error("Bad number of addresses in insn"); + + size_t i{insn.addresses[0].position}; + + if (i > insn.machine_code.size() - 3) + throw std::runtime_error("Bad Address index "s + std::to_string(i) + " in insn with "s + std::to_string(insn.machine_code.size()) + " bytes"s); + + if (std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0x00) == 3 || + std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0xFF) == 3) + return true; + + return false; + } + + +} // namespace + +class Assembler { + + std::unordered_map<std::string, size_t> labels; ///< labels with their positions in instruction list + + /// 1st Level: Instructions + /// 2nd Level: Alternatives + /// 3rd Level: Bytes of single instruction + std::vector<InstructionCodeList> insn_list; + + uint64_t addressFromInstructionIndex(size_t index) + { + // TODO: cache this to prevent repetitive summing + + if (index > insn_list.size()) + throw std::runtime_error("Index "s + std::to_string(index) + " out of range ("s + std::to_string(insn_list.size()) + ")"s); + + uint64_t sum{}; + + for (size_t i = 0; i < index; i++) { + if (insn_list[i].size() < 1) { + throw std::runtime_error("Insufficient alternatives at index "s + std::to_string(i)); + } + + sum += static_cast<uint64_t>(insn_list[i][0].machine_code.size()); + } + + return sum; + } + + uint64_t addressFromLabel(std::string label) + { + auto it{ labels.find(label) }; + if (it == labels.end()) + throw std::runtime_error("Label not found: "s + label); + + return addressFromInstructionIndex(it->second); + } + + std::unordered_map<AddressType, std::function<void(std::vector<uint8_t>&, const Address&, uint64_t)>> addressInserters{ + {AddressType::Relative8, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) + { + int64_t difference = static_cast<int64_t>(addressFromLabel(target_address.label)) - insn_address; + if (difference < -128 || difference > 127) + throw std::runtime_error("Distance too big"); + + int8_t diff8 = static_cast<int8_t>(difference); + uint8_t diff_u8 = *reinterpret_cast<uint8_t*>(&diff8); + + machine_code[target_address.position] = diff_u8; + } + }, + {AddressType::Relative16, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) { throw std::runtime_error("Relative16 Address not yet supported."); }}, + {AddressType::Relative32, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) + { + int64_t difference = static_cast<int64_t>(addressFromLabel(target_address.label)) - insn_address; + if (difference < -4294967296 || difference > 4294967295) + throw std::runtime_error("Distance too big"); + + int32_t diff32 = static_cast<int32_t>(difference); + uint32_t diff_u32 = *reinterpret_cast<uint32_t*>(&diff32); + + machine_code[target_address.position] = diff_u32 & 0xFF; // little endian + machine_code[target_address.position + 1] = diff_u32 >> 8 & 0xFF; + machine_code[target_address.position + 2] = diff_u32 >> 16 & 0xFF; + machine_code[target_address.position + 3] = diff_u32 >> 24 & 0xFF; + } + }, + {AddressType::Absolute8, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute8 Address not yet supported."); }}, + {AddressType::Absolute16, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute16 Address not yet supported."); }}, + {AddressType::Absolute32, [&](std::vector<uint8_t>& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute32 Address not yet supported."); }}, + }; + + void produce_machine_code(std::vector<std::vector<Token>>& tl) + { + for (const auto& t : tl) { + // label: + // label: mnemonic arg1, arg2, arg3 + // mnemonic arg1, arg2, arg3 + + if (t.size() == 2 && t[0].type == "label" && t[1].type == ":") { // label + if (labels.find(t[0].value) != labels.end()) + throw std::runtime_error("Label already defined: "s + t[0].value); + + labels[t[0].value] = insn_list.size(); + } else if (t.size() >= 1 && t[0].type == "instruction") { // instruction + std::string instruction{ t[0].value }; + auto it = ops.find(instruction); + if (it == ops.end()) + throw std::runtime_error("Unknown instruction: "s + instruction); + + InstructionCodeList codes = it->second(t); + + if (codes.size() == 0) + throw std::runtime_error("No instruction generated"); + + insn_list.push_back(codes); + + } else + throw std::runtime_error("Syntax error"s); + } + } + + void insert_addresses() + { + for (size_t i = 0; i < insn_list.size(); i++) { + InstructionCodeList& list{ insn_list[i] }; + if (list.size() == 0) + throw std::runtime_error("No instruction at index "s + std::to_string(i)); + + InstructionCode& code{ list[0] }; + + for (const auto& address : code.addresses) { + addressInserters[address.type](code.machine_code, address, addressFromInstructionIndex(i)); + } + } + } + + void optimize() + { + // reduce Jump sizes via alternatives if possible + bool changed{}; + do { + changed = false; + + for (size_t i = 0; i < insn_list.size(); i++) { + InstructionCodeList& list{ insn_list[i] }; // Alternatives + + // apply specific heuristics to optimization case + if (list.size() == 2) { + if (list[0].addresses.size() == 1 && list[1].addresses.size() == 1) { + if (list[0].addresses[0].type == AddressType::Relative32 && list[1].addresses[0].type == AddressType::Relative8) { + if (IsSmallAddress(list[0])) { + list.pop_front(); + break; // start over from start of program + } + } + } + } + } + + if (changed) + insert_addresses(); // update + + } while (changed); + } + + std::vector<uint8_t> collect_code() + { + std::vector<uint8_t> result; + + // collect generated machine instructions for result + // Alternatives already resolved, if configured. Consider only 1st entry (no matter if optimized or not). + for (size_t i = 0; i < insn_list.size(); i++) { + InstructionCodeList& list{ insn_list[i] }; + if (list.size() == 0) + throw std::runtime_error("No instruction at index "s + std::to_string(i)); + + InstructionCode& code{ list[0] }; + + result.insert(result.end(), code.machine_code.begin(), code.machine_code.end()); + } + + return result; + } + +public: + Assembler() {} + + std::vector<uint8_t> assemble(std::vector<std::vector<Token>> tl) + { + labels.clear(); + insn_list.clear(); + + produce_machine_code(tl); // 1st pass + insert_addresses(); // 2nd pass + if (O1) { + optimize(); // 3rd pass + } + + return collect_code(); // 4th pass + } + +}; // class Assembler |