// Intel assembly language // segments: code, stack #include "minicc.h" #include #include #include #include #include #include #include #include #include #include using namespace std::string_literals; using namespace std::placeholders; namespace { // binary code operators std::vector operator+(std::vector a, const std::vector& b) { a.insert(a.end(), b.begin(), b.end()); return a; } std::vector operator+(std::vector a, const uint8_t& b) { a.push_back(b); return a; } // REX prefix: 0b0100WRXB std::vector REX(std::string s) { uint8_t result{0b01000000}; if (s == "W") result |= 0b00001000; if (s == "R") result |= 0b00000100; if (s == "X") result |= 0b00000010; if (s == "B") result |= 0b00000001; return { result }; } std::vector imm8(std::string s) { long value{ std::stol(s) }; uint8_t* bin = reinterpret_cast(&value); return { uint8_t(*bin & 0xFF) }; } std::vector imm32(std::string s) { long value{ std::stol(s) }; uint32_t* bin = reinterpret_cast(&value); return {uint8_t(*bin & 0xFF), uint8_t(*bin >> 8 & 0xFF), uint8_t(*bin >> 16 & 0xFF), uint8_t(*bin >> 24 & 0xFF) }; } std::unordered_map IndexOfRegister{ {"al", 0}, {"ah", 4}, {"bl", 3}, {"bh", 7}, {"cl", 1}, {"ch", 5}, {"dl", 2}, {"dh", 6}, {"ax", 0}, {"sp", 4}, {"bx", 3}, {"bp", 7}, {"cx", 1}, {"si", 5}, {"dx", 2}, {"di", 6}, {"eax", 0}, {"esp", 4}, {"ebx", 3}, {"ebp", 7}, {"ecx", 1}, {"esi", 5}, {"edx", 2}, {"edi", 6}, }; // Manual, page 530 // Reg + Reg/Memory uint8_t ModRM(std::string reg, std::string rm) { // TODO: extend uint8_t result{0b11000000}; auto index1{ IndexOfRegister.find(reg) }; if (index1 == IndexOfRegister.end()) throw std::runtime_error("Unknown register for arg1: "s + reg); result |= (index1->second << 3); auto index2{ IndexOfRegister.find(rm) }; if (index2 == IndexOfRegister.end()) throw std::runtime_error("Unknown register for arg2: "s + rm); result |= index2->second; return result; } enum class AddressType { Relative8, Relative16, Relative32, Absolute8, Absolute16, Absolute32, }; struct Address { AddressType type; size_t position; // relative to respective machine code, e.g. byte 1 in jump std::string label; // where to jump to, as label }; struct InstructionCode { std::vector machine_code; std::vector
addresses; }; // List of alternative codes typedef std::deque InstructionCodeList; bool O1{ true }; // Optimization using OP_T = std::vector; InstructionCodeList op_jmp(const std::vector& sl, std::vector op_bytes_8, std::vector op_bytes_32) { if (sl.size() == 2) { // JMP rel8 / rel32 const std::string& label{ sl[1].value }; InstructionCodeList result; if (op_bytes_32.size() > 0) { op_bytes_32.resize(op_bytes_32.size() + 4, 0x00); result.push_back({ op_bytes_32, { {AddressType::Relative32, op_bytes_32.size() - 4, label} } } ); } if (op_bytes_8.size() > 0 && (O1 || op_bytes_32.size() == 0)) { op_bytes_8.push_back(0x00); result.push_back({ op_bytes_8, { {AddressType::Relative8, op_bytes_8.size() - 1, label} } }); } return result; } // ... TODO throw std::runtime_error("Unknown command: "s + sl[0].value); } std::unordered_map&)>> ops{ // Integer Addition {"add", [](const std::vector& sl) -> InstructionCodeList { if (sl.size() == 3) { if (sl[1].value == "eax") { // ADD EAX, imm32 return { { std::vector{ 0x05 } +imm32(sl[2].value), {} } }; } else if (sl[1].value == "rax") { // ADD RAX, imm32 return { { REX("W") + std::vector{ 0x05 } +imm32(sl[2].value), {} } }; } } // ... TODO throw std::runtime_error("Unknown command: "s + sl[0].value); }}, // Call Procedure {"call", std::bind(op_jmp, _1, OP_T{}, OP_T{ 0xE8 })}, // Interrupt {"int", [](const std::vector& sl) -> InstructionCodeList { if (sl.size() == 2) { if (sl[1].value == "0") { // INT 0 return { { std::vector{ 0xCE }} }; } else if (sl[1].value == "1") { // INT 1 return { { std::vector{ 0xF1 }} }; } else if (sl[1].value == "3") { // INT 3 return { { std::vector{ 0xCC }} }; } else { // INT <...> return { { std::vector{ 0xCD } +imm8(sl[2].value) } }; } } // ... TODO throw std::runtime_error("Unknown command: "s + sl[0].value); }}, // Unconditional Jump {"jmp", std::bind(op_jmp, _1, OP_T{ 0xEB }, OP_T{ 0xE9 })}, // Conditional Jumps {"ja", std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, {"jae", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, {"jb", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, {"jbe", std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, {"jc", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, {"jecxz", std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, {"jrcxz", std::bind(op_jmp, _1, OP_T{ 0xE3 }, OP_T{})}, {"je", std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, {"jg", std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, {"jge", std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, {"jl", std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, {"jle", std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, {"jna", std::bind(op_jmp, _1, OP_T{ 0x76 }, OP_T{ 0x0F, 0x86 })}, {"jnae", std::bind(op_jmp, _1, OP_T{ 0x72 }, OP_T{ 0x0F, 0x82 })}, {"jnb", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, {"jnbe", std::bind(op_jmp, _1, OP_T{ 0x77 }, OP_T{ 0x0F, 0x87 })}, {"jnc", std::bind(op_jmp, _1, OP_T{ 0x73 }, OP_T{ 0x0F, 0x83 })}, {"jne", std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, {"jng", std::bind(op_jmp, _1, OP_T{ 0x7E }, OP_T{ 0x0F, 0x8E })}, {"jnge", std::bind(op_jmp, _1, OP_T{ 0x7C }, OP_T{ 0x0F, 0x8C })}, {"jnl", std::bind(op_jmp, _1, OP_T{ 0x7D }, OP_T{ 0x0F, 0x8D })}, {"jnle", std::bind(op_jmp, _1, OP_T{ 0x7F }, OP_T{ 0x0F, 0x8F })}, {"jno", std::bind(op_jmp, _1, OP_T{ 0x71 }, OP_T{ 0x0F, 0x81 })}, {"jnp", std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, {"jns", std::bind(op_jmp, _1, OP_T{ 0x79 }, OP_T{ 0x0F, 0x89 })}, {"jnz", std::bind(op_jmp, _1, OP_T{ 0x75 }, OP_T{ 0x0F, 0x85 })}, {"jo", std::bind(op_jmp, _1, OP_T{ 0x70 }, OP_T{ 0x0F, 0x80 })}, {"jp", std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, {"jpe", std::bind(op_jmp, _1, OP_T{ 0x7A }, OP_T{ 0x0F, 0x8A })}, {"jpo", std::bind(op_jmp, _1, OP_T{ 0x7B }, OP_T{ 0x0F, 0x8B })}, {"js", std::bind(op_jmp, _1, OP_T{ 0x78 }, OP_T{ 0x0F, 0x88 })}, {"jz", std::bind(op_jmp, _1, OP_T{ 0x74 }, OP_T{ 0x0F, 0x84 })}, // Memory Move { "mov", [](const std::vector& sl) -> InstructionCodeList { if (sl.size() == 3) { return { { std::vector{ 0x88 } + ModRM(sl[2].value, sl[1].value), {} } }; // r/m8, r8: ModRM:r/m (w), ModRM:reg (r) } // ... TODO throw std::runtime_error("Unknown command: "s + sl[0].value); }}, // No Operation { "nop", [](const std::vector& sl) -> InstructionCodeList { return {{ std::vector{ 0x90 }, {}}}; }}, // Return from procedure { "ret", [](const std::vector& sl) -> InstructionCodeList { return {{ std::vector{ 0xC3 }, {}}}; // near return; TODO: far return is 0xCB }}, { "xor", [](const std::vector& sl) -> InstructionCodeList { if (sl.size() == 3) { return { { std::vector{ 0x33 } + ModRM(sl[1].value, sl[2].value) } }; // r8, r/m8: ModRM:reg (w), ModRM:r/m (r) } // ... TODO throw std::runtime_error("Unknown command: "s + sl[0].value); }}, }; #if 0 prefixes{ "lock", 0xf0, // branch hint 0x2e, "branch not taken" 0x3e, "branch taken" 0x66, "operand size override" // switch between 16 and 32 bit operands 0x67, "address size override" // switch between 16 and 32 bit addresses }; }; #endif #ifdef ASM_PARSER BNF GetBNF() { // TODO: return { { "assembler-unit", { {} }}, { "immediate-32", { {} }}, { "mnemonic", { {} }}, { "register", { {} }}, { "register-8", { {} }}, { "register-16", { {} }}, { "register-32", { {} }}, { "register-64", { {} }}, }; }; #endif // Checks a 32 bit relative address if it's valid as 8 bit address bool IsSmallAddress(const InstructionCode& insn) { if (insn.addresses.size() != 1) throw std::runtime_error("Bad number of addresses in insn"); size_t i{insn.addresses[0].position}; if (i > insn.machine_code.size() - 3) throw std::runtime_error("Bad Address index "s + std::to_string(i) + " in insn with "s + std::to_string(insn.machine_code.size()) + " bytes"s); if (std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0x00) == 3 || std::count(insn.machine_code.begin() + i, insn.machine_code.begin() + i + 3, 0xFF) == 3) return true; return false; } } // namespace class Assembler { std::unordered_map labels; ///< labels with their positions in instruction list /// 1st Level: Instructions /// 2nd Level: Alternatives /// 3rd Level: Bytes of single instruction std::vector insn_list; uint64_t addressFromInstructionIndex(size_t index) { // TODO: cache this to prevent repetitive summing if (index > insn_list.size()) throw std::runtime_error("Index "s + std::to_string(index) + " out of range ("s + std::to_string(insn_list.size()) + ")"s); uint64_t sum{}; for (size_t i = 0; i < index; i++) { if (insn_list[i].size() < 1) { throw std::runtime_error("Insufficient alternatives at index "s + std::to_string(i)); } sum += static_cast(insn_list[i][0].machine_code.size()); } return sum; } uint64_t addressFromLabel(std::string label) { auto it{ labels.find(label) }; if (it == labels.end()) throw std::runtime_error("Label not found: "s + label); return addressFromInstructionIndex(it->second); } std::unordered_map&, const Address&, uint64_t)>> addressInserters{ {AddressType::Relative8, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) { int64_t difference = static_cast(addressFromLabel(target_address.label)) - insn_address; if (difference < -128 || difference > 127) throw std::runtime_error("Distance too big"); int8_t diff8 = static_cast(difference); uint8_t diff_u8 = *reinterpret_cast(&diff8); machine_code[target_address.position] = diff_u8; } }, {AddressType::Relative16, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) { throw std::runtime_error("Relative16 Address not yet supported."); }}, {AddressType::Relative32, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) { int64_t difference = static_cast(addressFromLabel(target_address.label)) - insn_address; if (difference < -4294967296 || difference > 4294967295) throw std::runtime_error("Distance too big"); int32_t diff32 = static_cast(difference); uint32_t diff_u32 = *reinterpret_cast(&diff32); machine_code[target_address.position] = diff_u32 & 0xFF; // little endian machine_code[target_address.position + 1] = diff_u32 >> 8 & 0xFF; machine_code[target_address.position + 2] = diff_u32 >> 16 & 0xFF; machine_code[target_address.position + 3] = diff_u32 >> 24 & 0xFF; } }, {AddressType::Absolute8, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute8 Address not yet supported."); }}, {AddressType::Absolute16, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute16 Address not yet supported."); }}, {AddressType::Absolute32, [&](std::vector& machine_code, const Address& target_address, uint64_t insn_address) {throw std::runtime_error("Absolute32 Address not yet supported."); }}, }; void produce_machine_code(std::vector>& tl) { for (const auto& t : tl) { // label: // label: mnemonic arg1, arg2, arg3 // mnemonic arg1, arg2, arg3 if (t.size() == 2 && t[0].type == "label" && t[1].type == ":") { // label if (labels.find(t[0].value) != labels.end()) throw std::runtime_error("Label already defined: "s + t[0].value); labels[t[0].value] = insn_list.size(); } else if (t.size() >= 1 && t[0].type == "instruction") { // instruction std::string instruction{ t[0].value }; auto it = ops.find(instruction); if (it == ops.end()) throw std::runtime_error("Unknown instruction: "s + instruction); InstructionCodeList codes = it->second(t); if (codes.size() == 0) throw std::runtime_error("No instruction generated"); insn_list.push_back(codes); } else throw std::runtime_error("Syntax error"s); } } void insert_addresses() { for (size_t i = 0; i < insn_list.size(); i++) { InstructionCodeList& list{ insn_list[i] }; if (list.size() == 0) throw std::runtime_error("No instruction at index "s + std::to_string(i)); InstructionCode& code{ list[0] }; for (const auto& address : code.addresses) { addressInserters[address.type](code.machine_code, address, addressFromInstructionIndex(i)); } } } void optimize() { // reduce Jump sizes via alternatives if possible bool changed{}; do { changed = false; for (size_t i = 0; i < insn_list.size(); i++) { InstructionCodeList& list{ insn_list[i] }; // Alternatives // apply specific heuristics to optimization case if (list.size() == 2) { if (list[0].addresses.size() == 1 && list[1].addresses.size() == 1) { if (list[0].addresses[0].type == AddressType::Relative32 && list[1].addresses[0].type == AddressType::Relative8) { if (IsSmallAddress(list[0])) { list.pop_front(); break; // start over from start of program } } } } } if (changed) insert_addresses(); // update } while (changed); } std::vector collect_code() { std::vector result; // collect generated machine instructions for result // Alternatives already resolved, if configured. Consider only 1st entry (no matter if optimized or not). for (size_t i = 0; i < insn_list.size(); i++) { InstructionCodeList& list{ insn_list[i] }; if (list.size() == 0) throw std::runtime_error("No instruction at index "s + std::to_string(i)); InstructionCode& code{ list[0] }; result.insert(result.end(), code.machine_code.begin(), code.machine_code.end()); } return result; } public: Assembler() {} std::vector assemble(std::vector> tl) { labels.clear(); insn_list.clear(); produce_machine_code(tl); // 1st pass insert_addresses(); // 2nd pass if (O1) { optimize(); // 3rd pass } return collect_code(); // 4th pass } }; // class Assembler