#include "cpp.h" #include "flowgraph/node.h" #include "bnf.h" #include "cppbnf.h" #include "debug.h" #include "lexer.h" #include "grammer.h" #include "minicc.h" #include #include #include #include #include #include #include using namespace Gram; namespace fs = std::filesystem; CPP::CPP(){} CPP::~CPP(){} // Phase 1: Map physical character set to basic source character set void CPP::source_charset_map() { // TODO } // Phase 2: Escape backslashed line endings void CPP::backslash_escape() { // TODO } // Phase 3: Parse preprocessing tokens, TODO: discard comments std::vector CPP::preprocessing_tokenize(const std::string& s) { auto bnf{SubBNF(CPPBNF::GetCppBNFLex(), "preprocessing-token")}; Lex::Lexer lexer(bnf, "preprocessing-token"); return lexer.Lex(s); } // Phase 4: Preprocessing void CPP::preprocess() { // TODO } // Phase 5: Map chars and strings to execution charset void CPP::execution_charset_map() { // TODO } // Phase 6: Concatenate adjacent string literals void CPP::concatenate_strings() { // TODO } std::string CPP::valueOfNode(index_t node_index) const { std::string result; std::optional pos0; index_t last_index; std::vector todo(1, int32_t(node_index)); while (!todo.empty()) { int32_t current_index = todo.back(); todo.pop_back(); // visit node if token if (ChildIdIsToken(current_index)) { if (!pos0) { pos0 = m_tokens[TokenIdFromChildId(current_index)].location.pos; } last_index = TokenIdFromChildId(current_index); } else { const TreeNode &node{m_nodes[current_index]}; // iterate backwards in childs, to get depth-first search in tree, from the beginning std::for_each(node.child_ids.rbegin(), node.child_ids.rend(), [&](int32_t child){ todo.push_back(child); }); } } if (!pos0) throw std::runtime_error("ICE: Node value not available"); return m_code.substr(*pos0, m_tokens[last_index].location.pos - *pos0) + m_tokens[last_index].value; }; std::string CPP::typeOfNode(index_t node_id) const { if (node_id >= m_nodes.size()) throw std::runtime_error("CPP::typeOfNode(): node_id="s + std::to_string(node_id) + ", m_nodes.size()="s + std::to_string(m_nodes.size())); return m_nodes[node_id].type; } namespace { std::unordered_set pp_types { "identifier", "pp-number", "character-literal", "user-defined-character-literal", "string-literal", "user-defined-string-literal", "preprocessing-op-or-punc" }; std::unordered_set keywords { "alignas", "alignof", "asm", "auto", "bool", "break", "case", "catch", "char", "char8_t", "char16_t", "char32_t", "class", "concept", "const", "consteval", "constexpr", "constinit", "const_cast", "continue", "co_await", "co_return", "co_yield", "decltype", "default", "delete", "do", "double", "dynamic_cast", "else", "enum", "explicit", "export", "extern", "false", "float", "for", "friend", "goto", "if", "inline", "int", "long", "mutable", "namespace", "new", "noexcept", "nullptr", "operator", "private", "protected", "public", "register", "reinterpret_cast", "requires", "return", "short", "signed", "sizeof", "static", "static_assert", "static_cast", "struct", "switch", "template", "this", "thread_local", "throw", "true", "try", "typedef", "typeid", "typename", "union", "unsigned", "using", "virtual", "void", "volatile", "wchar_t", "while", }; } // Phase 7.a: Create tokens from preprocessing tokens std::vector CPP::tokens_from_pptokens(const std::vector& pp_tokens) { std::vector result; // "identifier" + value -> "identifier" + value, except identifiers from table 5.11, p.14 -> keyword as value, value // "pp-number" + value -> "literal" + value // "character-literal" -> 
"literal" + value // "user-defined-character-literal" -> "literal" + value // "string-literal" -> "literal" + value // "user-defined-string-literal" -> "literal" + value // "preprocessing-op-or-punc" -> value+value (operator,punctuator) for (auto& token: pp_tokens) { if (pp_types.find(token.type) != pp_types.end()) { if (token.type == "identifier") { if (keywords.find(token.value) != keywords.end()) result.emplace_back(Token{token.value, token.value, token.location}); else result.emplace_back(Token{"identifier"s, token.value, token.location}); } else if (token.type == "preprocessing-op-or-punc") result.emplace_back(Token{token.value, token.value, token.location}); else result.emplace_back(Token{"literal", token.value, token.location}); } else throw std::runtime_error("Unhandled preprocessing token: "s + token.toString()); } return result; } // Phase 7.b: Grammar Analysis std::vector CPP::analysis(const std::vector& tokens) { auto bnf = SubBNF(CPPBNF::GetCppBNFGram(), "translation-unit"); Gram::Compiler compiler(bnf, "translation-unit"); return compiler.compile(tokens); } std::string CPP::locationOfNode(index_t node_index) const { if (node_index >= m_nodes.size()) throw std::runtime_error("ICE: locationOfNode(): Bad node_index "s + std::to_string(node_index) + " vs. "s + std::to_string(m_nodes.size())); for (const auto& child_id: m_nodes[node_index].child_ids) { if (ChildIdIsNode(child_id)) { std::string location{locationOfNode(child_id)}; if (location.size() > 0) return location; } else { // ChildIdIsToken return m_tokens[TokenIdFromChildId(child_id)].location.toString(); } } return ""; // not found } void CPP::compileError(index_t node_id, const std::string& msg) const { std::string location{locationOfNode(node_id)}; if (location.size() == 0) location = "Node "s + std::to_string(node_id) + "doesn't have any token"; throw std::runtime_error("Compile error: "s + location + ": "s + msg); } std::string CPP::typeOfChild(int32_t child_id) const { if (ChildIdIsToken(child_id)) { index_t token_id {TokenIdFromChildId(child_id)}; return m_tokens[token_id].type; } else { // ChildIdIsNode return m_nodes[child_id].type; } } bool CPP::childTypesOfNodeMatch(index_t node_id, const std::vector& pattern) const { const std::vector& child_ids{m_nodes[node_id].child_ids}; if (child_ids.size() != pattern.size()) return false; // mismatch for (size_t i = 0; i < child_ids.size(); i++) { if (pattern[i] != "" && typeOfChild(child_ids[i]) != pattern[i]) return false; // mismatch } return true; // match } //std::unordered_map> // precondition: stack contains child values c1, ..., cn on top -> to be popped // postcondition: stack contains value on top -> to be pushed void CPP::getValueOfNode(index_t index) { size_t num_childs {m_nodes[index].child_ids.size()}; if (mValues.size() < num_childs) throw std::runtime_error("Expected num_childs elements on Values stack at "s + locationOfNode(index)); std::shared_ptr result {nullptr}; mValues.resize(mValues.size() - num_childs); mValues.push_back(result); } // pushes result onto stack void CPP::getValueOfToken(index_t index) { if (m_tokens[index].type == "literal") { FlowGraph::Data data{FlowGraph::MakeConstantInt(stoi(m_tokens[index].value))}; mValues.push_back(std::make_shared(data)); } else { mValues.push_back(std::make_shared(nullptr)); } } void CPP::visitRecursive(index_t node_id) { const auto& childs {m_nodes[node_id].child_ids}; for (const auto child: childs) { if (ChildIdIsNode(child)) { visitRecursive(child); } else { getValueOfToken(TokenIdFromChildId(child)); } } 
getValueOfNode(node_id); } // Phase 7.c: Translate void CPP::translate() { if (m_nodes.size() == 0) throw std::runtime_error("ICE: Tree is empty"); // TODO: this could be implemented iteratively, via a stack? visitRecursive(0); } // Phase 8: Instantiate objects void CPP::instantiate() { // TODO } // Phase 9: Link libraries void CPP::link() { // TODO } // phases of translation, according to standard void CPP::compile(const std::string& code) { m_code = code; source_charset_map(); // phase 1 backslash_escape(); // phase 2 auto pp_tokens = preprocessing_tokenize(code); // phase 3 preprocess(); // phase 4 execution_charset_map(); // phase 5 concatenate_strings(); // phase 6 m_tokens = tokens_from_pptokens(pp_tokens); // phase 7a m_nodes = analysis(m_tokens); // phase 7b translate(); // phase 7c instantiate(); // phase 8 link(); // phase 9 } std::vector CPP::getCode() { // TODO return { 0x48, 0xc7, 0xc0, 0x3c, 0x00, 0x00, 0x00, // mov $0x3c,%rax # syscall 60 0x48, 0x31, 0xff, // xor %rdi,%rdi # exit code 0 0x0f, 0x05, // syscall }; } std::vector CPP::getData() { return {}; // TODO }
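// Minimal usage sketch (illustrative only): drive the translation phases via
// compile() and collect the emitted segments. The example source string is
// hypothetical, and getCode()/getData() still return fixed placeholder output.
//
//   CPP cpp;
//   cpp.compile("int main() { return 0; }");
//   std::vector<uint8_t> text = cpp.getCode(); // machine code ("exit 0" stub for now)
//   std::vector<uint8_t> data = cpp.getData(); // initialized data (empty for now)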