From 74350b52fee9f576a1cc71d99cfd4ebdf5a73e0f Mon Sep 17 00:00:00 2001 From: Roland Reichwein Date: Mon, 16 Mar 2020 21:27:38 +0100 Subject: Fixed lexer --- cppbnf.cpp | 2 +- lexer.cpp | 62 +++++++++++++++++++++++++++++++++++++++----------------------- lexer.h | 14 +++++++------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/cppbnf.cpp b/cppbnf.cpp index 4818fc3..fd246ba 100755 --- a/cppbnf.cpp +++ b/cppbnf.cpp @@ -220,7 +220,7 @@ BNF GetCppBNFLex() {"preprocessing-token", { {"header-name"}, - {"import-keyword"}, + //{"import-keyword"}, // TODO {"identifier"}, {"pp-number"}, {"character-literal"}, diff --git a/lexer.cpp b/lexer.cpp index 9ef29b1..0944738 100644 --- a/lexer.cpp +++ b/lexer.cpp @@ -4,9 +4,8 @@ using namespace Lex; -size_t Lexer::newState(std::string state_type) +size_t Lexer::newState() { - m_state_types.push_back(state_type); return states++; } @@ -66,19 +65,19 @@ std::vector> Lexer::getSuccessors(size_t state) } // Helper function -void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol) { if (isTerminal(m_bnf, symbol)) { // end recursion with new transition if (symbol.size() != 1) throw std::runtime_error("Bad sized terminal symbol: "s + symbol); addTransition(state0, state1, symbol[0]); } else { // recurse via non-terminal symbol - addPath(state0, state1, symbol, type); + addPath(state0, state1, symbol); } } // Helper function: add one rule -void Lexer::addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +void Lexer::addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol) { size_t previousState{state0}; @@ -88,27 +87,24 @@ void Lexer::addRule(const std::vector& list, size_t list_index_from if (symbol == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); - size_t state{newState(type)}; - addPathOrTransition(previousState, state, symbol, type); + size_t state{newState()}; + addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); // last transition - addPathOrTransition(previousState, state1, list.back(), type); + addPathOrTransition(previousState, state1, list.back()); } // Add paths between state0 and state1, including new states and transitions -void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +void Lexer::addPath(size_t state0, size_t state1, std::string s) { - if (type == "" && s != "" && s != m_top) - type = s; - // state0 -> [paths] -> state01 -> state1 // ^ v // [recursion paths] - size_t state01{newState(type)}; + size_t state01{newState()}; auto it {m_bnf.find(s)}; if (it == m_bnf.end()) @@ -120,9 +116,9 @@ void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string typ throw std::runtime_error("List too small in rule "s + s); if (list[0] == s) { // recursion rule - addRule(list, 1, list_size, state01, state01, s, type); + addRule(list, 1, list_size, state01, state01, s); } else { // non-recursion rule - addRule(list, 0, list_size, state0, state01, s, type); + addRule(list, 0, list_size, state0, state01, s); } } addTransition(state01, state1, 0); // empty transition to end @@ -154,14 +150,35 @@ void Lexer::removeEmptyTransitions() } } -Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()), m_endState(newState()) +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()) { - addPath(m_startState, m_endState, m_top, ""); + // types are the second level symbols, directly under top + auto it {m_bnf.find(m_top)}; + if (it == m_bnf.end()) + throw std::runtime_error("Start symbol "s + m_top + " not found."s); + + auto& list {it->second}; + + for (auto& element : list) { + if (element.size() != 1) + throw std::runtime_error("Bad type rule in "s + m_top + ": size = "s + std::to_string(element.size())); + + auto endState{newState()}; + std::string type{element[0]}; + m_state_types[endState] = type; + + addPath(m_startState, endState, type); + } replaceEmptyTransitions(); removeEmptyTransitions(); } +bool Lexer::isEndState(size_t state) +{ + return m_state_types.find(state) != m_state_types.end(); +} + Token Lexer::getToken(const std::string& s, Location& location) { Location oldLocation{location}; // start of token @@ -175,16 +192,16 @@ Token Lexer::getToken(const std::string& s, Location& location) // match as much as possible while (location.pos < s.size() && states.size() > 0) { char currentChar{s[location.pos]}; - std::cout << "DEBUG: Char: " << currentChar << std::endl; + //std::cout << "DEBUG: Char: " << currentChar << std::endl; for (const auto& state: states) { std::vector> successors{getSuccessors(state)}; for (const auto& [nextState, c]: successors) { if (c == currentChar) { - if (nextState == m_endState) { // save intermediate result upon match + if (isEndState(nextState)) { // save intermediate result upon match found = location; found.advance(); - state_type = m_state_types[state]; + state_type = m_state_types[nextState]; } else { newStates.push_back(nextState); } @@ -198,10 +215,9 @@ Token Lexer::getToken(const std::string& s, Location& location) std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; - if (found.pos > 0) - std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; - else + if (found.pos == 0) throw std::runtime_error("Bad Token at "s + oldLocation.toString()); + //else std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; location = found; // reset to end of match diff --git a/lexer.h b/lexer.h index a033cbc..aa724bb 100644 --- a/lexer.h +++ b/lexer.h @@ -18,23 +18,23 @@ class Lexer // Graph size_t states{}; // start, ... - std::unordered_map>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0 - std::vector m_state_types; + std::unordered_map>> transitions; // transitions: state -> {state,character}, ...; empty transition is marked by \0 + std::unordered_map m_state_types; // only necessary for 2nd level symbol names size_t m_startState; - size_t m_endState; // Graph manipulation - size_t newState(std::string state_type = ""); + size_t newState(); + bool isEndState(size_t state); void addTransition(size_t state0, size_t state1, char c); void removeTransition(size_t state0, size_t state1, char c); std::vector> getSuccessorsViaEmpty(size_t state); std::vector> getSuccessors(size_t state); // Build up automaton, recursively - void addPath(size_t state0, size_t state1, std::string s, std::string type); - void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type); - void addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type); + void addPath(size_t state0, size_t state1, std::string s); + void addPathOrTransition(size_t state0, size_t state1, std::string symbol); + void addRule(const std::vector& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol); Token getToken(const std::string& s, Location& location); void skipWhitespace(const std::string& s, Location& location); -- cgit v1.2.3