diff options
author | Roland Reichwein <mail@reichwein.it> | 2020-03-16 21:27:38 +0100 |
---|---|---|
committer | Roland Reichwein <mail@reichwein.it> | 2020-03-16 21:27:38 +0100 |
commit | 74350b52fee9f576a1cc71d99cfd4ebdf5a73e0f (patch) | |
tree | fbcc823f28f31b3671af3a1e01a109abae0dbb92 | |
parent | 9502989aec80e8f75cf14e7dd7d1d85333090866 (diff) |
Fixed lexer
-rwxr-xr-x | cppbnf.cpp | 2 | ||||
-rw-r--r-- | lexer.cpp | 62 | ||||
-rw-r--r-- | lexer.h | 14 |
3 files changed, 47 insertions, 31 deletions
@@ -220,7 +220,7 @@ BNF GetCppBNFLex() {"preprocessing-token", { {"header-name"}, - {"import-keyword"}, + //{"import-keyword"}, // TODO {"identifier"}, {"pp-number"}, {"character-literal"}, @@ -4,9 +4,8 @@ using namespace Lex; -size_t Lexer::newState(std::string state_type) +size_t Lexer::newState() { - m_state_types.push_back(state_type); return states++; } @@ -66,19 +65,19 @@ std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state) } // Helper function -void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type) +void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol) { if (isTerminal(m_bnf, symbol)) { // end recursion with new transition if (symbol.size() != 1) throw std::runtime_error("Bad sized terminal symbol: "s + symbol); addTransition(state0, state1, symbol[0]); } else { // recurse via non-terminal symbol - addPath(state0, state1, symbol, type); + addPath(state0, state1, symbol); } } // Helper function: add one rule -void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type) +void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol) { size_t previousState{state0}; @@ -88,27 +87,24 @@ void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from if (symbol == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); - size_t state{newState(type)}; - addPathOrTransition(previousState, state, symbol, type); + size_t state{newState()}; + addPathOrTransition(previousState, state, symbol); previousState = state; } if (list.back() == rule_symbol) throw std::runtime_error("Recursion found but not allowed"); // last transition - addPathOrTransition(previousState, state1, list.back(), type); + addPathOrTransition(previousState, state1, list.back()); } // Add paths between state0 and state1, including new states and transitions -void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type) +void Lexer::addPath(size_t state0, size_t state1, std::string s) { - if (type == "" && s != "" && s != m_top) - type = s; - // state0 -> [paths] -> state01 -> state1 // ^ v // [recursion paths] - size_t state01{newState(type)}; + size_t state01{newState()}; auto it {m_bnf.find(s)}; if (it == m_bnf.end()) @@ -120,9 +116,9 @@ void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string typ throw std::runtime_error("List too small in rule "s + s); if (list[0] == s) { // recursion rule - addRule(list, 1, list_size, state01, state01, s, type); + addRule(list, 1, list_size, state01, state01, s); } else { // non-recursion rule - addRule(list, 0, list_size, state0, state01, s, type); + addRule(list, 0, list_size, state0, state01, s); } } addTransition(state01, state1, 0); // empty transition to end @@ -154,14 +150,35 @@ void Lexer::removeEmptyTransitions() } } -Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()), m_endState(newState()) +Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top), m_startState(newState()) { - addPath(m_startState, m_endState, m_top, ""); + // types are the second level symbols, directly under top + auto it {m_bnf.find(m_top)}; + if (it == m_bnf.end()) + throw std::runtime_error("Start symbol "s + m_top + " not found."s); + + auto& list {it->second}; + + for (auto& element : list) { + if (element.size() != 1) + throw std::runtime_error("Bad type rule in "s + m_top + ": size = "s + std::to_string(element.size())); + + auto endState{newState()}; + std::string type{element[0]}; + m_state_types[endState] = type; + + addPath(m_startState, endState, type); + } replaceEmptyTransitions(); removeEmptyTransitions(); } +bool Lexer::isEndState(size_t state) +{ + return m_state_types.find(state) != m_state_types.end(); +} + Token Lexer::getToken(const std::string& s, Location& location) { Location oldLocation{location}; // start of token @@ -175,16 +192,16 @@ Token Lexer::getToken(const std::string& s, Location& location) // match as much as possible while (location.pos < s.size() && states.size() > 0) { char currentChar{s[location.pos]}; - std::cout << "DEBUG: Char: " << currentChar << std::endl; + //std::cout << "DEBUG: Char: " << currentChar << std::endl; for (const auto& state: states) { std::vector<std::pair<size_t, char>> successors{getSuccessors(state)}; for (const auto& [nextState, c]: successors) { if (c == currentChar) { - if (nextState == m_endState) { // save intermediate result upon match + if (isEndState(nextState)) { // save intermediate result upon match found = location; found.advance(); - state_type = m_state_types[state]; + state_type = m_state_types[nextState]; } else { newStates.push_back(nextState); } @@ -198,10 +215,9 @@ Token Lexer::getToken(const std::string& s, Location& location) std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)}; - if (found.pos > 0) - std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; - else + if (found.pos == 0) throw std::runtime_error("Bad Token at "s + oldLocation.toString()); + //else std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl; location = found; // reset to end of match @@ -18,23 +18,23 @@ class Lexer // Graph size_t states{}; // start, ... - std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0 - std::vector<std::string> m_state_types; + std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; // transitions: state -> {state,character}, ...; empty transition is marked by \0 + std::unordered_map<size_t, std::string> m_state_types; // only necessary for 2nd level symbol names size_t m_startState; - size_t m_endState; // Graph manipulation - size_t newState(std::string state_type = ""); + size_t newState(); + bool isEndState(size_t state); void addTransition(size_t state0, size_t state1, char c); void removeTransition(size_t state0, size_t state1, char c); std::vector<std::pair<size_t, char>> getSuccessorsViaEmpty(size_t state); std::vector<std::pair<size_t, char>> getSuccessors(size_t state); // Build up automaton, recursively - void addPath(size_t state0, size_t state1, std::string s, std::string type); - void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type); - void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type); + void addPath(size_t state0, size_t state1, std::string s); + void addPathOrTransition(size_t state0, size_t state1, std::string symbol); + void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol); Token getToken(const std::string& s, Location& location); void skipWhitespace(const std::string& s, Location& location); |