summaryrefslogtreecommitdiffhomepage
path: root/lexer.cpp
diff options
context:
space:
mode:
authorRoland Reichwein <mail@reichwein.it>2020-03-15 18:19:49 +0100
committerRoland Reichwein <mail@reichwein.it>2020-03-15 18:19:49 +0100
commit9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 (patch)
tree6ac42793568339463f913cf39474794c8613d0b6 /lexer.cpp
parent3a7006fcf5f8ecffd852fbba6d8ee03ce8a35dce (diff)
New lexer
Diffstat (limited to 'lexer.cpp')
-rw-r--r--lexer.cpp166
1 files changed, 165 insertions, 1 deletions
diff --git a/lexer.cpp b/lexer.cpp
index 4a458c3..61caae1 100644
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -2,14 +2,178 @@
using namespace Lex;
-Lexer::Lexer(const BNF& bnf, const std::string& Top)
+size_t Lexer::newState(std::string state_type)
{
+ m_state_types.push_back(state_type);
+ return states++;
+}
+
+void Lexer::addTransition(size_t state0, size_t state1, char c)
+{
+ auto it{transitions.find(state0)};
+
+ if (it == transitions.end()) { // new entry is a list with 1 entry
+ transitions[state0] = {{state1, c}};
+ } else { // extend list entry
+ it->second.push_back({state1, c});
+ }
+}
+
+std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state)
+{
+ std::vector<std::pair<size_t, char>> result;
+
+ auto it{transitions.find(state)};
+
+ if (it != transitions.end()) { // new list entry
+ for (auto &i: it->second) {
+ if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char
+ result.push_back(i);
+ } else { // follow empty transition
+ auto successors { getSuccessors(i.first) };
+ result.insert(result.end(), successors.begin(), successors.end());
+ }
+ }
+ }
+
+ return result;
+}
+
+// Helper function
+void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type)
+{
+ if (isTerminal(m_bnf, symbol)) { // end recursion with new transition
+ if (symbol.size() != 1)
+ throw std::runtime_error("Bad sized terminal symbol: "s + symbol);
+ addTransition(state0, state1, symbol[0]);
+ } else { // recurse via non-terminal symbol
+ addPath(state0, state1, symbol, type);
+ }
+}
+
+// Helper function: add one rule
+void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type)
+{
+ size_t previousState{state0};
+
+ // add intermediate states with transitions
+ for (size_t i = list_index_from; i < list_index_to - 1; i++) {
+ std::string symbol{list[i]};
+ if (symbol == rule_symbol)
+ throw std::runtime_error("Recursion found but not allowed");
+
+ size_t state{newState(type)};
+ addPathOrTransition(previousState, state, symbol, type);
+ previousState = state;
+ }
+ if (list.back() == rule_symbol)
+ throw std::runtime_error("Recursion found but not allowed");
+
+ // last transition
+ addPathOrTransition(previousState, state1, list.back(), type);
+}
+
+// Add paths between state0 and state1, including new states and transitions
+void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type)
+{
+ if (type == "" && s != "" && s != m_top)
+ type = s;
+
+ // state0 -> [paths] -> state01 -> state1
+ // ^ v
+ // [recursion paths]
+ size_t state01{newState(type)};
+ auto it {m_bnf.find(s)};
+
+ if (it == m_bnf.end())
+ throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s);
+
+ for (auto& list: it->second) { // for every path between state0 and state1
+ size_t list_size{list.size()};
+ if (list_size < 1)
+ throw std::runtime_error("List too small in rule "s + s);
+
+ if (list[0] == s) { // recursion rule
+ addRule(list, 1, list_size, state01, state01, s, type);
+ } else { // non-recursion rule
+ addRule(list, 0, list_size, state0, state01, s, type);
+ }
+ }
+ addTransition(state01, state1, 0); // empty transition to end
+}
+
+Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top)
+ , m_startState(newState()), m_endState(newState())
+{
+ addPath(m_startState, m_endState, m_top, "");
+}
+
+Token Lexer::getToken(const std::string& s, Location& location)
+{
+ Location oldLocation{location}; // start of token
+
+ std::vector<size_t> states{m_startState}; // can be in multiple states at once
+ std::vector<size_t> newStates;
+
+ Location found;
+ std::string state_type;
+
+ // match as much as possible
+ while (location.pos < s.size() && states.size() > 0) {
+ char currentChar{s[location.pos]};
+ std::cout << "DEBUG: Char: " << currentChar << std::endl;
+
+ for (const auto& state: states) {
+ std::vector<std::pair<size_t, char>> successors{getSuccessors(state)};
+ for (const auto& [nextState, c]: successors) {
+ if (c == currentChar) {
+ newStates.push_back(nextState);
+ if (nextState == m_endState) { // save intermediate result upon match
+ found = location;
+ found.advance();
+ state_type = m_state_types[state];
+ }
+ } else if (nextState == m_endState) { // save intermediate result w/o match of c
+ found = location;
+ state_type = m_state_types[state];
+ }
+ }
+ }
+ states = newStates;
+ newStates.clear();
+ location.advance(currentChar == '\n');
+ }
+
+ std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)};
+
+ if (found.pos > 0)
+ std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl;
+ else
+ throw std::runtime_error("Tokenize error at "s + oldLocation.toString());
+
+ location = found; // reset to end of match
+
+ return {state_type, value, oldLocation};
+}
+
+void Lexer::skipWhitespace(const std::string& s, Location& location)
+{
+ while (location.pos < s.size() && std::isspace(s[location.pos])) {
+ location.advance(s[location.pos] == '\n');
+ }
}
std::vector<Token> Lexer::Lex(const std::string& s)
{
std::vector<Token> result;
+ Location location;
+ skipWhitespace(s, location);
+ while (location.pos < s.size()) {
+ result.emplace_back(getToken(s, location));
+ skipWhitespace(s, location);
+ }
+
return result;
}