New lexer

author: Roland Reichwein <mail@reichwein.it> 2020-03-15 18:19:49 +0100
committer: Roland Reichwein <mail@reichwein.it> 2020-03-15 18:19:49 +0100
commit: 9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 (patch)
tree: 6ac42793568339463f913cf39474794c8613d0b6
parent: 3a7006fcf5f8ecffd852fbba6d8ee03ce8a35dce (diff)
9 files changed, 241 insertions, 19 deletions
diff --git a/bnf.cpp b/bnf.cpp
index 4dd896e..32e1175 100644
--- a/bnf.cpp
+++ b/bnf.cpp
@@ -68,3 +68,8 @@ BNF SubBNF(const BNF& bnf, const std::string& top)
  return result;
 }
 
+bool isTerminal(const BNF& bnf, const std::string& symbol)
+{
+ return bnf.find(symbol) == bnf.end();
+}
+
diff --git a/bnf.h b/bnf.h
index 13337bf..9244b4d 100644
--- a/bnf.h
+++ b/bnf.h
@@ -15,3 +15,5 @@ std::unordered_map<std::string, std::set<std::string>> Reverse(BNF bnf); // unus
 std::unordered_map<std::string, std::set<std::string>> reverseFirst(BNF bnf);
 
 BNF SubBNF(const BNF& bnf, const std::string& top);
+
+bool isTerminal(const BNF& bnf, const std::string& symbol);
diff --git a/cpp.cpp b/cpp.cpp
index 714045b..3d8c20d 100644
--- a/cpp.cpp
+++ b/cpp.cpp
@@ -247,12 +247,14 @@ TEST_F(CppTest, preprocessing_tokenize) {
 }
 #endif
 
+#if 0
 TEST_F(CppTest, preprocessing_tokenize2) {
  CPP cpp;
  auto ppTree = cpp.preprocessing_tokenize("in ma");
 
  cpp.tokens_from_pptokens(ppTree);
 }
+#endif
 
 #if 0
 TEST(Cpp, translate) {
diff --git a/cppbnf.cpp b/cppbnf.cpp
index 0b6e58f..4818fc3 100755
--- a/cppbnf.cpp
+++ b/cppbnf.cpp
@@ -39,11 +39,6 @@ namespace {
   return result;
  }
 
- bool isTerminal(const std::string& symbol, const BNF& bnf)
- {
-  return bnf.find(symbol) == bnf.end();
- }
-
  size_t numberOfStartSymbols(const BNF& bnf)
  {
   // exactly 1 start symbol
@@ -75,7 +70,7 @@ namespace {
    for (const auto& list : lists) {
     for (const auto& i : list) {
 
-     if (!isTerminal(i, bnf)) {
+     if (!isTerminal(bnf, i)) {
       // every non-terminal symbol must be longer that 1 char
       if (i.size() == 1) {
        std::cerr << "Warning: Non-Terminal symbol " << i << " in " << symbol << " is too short." << std::endl;
@@ -107,7 +102,7 @@ namespace {
   for (const auto& [symbol, lists] : bnf) {
    for (const auto& list : lists) {
     for (const auto& i : list) {
-     if (i.size() != 1 && isTerminal(i, bnf)) {
+     if (i.size() != 1 && isTerminal(bnf, i)) {
       std::cerr << "Warning: Terminal symbol in " << symbol << " is too long: "s << i << std::endl;
       return false;
      }
@@ -197,7 +192,7 @@ namespace {
   for (auto& [symbol, lists] : bnf) {
    for (auto& list : lists) {
     for (int i = 0; i < list.size(); i++) {
-     if (list[i].size() > 1 && isTerminal(list[i], bnf)) {
+     if (list[i].size() > 1 && isTerminal(bnf, list[i])) {
       auto newList = vectorize(list[i]);
       list.erase(list.begin() + i);
       list.insert(list.begin() + i, newList.begin(), newList.end());
diff --git a/lexer.cpp b/lexer.cpp
index 4a458c3..61caae1 100644
--- a/lexer.cpp
+++ b/lexer.cpp
@@ -2,14 +2,178 @@
 
 using namespace Lex;
 
-Lexer::Lexer(const BNF& bnf, const std::string& Top)
+size_t Lexer::newState(std::string state_type)
 {
+ m_state_types.push_back(state_type);
+ return states++;
+}
+
+void Lexer::addTransition(size_t state0, size_t state1, char c)
+{
+ auto it{transitions.find(state0)};
+
+ if (it == transitions.end()) { // new entry is a list with 1 entry
+  transitions[state0] = {{state1, c}};
+ } else { // extend list entry
+  it->second.push_back({state1, c});
+ }
+}
+
+std::vector<std::pair<size_t, char>> Lexer::getSuccessors(size_t state)
+{
+ std::vector<std::pair<size_t, char>> result;
+
+ auto it{transitions.find(state)};
+
+ if (it != transitions.end()) { // new list entry
+  for (auto &i: it->second) {
+   if (i.first == m_endState || i.second != '\0') { // add transition to end state or transition via actual char
+    result.push_back(i);
+   } else { // follow empty transition
+    auto successors { getSuccessors(i.first) };
+    result.insert(result.end(), successors.begin(), successors.end());
+   }
+  }
+ }
+
+ return result;
+}
+
+// Helper function
+void Lexer::addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type)
+{
+ if (isTerminal(m_bnf, symbol)) { // end recursion with new transition
+  if (symbol.size() != 1)
+   throw std::runtime_error("Bad sized terminal symbol: "s + symbol);
+  addTransition(state0, state1, symbol[0]);
+ } else { // recurse via non-terminal symbol
+  addPath(state0, state1, symbol, type);
+ }
+}
+
+// Helper function: add one rule
+void Lexer::addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type)
+{
+ size_t previousState{state0};
+ 
+ // add intermediate states with transitions
+ for (size_t i = list_index_from; i < list_index_to - 1; i++) {
+  std::string symbol{list[i]};
+  if (symbol == rule_symbol)
+   throw std::runtime_error("Recursion found but not allowed");
+
+  size_t state{newState(type)};
+  addPathOrTransition(previousState, state, symbol, type);
+  previousState = state;
+ }
+ if (list.back() == rule_symbol)
+  throw std::runtime_error("Recursion found but not allowed");
+
+ // last transition
+ addPathOrTransition(previousState, state1, list.back(), type);
+}
+
+// Add paths between state0 and state1, including new states and transitions
+void Lexer::addPath(size_t state0, size_t state1, std::string s, std::string type)
+{
+ if (type == "" && s != "" && s != m_top)
+  type = s;
+
+ // state0 -> [paths] -> state01 -> state1
+ //                      ^     v
+ //                 [recursion paths]
+ size_t state01{newState(type)};
+ auto it {m_bnf.find(s)};
+
+ if (it == m_bnf.end())
+  throw std::runtime_error("Path ("s + std::to_string(state0) + ", "s + std::to_string(state1) + ") not possible."s);
+
+ for (auto& list: it->second) { // for every path between state0 and state1
+  size_t list_size{list.size()};
+  if (list_size < 1)
+   throw std::runtime_error("List too small in rule "s + s);
+
+  if (list[0] == s) { // recursion rule
+   addRule(list, 1, list_size, state01, state01, s, type);
+  } else { // non-recursion rule
+   addRule(list, 0, list_size, state0, state01, s, type);
+  }
+ }
+ addTransition(state01, state1, 0); // empty transition to end
+}
+
+Lexer::Lexer(const BNF& bnf, const std::string& top): m_bnf(bnf), m_top(top)
+                                                      , m_startState(newState()), m_endState(newState())
+{
+ addPath(m_startState, m_endState, m_top, "");
+}
+
+Token Lexer::getToken(const std::string& s, Location& location)
+{
+ Location oldLocation{location}; // start of token
+
+ std::vector<size_t> states{m_startState}; // can be in multiple states at once
+ std::vector<size_t> newStates;
+
+ Location found;
+ std::string state_type;
+
+ // match as much as possible
+ while (location.pos < s.size() && states.size() > 0) {
+  char currentChar{s[location.pos]};
+  std::cout << "DEBUG: Char: " << currentChar << std::endl;
+
+  for (const auto& state: states) {
+   std::vector<std::pair<size_t, char>> successors{getSuccessors(state)};
+   for (const auto& [nextState, c]: successors) {
+    if (c == currentChar) {
+     newStates.push_back(nextState);
+     if (nextState == m_endState) { // save intermediate result upon match
+      found = location;
+      found.advance();
+      state_type = m_state_types[state];
+     }
+    } else if (nextState == m_endState) { // save intermediate result w/o match of c
+     found = location;
+     state_type = m_state_types[state];
+    }
+   }
+  }
+  states = newStates;
+  newStates.clear();
+  location.advance(currentChar == '\n');
+ }
+
+ std::string value {s.substr(oldLocation.pos, found.pos - oldLocation.pos)};
+
+ if (found.pos > 0)
+  std::cout << "DEBUG: Matched " << found.pos - oldLocation.pos << " chars: " << value << "|" << state_type << std::endl;
+ else
+  throw std::runtime_error("Tokenize error at "s + oldLocation.toString());
+
+ location = found; // reset to end of match
+
+ return {state_type, value, oldLocation};
+}
+
+void Lexer::skipWhitespace(const std::string& s, Location& location)
+{
+ while (location.pos < s.size() && std::isspace(s[location.pos])) {
+  location.advance(s[location.pos] == '\n');
+ }
 }
 
 std::vector<Token> Lexer::Lex(const std::string& s)
 {
  std::vector<Token> result;
 
+ Location location;
+ skipWhitespace(s, location);
+ while (location.pos < s.size()) {
+  result.emplace_back(getToken(s, location));
+  skipWhitespace(s, location);
+ }
+
  return result;
 }
 
diff --git a/lexer.h b/lexer.h
index f9363ef..eb41e3d 100644
--- a/lexer.h
+++ b/lexer.h
@@ -3,16 +3,42 @@
 #include "minicc.h"
 #include "bnf.h"
 
+#include <string>
+#include <unordered_map>
+#include <vector>
+
 namespace Lex {
 
 class Lexer
 {
+ // constructor input
+ const BNF& m_bnf;
+ const std::string& m_top;
+
+ // Graph
+ 
+ size_t states{}; // start, ...
+ std::unordered_map<size_t, std::vector<std::pair<size_t, char>>> transitions; //transitions: state -> {state,character}, ...; empty transition is marked by \0
+ std::vector<std::string> m_state_types;
+ size_t m_startState;
+ size_t m_endState;
+
+ // Graph manipulation
+ 
+ size_t newState(std::string state_type = "");
+ void addTransition(size_t state0, size_t state1, char c);
+ std::vector<std::pair<size_t, char>> getSuccessors(size_t state);
+
+ // Build up automaton, recursively
+ void addPath(size_t state0, size_t state1, std::string s, std::string type);
+ void addPathOrTransition(size_t state0, size_t state1, std::string symbol, std::string type);
+ void addRule(const std::vector<std::string>& list, size_t list_index_from, size_t list_index_to, size_t state0, size_t state1, const std::string& rule_symbol, std::string type);
 
- //states; // start, ...
- //transitions; // state, state, character
+ Token getToken(const std::string& s, Location& location);
+ void skipWhitespace(const std::string& s, Location& location);
 
 public:
- Lexer(const BNF& bnf, const std::string& Top);
+ Lexer(const BNF& bnf, const std::string& top);
  std::vector<Token> Lex(const std::string& s);
 
 };
diff --git a/minicc.cpp b/minicc.cpp
index 48d2a15..d180517 100644
--- a/minicc.cpp
+++ b/minicc.cpp
@@ -14,6 +14,8 @@
 #include <utility>
 #include <vector>
 
+using namespace std::string_literals;
+
 std::vector<std::string> split(std::string s)
 {
  std::vector<std::string> result;
@@ -37,3 +39,18 @@ std::ostream& operator<<(std::ostream& os, const Token& token) {
  return os << token.type << ": " << token.value << "(" << token.location.line << ":" << token.location.column << ")";
 }
 
+void Location::advance(bool newline)
+{
+ pos++;
+ if (newline) {
+  line++;
+  column = 1;
+ } else {
+  column++;
+ }
+}
+
+std::string Location::toString()
+{
+ return std::to_string(line) + ":"s + std::to_string(column);
+}
diff --git a/minicc.h b/minicc.h
index 9bc8945..861a30a 100644
--- a/minicc.h
+++ b/minicc.h
@@ -10,8 +10,12 @@ using index_t = size_t;
 std::vector<std::string> split(std::string s);
 
 struct Location {
- size_t line;
- size_t column;
+ size_t line{1};
+ size_t column{1};
+ size_t pos{0};
+
+ void advance(bool newline = false);
+ std::string toString();
 };
 
 bool operator==(const Location &a, const Location &b);
diff --git a/test-lexer.cpp b/test-lexer.cpp
index a94e550..9f1cb77 100644
--- a/test-lexer.cpp
+++ b/test-lexer.cpp
@@ -1,5 +1,6 @@
 #include "bnf.h"
 #include "cpp.h"
+#include "cppbnf.h"
 #include "lexer.h"
 #include "grammer.h"
 #include "minicc.h"
@@ -19,14 +20,20 @@
 #include <utility>
 #include <vector>
 
-class Test: public ::testing::Test {
+class LexerTest: public ::testing::Test {
 protected:
- Test(){
+ LexerTest(){
   debug = false;
  }
- ~Test() override {}
+ ~LexerTest() override {}
 };
 
-TEST_F(Test, BNF) {
-}
+TEST_F(LexerTest, Lex) {
+ auto bnf{SubBNF(GetCppBNFLex(), "preprocessing-token")};
+
+ Lex::Lexer lexer(bnf, "preprocessing-token");
 
+ std::vector<Token> tokens{lexer.Lex("int main() { return 1; }")};
+
+ ASSERT_EQ(tokens.size(), 9);
+}
author	Roland Reichwein <mail@reichwein.it>	2020-03-15 18:19:49 +0100
committer	Roland Reichwein <mail@reichwein.it>	2020-03-15 18:19:49 +0100
commit	9f69b006dde3c3fbe19ed3e0275d3b7348f2aa87 (patch)
tree	6ac42793568339463f913cf39474794c8613d0b6
parent	3a7006fcf5f8ecffd852fbba6d8ee03ce8a35dce (diff)