// ton/tolk/lexer.cpp

/*
    This file is part of TON Blockchain Library.

    TON Blockchain Library is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    TON Blockchain Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with TON Blockchain Library.  If not, see <http://www.gnu.org/licenses/>.
*/
#include "lexer.h"
#include "symtable.h"
#include <sstream>
#include <cassert>

namespace tolk {

/*
 *
 *   LEXER
 *
 */

/**
 * Returns a human-readable name for a lexem type, intended for diagnostics.
 *
 * @param idx lexem type: one of the built-in kinds (Eof, Ident, Number, String, Special)
 *            or an index for which `symbols.get_keyword()` yields a keyword entry.
 * @return a fixed description for built-in kinds, the keyword text wrapped in back-quotes
 *         for keywords, and "<unknown lexem of type N>" for anything else.
 */
std::string Lexem::lexem_name_str(int idx) {
  if (idx == Eof) {
    return "end of file";
  }
  if (idx == Ident) {
    return "identifier";
  }
  if (idx == Number) {
    return "number";
  }
  if (idx == String) {
    return "string";
  }
  if (idx == Special) {
    return "special";
  }
  // look the keyword up once and reuse the result
  if (auto keyword = symbols.get_keyword(idx)) {
    return "`" + keyword->str + "`";
  }
  // Build the message by concatenation. An ostringstream constructed from an initial string
  // starts writing at position 0 (no std::ios_base::ate), so streaming into it would
  // overwrite the prefix instead of appending to it.
  return "<unknown lexem of type " + std::to_string(idx) + ">";
}

/**
 * Describes this particular lexem for diagnostics.
 *
 * Identifiers are rendered with their name (taken from `symbols.get_name(val)`),
 * strings with their contents in double quotes; every other type falls back to
 * the generic per-type description from lexem_name_str().
 */
std::string Lexem::name_str() const {
  if (tp == Ident) {
    std::string described{"identifier `"};
    described += symbols.get_name(val);
    described += "`";
    return described;
  }
  if (tp == String) {
    std::string described{"string \""};
    described += str;
    described += '"';
    return described;
  }
  return lexem_name_str(tp);
}

/**
 * Checks whether `str` is an integer literal as understood by the lexer.
 *
 * Accepted forms: an optional leading '-', then either one or more decimal digits,
 * or the prefix "0x" (lowercase 'x' only) followed by one or more hex digits
 * (0-9, a-f, A-F). Nothing may follow the digits.
 *
 * @param str candidate token text
 * @return true if the whole string matches one of the forms above
 */
bool is_number(std::string str) {
  const std::size_t len = str.size();
  std::size_t pos = 0;
  if (pos < len && str[pos] == '-') {
    ++pos;
  }
  bool hex = false;
  // compare the remaining length instead of forming `iterator + 1`,
  // which would step past end() for the input "-"
  if (len - pos >= 2 && str[pos] == '0' && str[pos + 1] == 'x') {
    pos += 2;
    hex = true;
  }
  if (pos == len) {
    // empty string, a lone "-", or a bare "0x" / "-0x" prefix
    return false;
  }
  for (; pos < len; ++pos) {
    int c = str[pos];
    if (c >= '0' && c <= '9') {
      continue;
    }
    if (!hex) {
      return false;
    }
    c |= 0x20;  // fold ASCII upper-case letters onto lower-case
    if (c < 'a' || c > 'f') {
      return false;
    }
  }
  return true;
}

int Lexem::classify() {
  if (tp != Unknown) {
    return tp;
  }
  sym_idx_t i = symbols.lookup(str);
  if (i) {
    assert(str == symbols[i]->str);
    str = symbols[i]->str;
    sym_idx_t idx = symbols[i]->idx;
    tp = (idx < 0 ? -idx : Ident);
    val = i;
  } else if (is_number(str)) {
    tp = Number;
  } else {
    tp = 0;
  }
  if (tp == Unknown) {
    tp = Ident;
    val = symbols.lookup(str, 1);
  }
  return tp;
}

/**
 * Overwrites all fields of this lexem and classifies it.
 *
 * @param _str text of the lexem (taken by value and moved into place)
 * @param _loc source location of the lexem
 * @param _tp  initial type; classify() only changes it when it equals `Unknown`
 * @param _val initial value
 * @return the lexem type as returned by classify()
 */
int Lexem::set(std::string _str, const SrcLocation& _loc, int _tp, int _val) {
  str = std::move(_str);  // sink parameter: move instead of making a second copy
  loc = _loc;
  tp = _tp;
  val = _val;
  return classify();
}

/**
 * Creates a lexer over `_src` and fills the per-character class table.
 *
 * @param active_chars    groups of ASCII characters separated by spaces. Characters of the
 *                        first group get the flag `cc::active`; each space decrements the
 *                        flag value used for the following group, and when it reaches zero
 *                        it is replaced by `cc::allow_repeat`. Non-ASCII chars are ignored.
 * @param quote_chars     characters (above ' ' and up to 0x7f) marked with `cc::quote_char`
 * @param multiline_quote delimiter that opens and closes multi-line strings; may be empty
 */
Lexer::Lexer(SourceReader& _src, std::string active_chars, std::string quote_chars, std::string multiline_quote)
    : src(_src), eof(false), lexem("", src.here(), Lexem::Undefined), peek_lexem("", {}, Lexem::Undefined),
      multiline_quote(std::move(multiline_quote)) {
  std::memset(char_class, 0, sizeof(char_class));
  unsigned char group_flags = cc::active;
  for (char ch : active_chars) {
    if (ch == ' ') {
      // a space starts the next group of characters
      --group_flags;
      if (group_flags == 0) {
        group_flags = cc::allow_repeat;
      }
      continue;
    }
    const unsigned code = (unsigned)ch;
    if (code < 0x80) {
      char_class[code] |= group_flags;
    }
  }
  for (int qc : quote_chars) {
    if (qc <= ' ' || qc > 0x7f) {
      continue;
    }
    char_class[(unsigned)qc] |= cc::quote_char;
  }
}

// Configures the primary set of comment delimiters.
// Each argument is passed to set_spec(), which parses it as space-separated tokens of one
// or two characters:
//   eol_cmts   - tokens starting a comment that runs to the end of the line
//   open_cmts  - tokens opening a block comment
//   close_cmts - tokens closing a block comment
void Lexer::set_comment_tokens(const std::string &eol_cmts, const std::string &open_cmts, const std::string &close_cmts) {
  set_spec(eol_cmt, eol_cmts);
  set_spec(cmt_op, open_cmts);
  set_spec(cmt_cl, close_cmts);
}

// Configures the secondary (alternative) set of comment delimiters, which next() checks
// alongside the primary set. Arguments have the same format as in set_comment_tokens():
// space-separated tokens of one or two characters, parsed by set_spec().
void Lexer::set_comment2_tokens(const std::string &eol_cmts2, const std::string &open_cmts2, const std::string &close_cmts2) {
  set_spec(eol_cmt2, eol_cmts2);
  set_spec(cmt_op2, open_cmts2);
  set_spec(cmt_cl2, close_cmts2);
}

// Reads the first lexem from the source, so that the current lexem is available
// before the first explicit call to next().
void Lexer::start_parsing() {
  next();
}

/**
 * Parses a space-separated list of comment delimiter tokens into `arr`.
 *
 * All three slots are first reset to -0x100 (a value no character compares equal to).
 * Then, for every token in `setup`:
 *   - a one-character token is stored in arr[0];
 *   - a two-character token is stored in arr[1] (first char) and arr[2] (second char);
 *   - longer tokens are ignored.
 * If several tokens of the same length are present, the last one wins.
 */
void Lexer::set_spec(std::array<int, 3>& arr, std::string setup) {
  arr.fill(-0x100);
  const std::size_t total = setup.size();
  std::size_t pos = 0;
  while (pos < total) {
    if (setup[pos] == ' ') {
      ++pos;
      continue;
    }
    // find the end of the token that starts at `pos`
    std::size_t stop = pos;
    while (stop < total && setup[stop] != ' ') {
      ++stop;
    }
    const std::size_t width = stop - pos;
    if (width == 1) {
      arr[0] = setup[pos];
    } else if (width == 2) {
      arr[1] = setup[pos];
      arr[2] = setup[pos + 1];
    }
    pos = stop;
  }
}

/**
 * Tells whether the character range [begin, end) starts with the multi-line quote delimiter.
 *
 * @return false when no delimiter is configured (empty `multiline_quote`), when the range
 *         ends before the whole delimiter is matched, or on the first mismatching character;
 *         true otherwise.
 */
bool Lexer::is_multiline_quote(const char* begin, const char* end) {
  const std::size_t need = multiline_quote.size();
  if (need == 0) {
    return false;
  }
  const char* cursor = begin;
  for (std::size_t matched = 0; matched < need; ++matched, ++cursor) {
    if (cursor == end || *cursor != multiline_quote[matched]) {
      return false;
    }
  }
  return true;
}

void Lexer::expect(int exp_tp, const char* msg) {
  if (tp() != exp_tp) {
    throw ParseError{lexem.loc, (msg ? std::string{msg} : Lexem::lexem_name_str(exp_tp)) + " expected instead of " +
                                    cur().name_str()};
  }
  next();
}

// Advances to the next lexem and returns a reference to it (the lexer's current lexem).
//
// Order of processing:
//   1. if a lexem was stored by peek(), it becomes the current one;
//   2. if end of file was already reached, an Eof lexem is returned;
//   3. comments are skipped (line comments and block comments, both delimiter sets);
//   4. a multi-line quoted string, a quoted string / back-quoted token, or a plain token
//      is read, in that order of priority.
// Problems (unterminated comment or string, mismatched comment closers) are reported
// through src.error().
const Lexem& Lexer::next() {
  // a previously peeked lexem is consumed first
  if (peek_lexem.valid()) {
    lexem = std::move(peek_lexem);
    peek_lexem.clear({}, Lexem::Undefined);
    eof = (lexem.tp == Lexem::Eof);
    return lexem;
  }
  if (eof) {
    return lexem.clear(src.here(), Lexem::Eof);
  }
  // Comment state: 1 means "not inside a comment". A two-character opener turns it into
  // comm * 2 + 1, a one-character opener into comm * 2, so the lowest bit records which
  // kind of opener was seen last. Any closer resets it to 1.
  long long comm = 1;
  // the code below is very complicated, because it tried to support one-symbol start/end and nesting
  // in Tolk, we decided to stop supporting nesting (it was never used in practice and almost impossible for js highlighters)
  // later on I'll simplify this code (more precisely, rewrite lexer from scratch)
  // NOTE(review): seek_eof() presumably also skips whitespace and line breaks — confirm in SourceReader
  while (!src.seek_eof()) {
    int cc = src.cur_char(), nc = src.next_char();
    // note, that in practice, [0]-th element is -256, condition for [0]-th is always false
    // todo rewrite this all in the future
    if (cc == eol_cmt[0] || (cc == eol_cmt[1] && nc == eol_cmt[2]) || cc == eol_cmt2[0] || (cc == eol_cmt2[1] && nc == eol_cmt2[2])) {
      if (comm == 1) {    // just "//" — skip a whole line
        src.load_line();
      } else {            // if "//" is nested into "/*", continue reading, since "*/" may be met
        src.advance(1);
      }
    } else if (cc == cmt_op[1] && nc == cmt_op[2] || cc == cmt_op2[1] && nc == cmt_op2[2]) {
      // two-character block comment opener
      src.advance(2);
      comm = comm * 2 + 1;
    } else if (cc == cmt_op[0] || cc == cmt_op2[0]) {  // always false
      src.advance(1);
      comm *= 2;
    } else if (comm == 1) {
      break; // means that we are not inside a comment
    } else if (cc == cmt_cl[1] && nc == cmt_cl[2] || cc == cmt_cl2[1] && nc == cmt_cl2[2]) {
      // two-character closer: complain if the last opener was a one-character one
      if (!(comm & 1)) { // always false
        src.error(std::string{"a `"} + (char)cmt_op[0] + "` comment closed by `" + (char)cmt_cl[1] + (char)cmt_cl[2] +
                  "`");
      }
      // note that {- may be closed with */, but assume it's ok (we'll get rid of {- in the future)
      // resetting to 1 leaves the comment entirely, regardless of how many openers were seen
      comm = 1;
      src.advance(2);
    } else if (cc == cmt_cl[0] || cc == cmt_cl2[0]) { // always false
      if (!(comm & 1)) {
        src.error(std::string{"a `"} + (char)cmt_op[1] + (char)cmt_op[2] + "` comment closed by `" + (char)cmt_cl[0] +
                  "`");
      }
      comm = 1;
      src.advance(1);
    } else {
      // any other character inside a comment is skipped
      src.advance(1);
    }
    // comm doubles on every opener; a negative value means the counter has overflowed
    if (comm < 0) {
      src.error("too many nested comments");
    }
  }
  if (src.seek_eof()) {
    eof = true;
    // still inside a block comment when the input ended
    if (comm > 1) {
      src.error("comment extends past end of file");
    }
    return lexem.clear(src.here(), Lexem::Eof);
  }
  // multi-line string: everything between two occurrences of `multiline_quote`,
  // possibly spanning several source lines (joined with '\n')
  if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
    src.advance(multiline_quote.size());
    const char* end = nullptr;  // stays null until the closing delimiter is found
    SrcLocation here = src.here();
    std::string body;
    while (!src.is_eof()) {
      if (src.is_eoln()) {
        body.push_back('\n');
        src.load_line();
        continue;
      }
      if (is_multiline_quote(src.get_ptr(), src.get_end_ptr())) {
        end = src.get_ptr();
        src.advance(multiline_quote.size());
        break;
      }
      body.push_back(src.cur_char());
      src.advance(1);
    }
    if (!end) {
      src.error("string extends past end of file");
    }
    lexem.set(body, here, Lexem::String);
    // an ASCII letter right after the closing delimiter is consumed and stored in lexem.val
    int c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.advance(1);
    }
    return lexem;
  }
  int c = src.cur_char();
  const char* end = src.get_ptr();
  // single-line quoted string, or a `back-quoted` token; both must close on the same line
  if (is_quote_char(c) || c == '`') {
    int qc = c;
    ++end;
    while (end < src.get_end_ptr() && *end != qc) {
      ++end;
    }
    // NOTE(review): if the loop stopped because `end` reached get_end_ptr(), this reads `*end`
    // at that position — presumably the line buffer is NUL-terminated there; confirm in SourceReader
    if (*end != qc) {
      src.error(qc == '`' ? "a `back-quoted` token extends past end of line" : "string extends past end of line");
    }
    // a back-quoted token is set with type Unknown, a quoted one with type String
    lexem.set(std::string{src.get_ptr() + 1, end}, src.here(), qc == '`' ? Lexem::Unknown : Lexem::String);
    src.set_ptr(end + 1);
    // an ASCII letter right after the closing quote is consumed and stored in lexem.val
    c = src.cur_char();
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
      lexem.val = c;
      src.set_ptr(end + 2);
    }
    // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
    return lexem;
  }
  // plain token: extend `end` until a space, a tab, or a boundary given by the character classes;
  // `len` counts the characters taken so far, `pc` is the previously taken character
  int len = 0, pc = -0x100;
  while (end < src.get_end_ptr()) {
    c = *end;
    // a repeatable character following itself does not act as a boundary
    bool repeated = (c == pc && is_repeatable(c));
    // stop BEFORE this character: whitespace, or a left-active char that is not the first one
    if (c == ' ' || c == 9 || (len && is_left_active(c) && !repeated)) {
      break;
    }
    ++len;
    ++end;
    // stop AFTER this character: a right-active char ends the token
    if (is_right_active(c) && !repeated) {
      break;
    }
    pc = c;
  }
  // type and value are left to the defaults of Lexem::set()
  lexem.set(std::string{src.get_ptr(), end}, src.here());
  src.set_ptr(end);
  // std::cerr << lexem.name_str() << ' ' << lexem.str << std::endl;
  return lexem;
}

/**
 * Returns the lexem that follows the current one without making it current.
 *
 * The result is cached in `peek_lexem`, so repeated calls return the same lexem until
 * next() consumes it. If end of file was already reached, the current lexem is turned
 * into Eof and returned instead.
 */
const Lexem& Lexer::peek() {
  if (!peek_lexem.valid()) {
    if (eof) {
      return lexem.clear(src.here(), Lexem::Eof);
    }
    // read one lexem ahead, then put the current one back in place
    Lexem saved_current = std::move(lexem);
    next();
    peek_lexem = std::move(lexem);
    lexem = std::move(saved_current);
    // next() may have set the flag; the Eof lexem is only reached once it is consumed
    eof = false;
  }
  return peek_lexem;
}

}  // namespace tolk