ton/tolk-tester/tolk-tester.py

# Usage: `tolk-tester.py tests_dir` OR `tolk-tester.py test_file.tolk`
# from current dir, providing some env (see getenv() calls).
# Every .tolk file should provide /* testcase description in a comment */, consider tests/ folder.
#
# Tests for Tolk can be
# * positive (compiled to .fif, run with fift, compared output with the one expected)
# * negative (compilation fails, and it's expected; patterns in stderr can be specified)
#
# Note, that there is also tolk-tester.js to test Tolk compiled to WASM.
# Don't forget to keep it identical to Python version!

import os
import os.path
import re
import shutil
import subprocess
import sys
import tempfile
from typing import List


def getenv(name, default=None):
    if name in os.environ:
        return os.environ[name]
    if default is None:
        print("Environment variable", name, "is not set", file=sys.stderr)
        exit(1)
    return default


TOLK_EXECUTABLE = getenv("TOLK_EXECUTABLE", "tolk")
FIFT_EXECUTABLE = getenv("FIFT_EXECUTABLE", "fift")
FIFT_LIBS_FOLDER = getenv("FIFTPATH")  # this env is needed for fift to work properly
TMP_DIR = tempfile.mkdtemp()


class CmdLineOptions:
    def __init__(self, argv: List[str]):
        if len(argv) != 2:
            print("Usage: tolk-tester.py tests_dir OR tolk-tester.py test_file.tolk", file=sys.stderr)
            exit(1)
        if not os.path.exists(argv[1]):
            print("Input '%s' doesn't exist" % argv[1], file=sys.stderr)
            exit(1)

        if os.path.isdir(argv[1]):
            self.tests_dir = argv[1]
            self.test_file = None
        else:
            self.tests_dir = os.path.dirname(argv[1])
            self.test_file = argv[1]

    def find_tests(self) -> List[str]:
        if self.test_file is not None:  # an option to run (debug) a single test
            return [self.test_file]

        tests = [f for f in os.listdir(self.tests_dir) if f.endswith(".tolk") or f.endswith(".ton")]
        tests.sort()
        return [os.path.join(self.tests_dir, f) for f in tests]


class ParseInputError(Exception):
    pass


class TolkCompilationFailedError(Exception):
    def __init__(self, message: str, stderr: str):
        super().__init__(message)
        self.stderr = stderr


class TolkCompilationSucceededError(Exception):
    pass


class FiftExecutionFailedError(Exception):
    def __init__(self, message: str, stderr: str):
        super().__init__(message)
        self.stderr = stderr


class CompareOutputError(Exception):
    def __init__(self, message: str, output: str):
        super().__init__(message)
        self.output = output


class CompareFifCodegenError(Exception):
    pass


class CompareCodeHashError(Exception):
    pass


class TolkTestCaseInputOutput:
    """
    In positive tests, there are several testcases "input X should produce output Y".
    They are written as a table:
    @testcase | method_id | input (one or several) | output
    """
    reJustNumber = re.compile(r"[-+]?\d+")
    reMathExpr = re.compile(r"[0x123456789()+\-*/<>]+")

    def __init__(self, method_id_str: str, input_str: str, output_str: str):
        processed_inputs = []
        for in_arg in input_str.split(" "):
            if len(in_arg) == 0:
                continue
            elif in_arg.startswith("x{") or TolkTestCaseInputOutput.reJustNumber.fullmatch(in_arg):
                processed_inputs.append(in_arg)
            elif TolkTestCaseInputOutput.reMathExpr.fullmatch(in_arg):
                processed_inputs.append(str(eval(in_arg)))
            elif in_arg == "null":
                processed_inputs.append("null")
            else:
                raise ParseInputError("'%s' can't be evaluated" % in_arg)

        self.method_id = int(method_id_str)
        self.input = " ".join(processed_inputs)
        self.expected_output = output_str

    def check(self, stdout_lines: List[str], line_idx: int):
        if stdout_lines[line_idx] != self.expected_output:
            raise CompareOutputError("error on case #%d (%d | %s): expected '%s', found '%s'" % (line_idx + 1, self.method_id, self.input, self.expected_output, stdout_lines[line_idx]), "\n".join(stdout_lines))


class TolkTestCaseStderr:
    """
    @stderr checks, when compilation fails, that stderr (compilation error) is expected.
    If it's multiline, all lines must be present in specified order.
    """

    def __init__(self, stderr_pattern: List[str], avoid: bool):
        self.stderr_pattern = stderr_pattern
        self.avoid = avoid

    def check(self, stderr: str):
        line_match = self.find_pattern_in_stderr(stderr.splitlines())
        if line_match == -1 and not self.avoid:
            raise CompareOutputError("pattern not found in stderr:\n%s" %
                                     "\n".join(map(lambda x: "    " + x, self.stderr_pattern)), stderr)
        elif line_match != -1 and self.avoid:
            raise CompareOutputError("pattern found (line %d), but not expected to be:\n%s" %
                                     (line_match + 1, "\n".join(map(lambda x: "    " + x, self.stderr_pattern))), stderr)

    def find_pattern_in_stderr(self, stderr: List[str]) -> int:
        for line_start in range(len(stderr)):
            if self.try_match_pattern(0, stderr, line_start):
                return line_start
        return -1

    def try_match_pattern(self, pattern_offset: int, stderr: List[str], offset: int) -> bool:
        if pattern_offset >= len(self.stderr_pattern):
            return True
        if offset >= len(stderr):
            return False

        line_pattern = self.stderr_pattern[pattern_offset]
        line_output = stderr[offset]
        return line_output.find(line_pattern) != -1 and self.try_match_pattern(pattern_offset + 1, stderr, offset + 1)


class TolkTestCaseFifCodegen:
    """
    @fif_codegen checks that contents of compiled.fif matches the expected pattern.
    @fif_codegen_avoid checks that is does not match the pattern.
    The pattern is a multiline piece of fift code, optionally with "..." meaning "any lines here".
    See tests/codegen_check_demo.tolk of how it looks.
    A notable thing about indentations (spaces at line starts):
    Taking them into account will complicate the code without reasonable profit,
    that's why we just trim every string.
    And one more word about //comments. Tolk inserts them into fift output.
    If a line in the pattern contains a //comment, it's expected to be equal.
    If a line does not, we just compare a command.
    """

    def __init__(self, fif_pattern: List[str], avoid: bool):
        self.fif_pattern = [s.strip() for s in fif_pattern]
        self.avoid = avoid

    def check(self, fif_output: List[str]):
        line_match = self.find_pattern_in_fif_output(fif_output)
        if line_match == -1 and not self.avoid:
            raise CompareFifCodegenError("pattern not found:\n%s" %
                                         "\n".join(map(lambda x: "    " + x, self.fif_pattern)))
        elif line_match != -1 and self.avoid:
            raise CompareFifCodegenError("pattern found (line %d), but not expected to be:\n%s" %
                                         (line_match + 1, "\n".join(map(lambda x: "    " + x, self.fif_pattern))))

    def find_pattern_in_fif_output(self, fif_output: List[str]) -> int:
        for line_start in range(len(fif_output)):
            if self.try_match_pattern(0, fif_output, line_start):
                return line_start
        return -1

    def try_match_pattern(self, pattern_offset: int, fif_output: List[str], offset: int) -> bool:
        if pattern_offset >= len(self.fif_pattern):
            return True
        if offset >= len(fif_output):
            return False
        line_pattern = self.fif_pattern[pattern_offset]
        line_output = fif_output[offset]

        if line_pattern != "...":
            if not TolkTestCaseFifCodegen.does_line_match(line_pattern, line_output):
                return False
            return self.try_match_pattern(pattern_offset + 1, fif_output, offset + 1)
        while offset < len(fif_output):
            if self.try_match_pattern(pattern_offset + 1, fif_output, offset):
                return True
            offset = offset + 1
        return False

    @staticmethod
    def split_line_to_cmd_and_comment(trimmed_line: str) -> tuple:
        pos = trimmed_line.find("//")
        if pos == -1:
            return trimmed_line, None
        else:
            return trimmed_line[:pos].rstrip(), trimmed_line[pos + 2:].lstrip()

    @staticmethod
    def does_line_match(line_pattern: str, line_output: str) -> bool:
        cmd_pattern, comment_pattern = TolkTestCaseFifCodegen.split_line_to_cmd_and_comment(line_pattern)
        cmd_output, comment_output = TolkTestCaseFifCodegen.split_line_to_cmd_and_comment(line_output.strip())
        return cmd_pattern == cmd_output and (comment_pattern is None or comment_pattern == comment_output)


class TolkTestCaseExpectedHash:
    """
    @code_hash checks that hash of compiled output.fif matches the provided value.
    It's used to "record" code boc hash and to check that it remains the same on compiler modifications.
    Being much less flexible than @fif_codegen, it nevertheless gives a guarantee of bytecode stability.
    """

    def __init__(self, expected_hash: str):
        self.code_hash = expected_hash

    def check(self, fif_code_hash: str):
        if self.code_hash != fif_code_hash:
            raise CompareCodeHashError("expected %s, actual %s" % (self.code_hash, fif_code_hash))


class TolkTestFile:
    def __init__(self, tolk_filename: str, artifacts_folder: str):
        self.line_idx = 0
        self.tolk_filename = tolk_filename
        self.artifacts_folder = artifacts_folder
        self.compilation_should_fail = False
        self.stderr_includes: List[TolkTestCaseStderr] = []
        self.input_output: List[TolkTestCaseInputOutput] = []
        self.fif_codegen: List[TolkTestCaseFifCodegen] = []
        self.expected_hash: TolkTestCaseExpectedHash | None = None
        self.experimental_options: str | None = None

    def parse_input_from_tolk_file(self):
        with open(self.tolk_filename, "r") as fd:
            lines = fd.read().splitlines()
        self.line_idx = 0

        while self.line_idx < len(lines):
            line = lines[self.line_idx]
            if line.startswith("@testcase"):
                s = [x.strip() for x in line.split("|")]
                if len(s) != 4:
                    raise ParseInputError("incorrect format of @testcase: %s" % line)
                self.input_output.append(TolkTestCaseInputOutput(s[1], s[2], s[3]))
            elif line.startswith("@compilation_should_fail"):
                self.compilation_should_fail = True
            elif line.startswith("@stderr"):
                self.stderr_includes.append(TolkTestCaseStderr(self.parse_string_value(lines), False))
            elif line.startswith("@fif_codegen_avoid"):
                self.fif_codegen.append(TolkTestCaseFifCodegen(self.parse_string_value(lines), True))
            elif line.startswith("@fif_codegen"):
                self.fif_codegen.append(TolkTestCaseFifCodegen(self.parse_string_value(lines), False))
            elif line.startswith("@code_hash"):
                self.expected_hash = TolkTestCaseExpectedHash(self.parse_string_value(lines, False)[0])
            elif line.startswith("@experimental_options"):
                self.experimental_options = line[22:]
            self.line_idx = self.line_idx + 1

        if len(self.input_output) == 0 and not self.compilation_should_fail:
            raise ParseInputError("no @testcase present")
        if len(self.input_output) != 0 and self.compilation_should_fail:
            raise ParseInputError("@testcase present, but compilation_should_fail")

    def parse_string_value(self, lines: List[str], allow_multiline = True) -> List[str]:
        # a tag must be followed by a space (single-line), e.g. '@stderr some text'
        # or be a multi-line value, surrounded by """
        line = lines[self.line_idx]
        pos_sp = line.find(' ')
        is_multi_line = lines[self.line_idx + 1] == '"""'
        is_single_line = pos_sp != -1
        if not is_single_line and not is_multi_line:
            raise ParseInputError('%s value is empty (not followed by a string or a multiline """)' % line)
        if is_single_line and is_multi_line:
            raise ParseInputError('%s value is both single-line and followed by """' % line[:pos_sp])
        if is_multi_line and not allow_multiline:
            raise ParseInputError("%s value should be single-line" % line)

        if is_single_line:
            return [line[pos_sp + 1:].strip()]

        self.line_idx += 2
        s_multiline = []
        while self.line_idx < len(lines) and lines[self.line_idx] != '"""':
            s_multiline.append(lines[self.line_idx])
            self.line_idx = self.line_idx + 1
        return s_multiline

    def get_compiled_fif_filename(self):
        return self.artifacts_folder + "/compiled.fif"

    def get_runner_fif_filename(self):
        return self.artifacts_folder + "/runner.fif"

    def run_and_check(self):
        cmd_args = [TOLK_EXECUTABLE, "-o", self.get_compiled_fif_filename()]
        if self.experimental_options:
            cmd_args = cmd_args + ["-x", self.experimental_options]
        res = subprocess.run(cmd_args + [self.tolk_filename], capture_output=True, timeout=10)
        exit_code = res.returncode
        stderr = str(res.stderr, "utf-8")
        stdout = str(res.stdout, "utf-8")

        if exit_code == 0 and self.compilation_should_fail:
            raise TolkCompilationSucceededError("compilation succeeded, but it should have failed")

        for should_include in self.stderr_includes: # @stderr is used to check errors and warnings
            should_include.check(stderr)

        if exit_code != 0 and self.compilation_should_fail:
            return

        if exit_code != 0 and not self.compilation_should_fail:
            raise TolkCompilationFailedError("tolk exit_code = %d" % exit_code, stderr)

        with open(self.get_runner_fif_filename(), "w") as fd:
            fd.write("\"%s\" include <s constant code\n" % self.get_compiled_fif_filename())
            for t in self.input_output:
                fd.write("%s %d code 1 runvmx abort\"exitcode is not 0\" .s cr { drop } depth 1- times\n" % (t.input, t.method_id))
            if self.expected_hash is not None:
                fd.write("\"%s\" include hash .s\n" % self.get_compiled_fif_filename())

        res = subprocess.run([FIFT_EXECUTABLE, self.get_runner_fif_filename()], capture_output=True, timeout=10)
        exit_code = res.returncode
        stderr = str(res.stderr, "utf-8")
        stdout = str(res.stdout, "utf-8")
        if exit_code != 0:
            raise FiftExecutionFailedError("fift exit_code = %d" % exit_code, stderr)

        stdout_lines = [x.strip() for x in stdout.split("\n")]
        stdout_lines = [x for x in stdout_lines if x != ""]
        fif_code_hash = None
        if self.expected_hash is not None:  # then the last stdout line is a hash
            fif_code_hash = stdout_lines[-1]
            stdout_lines = stdout_lines[:-1]

        if len(stdout_lines) != len(self.input_output):
            raise CompareOutputError("unexpected number of fift output: %d lines, but %d testcases" % (len(stdout_lines), len(self.input_output)), stdout)

        for i in range(len(stdout_lines)):
            self.input_output[i].check(stdout_lines, i)

        if len(self.fif_codegen):
            with open(self.get_compiled_fif_filename()) as fd:
                fif_output = fd.readlines()
            for fif_codegen in self.fif_codegen:
                fif_codegen.check(fif_output)

        if self.expected_hash is not None:
            self.expected_hash.check(fif_code_hash)


def run_all_tests(tests: List[str]):
    for ti in range(len(tests)):
        tolk_filename = tests[ti]
        print("Running test %d/%d: %s" % (ti + 1, len(tests), tolk_filename), file=sys.stderr)

        artifacts_folder = os.path.join(TMP_DIR, tolk_filename)
        testcase = TolkTestFile(tolk_filename, artifacts_folder)
        try:
            if not os.path.exists(artifacts_folder):
                os.makedirs(artifacts_folder)
            testcase.parse_input_from_tolk_file()
            testcase.run_and_check()
            shutil.rmtree(artifacts_folder)

            if testcase.compilation_should_fail:
                print("  OK, compilation failed as it should", file=sys.stderr)
            else:
                print("  OK, %d cases" % len(testcase.input_output), file=sys.stderr)
        except ParseInputError as e:
            print("  Error parsing input (cur line #%d):" % (testcase.line_idx + 1), e, file=sys.stderr)
            exit(2)
        except TolkCompilationFailedError as e:
            print("  Error compiling tolk:", e, file=sys.stderr)
            print("  stderr:", file=sys.stderr)
            print(e.stderr.rstrip(), file=sys.stderr)
            exit(2)
        except TolkCompilationSucceededError as e:
            print("  Error:", e, file=sys.stderr)
            exit(2)
        except FiftExecutionFailedError as e:
            print("  Error executing fift:", e, file=sys.stderr)
            print("  stderr:", file=sys.stderr)
            print(e.stderr.rstrip(), file=sys.stderr)
            print("  compiled.fif at:", testcase.get_compiled_fif_filename(), file=sys.stderr)
            exit(2)
        except CompareOutputError as e:
            print("  Mismatch in output:", e, file=sys.stderr)
            print("  Full output:", file=sys.stderr)
            print(e.output.rstrip(), file=sys.stderr)
            print("  Was compiled to:", testcase.get_compiled_fif_filename(), file=sys.stderr)
            exit(2)
        except CompareFifCodegenError as e:
            print("  Mismatch in fif codegen:", e, file=sys.stderr)
            print("  Was compiled to:", testcase.get_compiled_fif_filename(), file=sys.stderr)
            print(open(testcase.get_compiled_fif_filename()).read(), file=sys.stderr)
            exit(2)
        except CompareCodeHashError as e:
            print("  Mismatch in code hash:", e, file=sys.stderr)
            print("  Was compiled to:", testcase.get_compiled_fif_filename(), file=sys.stderr)
            exit(2)


tests = CmdLineOptions(sys.argv).find_tests()
print("Found", len(tests), "tests", file=sys.stderr)
run_all_tests(tests)
print("Done, %d tests" % len(tests), file=sys.stderr)