From acc33ffeb8f5eae5e6bb805f1cb409841f0aad75 Mon Sep 17 00:00:00 2001
From: Kyle Gunger
Date: Sun, 31 Mar 2024 03:09:06 -0400
Subject: Tentative tokenizer

---
 tnslc/compile/compile.tnsl   |   1 +
 tnslc/compile/error.tnsl     |  19 ++
 tnslc/compile/generator.tnsl |  30 ++-
 tnslc/compile/tokenizer.tnsl | 428 ++++++++++++++++++++++++++++++++++++++++++-
 tnslc/test.tnsl              |   3 +
 tnslc/tnslc.tnsl             |   2 +-
 6 files changed, 467 insertions(+), 16 deletions(-)
 create mode 100644 tnslc/compile/error.tnsl
 create mode 100644 tnslc/test.tnsl

diff --git a/tnslc/compile/compile.tnsl b/tnslc/compile/compile.tnsl
index 00fe081..eca0247 100644
--- a/tnslc/compile/compile.tnsl
+++ b/tnslc/compile/compile.tnsl
@@ -3,4 +3,5 @@
     :import "tokenizer.tnsl"
     :import "lexer.tnsl"
     :import "generator.tnsl"
+    :import "error.tnsl"
 ;/
diff --git a/tnslc/compile/error.tnsl b/tnslc/compile/error.tnsl
new file mode 100644
index 0000000..7857075
--- /dev/null
+++ b/tnslc/compile/error.tnsl
@@ -0,0 +1,19 @@
+
+# printf-style format strings used by report_error
+~uint8 ERR_NUM = ":%d\0"
+~uint8 TOK_PRNT = " \"%s\": \0"
+
+# Print a diagnostic for token in the form
+#   <path>:<line>:<col> "<token text>": <message>
+/; report_error (utils.File file, Token token, ~uint8 message)
+    # C string handed to _printf, so it is typed ~uint8 like message
+    ~uint8 s = file.path.to_cstr('/')
+    _printf(s)
+    _print_num(ERR_NUM, token.line)
+    _print_num(ERR_NUM, token.col)
+    _print_num(TOK_PRNT, token.data)
+    _printf(message)
+    _printf(newline)
+    # NOTE(review): s is never released; if to_cstr allocates, add _delete(s) - confirm
+;/
+
diff --git a/tnslc/compile/generator.tnsl b/tnslc/compile/generator.tnsl
index 28a834d..eedc552 100644
--- a/tnslc/compile/generator.tnsl
+++ b/tnslc/compile/generator.tnsl
@@ -1,16 +1,28 @@
-/; generate (utils.File fin, fout)
-    fin.open()
-    fout.create()
+~uint8 TOKEN_COUNT = "Token count: %d\n\0"
 
-    uint8 buf = fin.read()
-    /; loop (fin.at_end == false && fout.at_end == false)
-        fout.write(buf)
-        buf = fin.read()
+# Tokenize fin, print the token count, then create fout and write one
+# sprint()-formatted line per token. The token list is freed at the end.
+/; generate (~utils.File fin, fout)
+
+    utils.Vector tokens = tokenize(fin)
+
+    _print_num(TOKEN_COUNT, tokens.count)
+
+    fout`.create()
+
+    /; loop (int i = 0; i < tokens.count) [i++]
+        ~Token tok = tokens.get(i)
+        ~uint8 buf = tok`.sprint()
+        fout`.write_cstr(buf)
+        fout`.write('\n')
+        # buf is only needed for the write above
+        _delete(buf)
     ;/
 
-    fin.close()
-    fout.close()
+    fout`.close()
+
+    free_token_list(~tokens)
 ;/
 
 
 
diff --git a/tnslc/compile/tokenizer.tnsl b/tnslc/compile/tokenizer.tnsl
index 722a5a0..e528e34 100644
--- a/tnslc/compile/tokenizer.tnsl
+++ b/tnslc/compile/tokenizer.tnsl
@@ -1,9 +1,13 @@
+bool HAD_ERROR = false
+
 struct Token {
     ~uint8 data,
     int
         _type,
         line,
-        col
+        col,
+    int
+        closing # only has meaning for delimiters
 }
 
 /; method Token
@@ -14,10 +18,48 @@ struct Token {
     /; eq_str(~uint8 str) [bool]
         return utils.strcmp(self.data, str)
     ;/
+
+    # Format the token as "{data, type, line, col}" and return it as a C
+    # string. The caller releases it (generate does so with _delete).
+    /; sprint [~uint8]
+        utils.Vector out
+        out.init(1)
+
+        ~uint8 tmp
+
+        out.push_char('{')
+
+        out.push_cstr(self.data)
+
+        out.push_char(',')
+        out.push_char(' ')
+
+        tmp = utils.int_to_str(self._type)
+        out.push_cstr(tmp)
+        _delete(tmp)
+
+        out.push_char(',')
+        out.push_char(' ')
+
+        tmp = utils.int_to_str(self.line)
+        out.push_cstr(tmp)
+        _delete(tmp)
+
+        out.push_char(',')
+        out.push_char(' ')
+
+        tmp = utils.int_to_str(self.col)
+        out.push_cstr(tmp)
+        _delete(tmp)
+
+        out.push_char('}')
+
+        return out.as_cstr()
+    ;/
 ;/
 
 /; _is_space(uint8 char) [bool]
-    /; if (char == '\t' || char == '\n' || char == '\r' || char == ' ')
+    /; if (char == '\t' || char == '\r' || char == ' ')
         return true
     ;/
     return false
@@ -56,20 +98,394 @@ struct Token {
 ~uint8 KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,bool,vect,void\0"
 ~uint8 LITERALS = "false,true\0"
 
-~uint8 RESERVED = "~`!@#$%^&*()[]{}+_=\"\'\\|;:/?.>,<\0"
+~uint8 RESERVED = "~`!@#$%^&*()[]{}+=\"\'\\|;:/?.>,<\0"
 ~uint8 OPS = "`~!%^&*-=+./><\0"
-~uint8 MULTI_OPS = "==,&&,||,^^,!==,!&&,!||,!^^,!<,!>,<<,>>,!&,!|,!^,++,--,>==,<==,len\0"
+~uint8 MULTI_OPS = "==,&&,||,^^,!=,!&&,!||,!^^,!<,!>,<<,>>,!&,!|,!^,++,--,>=,<=,len\0"
 
 ~uint8 DELIMS = "()[]{}\0"
 ~uint8 MULTI_DELIMS = ";:#\0"
 
+# Token type tags stored in Token._type
+int TT_DEFWORD = 0
+int TT_KEYWORD = 1
+int TT_KEYTYPE = 2
+int TT_LITERAL = 3
+int TT_AUGMENT = 4
+int TT_DELIMIT = 5
+int TT_SPLITTR = 6
+int TT_INVALID = 7
+
+# Return the partner of a single-character delimiter from DELIMS, e.g.
+# '(' <-> ')'. Characters that are not in DELIMS are returned unchanged.
+/; opposite_delim (uint8 c) [uint8]
+    /; loop (int i = 0; DELIMS{i} !== 0) [i++]
+        /; if (DELIMS{i} == c)
+            /; if (i % 2 == 1)
+                return DELIMS{i - 1}
+            ;; else
+                return DELIMS{i + 1}
+            ;/
+        ;/
+    ;/
+    return c
+;/
+
+# True when str is a delimiter: one character from DELIMS, or one of the
+# two-character forms built from MULTI_DELIMS and '/': ";;", "::", ";/",
+# ":/", "#/", "/;", "/:", "/#". "##" is rejected.
+/; is_delim (~uint8 str) [bool]
+    int l = utils.strlen(str)
+
+    /; if (l == 1 && _str_contains(DELIMS, str`) == true)
+        return true
+    ;; else if (l == 2)
+        /; if (_str_contains(MULTI_DELIMS, str{0}) == true)
+            return (str{0} == str{1} && str{0} !== '#') || str{1} == '/'
+        ;; else if (_str_contains(MULTI_DELIMS, str{1}) == true)
+            return str{0} == '/'
+        ;/
+    ;/
+    return false
+;/
+
+# Classify str as one of the TT_* tags. An empty string is TT_INVALID;
+# anything unrecognised (including a lone reserved character that is neither
+# an operator nor a splitter) falls through to TT_DEFWORD.
+/; token_type (~uint8 str) [int]
+    int l = utils.strlen(str)
+
+    /; if (l < 1)
+        return TT_INVALID
+    ;/
+
+    /; if (is_delim(str) == true)
+        return TT_DELIMIT
+    ;; else if (l == 1 && is_reserved(str{0}) == true)
+        /; if (_str_contains(OPS, str{0}) == true)
+            return TT_AUGMENT
+        ;; else if (str` == ',' || str` == ';' || str` == ':')
+            return TT_SPLITTR
+        ;/
+    ;; else if (_in_csv(MULTI_OPS, str) == true)
+        return TT_AUGMENT
+    ;; else if (_in_csv(KEYTYPES, str) == true)
+        return TT_KEYTYPE
+    ;; else if (_in_csv(KEYWORDS, str) == true)
+        return TT_KEYWORD
+    ;; else if (_in_csv(LITERALS, str) == true)
+        return TT_LITERAL
+    ;/
+
+    return TT_DEFWORD
+;/
+
+# True when char appears in RESERVED
+/; is_reserved (uint8 char) [bool]
+    return _str_contains(RESERVED, char)
+;/
+
+# Build the TT_SPLITTR token for a newline found at line`/col`, then move the
+# counters to column 1 of the next line. data is a 2-byte _alloc'd "\n".
+/; parse_nl_token (~int line, col) [Token]
+    Token out
+    out.line = line`
+    out.col = col`
+    out._type = TT_SPLITTR
+
+    out.data = _alloc(2)
+    out.data{0} = '\n'
+    out.data{1} = 0
+
+    col` = 1
+    line` = line` + 1
+
+    return out
+;/
+
+# Skip the rest of a comment: read until char` is '\n' or fin is at its end.
+# The newline itself is left in char` for the caller to handle.
+/; parse_comment (~utils.File fin, ~uint8 char)
+    /; loop (fin`.at_end == false && char` !== '\n')
+        char` = fin`.read()
+    ;/
+;/
+
+# Read a quoted literal whose opening quote is in char`. Both quotes are kept
+# in the token text; a backslash copies the following character verbatim so an
+# escaped quote does not end the literal. line`/col` follow embedded newlines.
+/; parse_string_token(~utils.File fin, ~uint8 char, ~int line, col) [Token]
+    utils.Vector str
+    str.init(1)
+    str.push_char(char`)
+
+    uint8 first = char`
+
+    Token out
+    out.line = line`
+    out.col = col`
+    out._type = TT_LITERAL
+
+    char` = fin`.read()
+    col`++
+    /; loop (char` !== first && fin`.at_end == false)
+        /; if (char` == '\\')
+            str.push_char(char`)
+            char` = fin`.read()
+            col`++
+            /; if (fin`.at_end == false)
+                /; if (char` == '\n')
+                    line`++
+                    col` = 0
+                ;/
+                str.push_char(char`)
+                char` = fin`.read()
+                col`++
+            ;/
+        ;; else
+            /; if (char` == '\n')
+                line`++
+                col` = 0
+            ;/
+            str.push_char(char`)
+            char` = fin`.read()
+            col`++
+        ;/
+    ;/
+
+    /; if (fin`.at_end == false)
+        # step past the closing quote, keeping col` in sync with char`
+        char` = fin`.read()
+        col`++
+    ;/
+
+    str.push_char(first)
+
+    out.data = str.as_cstr()
+
+    return out
+;/
+
+# True for characters that may continue a numeric literal: decimal digits,
+# hex digits a-f / A-F, and '.'.
+/; in_num_range (uint8 char) [bool]
+    bool dec = char !< '0' && char !> '9'
+
+    bool hex = char !< 'a' && char !> 'f'
+    bool HEX = char !< 'A' && char !> 'F'
+    hex = hex || HEX
+
+    return dec || hex || char == '.'
+;/
+
+# Read a numeric literal starting at char`. Stops at the first character
+# rejected by in_num_range, at a second '.', or at the end of fin.
+/; parse_numeric_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
+    Token out
+    out.line = line`
+    out.col = col`
+    out._type = TT_LITERAL
+
+    utils.Vector num
+    num.init(1)
+    num.push_char(char`)
 
-/; tokenize(utils.File fin) [utils.Vector]
+    char` = fin`.read()
+    col`++
+
+    bool dec = false, ok = true
+
+    /; loop (fin`.at_end == false && ok == true)
+        /; if (char` == '.' && dec == true)
+            ok = false
+        ;; else if (char` == '.')
+            dec = true
+        ;/
+
+        /; if (ok == true && in_num_range(char`) == true)
+            num.push_char(char`)
+            char` = fin`.read()
+            col`++
+        ;; else
+            ok = false
+        ;/
+    ;/
+
+    out.data = num.as_cstr()
+
+    return out
+;/
+
+# Read a word starting at char` up to (not including) the next newline,
+# space or reserved character, then classify it with token_type.
+/; parse_word_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
+    Token out
+    out.line = line`
+    out.col = col`
+
+    utils.Vector str
+    str.init(1)
+
+    bool ok = true
+
+    /; loop (fin`.at_end == false && ok == true)
+        str.push_char(char`)
+
+        char` = fin`.read()
+        col`++
+
+        /; if (char` == '\n' || _is_space(char`) == true || is_reserved(char`) == true)
+            ok = false
+        ;/
+    ;/
+
+    out.data = str.as_cstr()
+    out._type = token_type(out.data)
+    return out
+;/
+
+# Split a run of reserved characters into tokens and push them onto out`.
+# Characters accumulate while the text still classifies as something other
+# than TT_DEFWORD; quotes end the run so tokenize can start a string literal.
+/; parse_reserved_tokens (~utils.File fin, ~uint8 char, ~int line, col, ~utils.Vector out)
+    Token tmp
+    tmp.line = line`
+    tmp.col = col`
+
+    utils.Vector res
+    res.init(1)
+
+    bool ok = true
+
+    /; loop (fin`.at_end == false && ok == true)
+
+        res.push_char(char`)
+        int after = token_type(res.as_cstr())
+
+        /; if (after == TT_DEFWORD)
+            res.pop()
+            # A lone character with no type of its own (e.g. '|') leaves
+            # nothing behind after the pop; do not emit an empty token for it.
+            /; if (res.count > 0)
+                tmp.data = res.as_cstr()
+                tmp._type = token_type(tmp.data)
+                out`.push(~tmp)
+
+                res.init(1)
+            ;/
+            res.push_char(char`)
+            tmp.col = col`
+        ;/
+
+        char` = fin`.read()
+        col`++
+
+        /; if (is_reserved(char`) == false || char` == '\"' || char` == '\'')
+            ok = false
+        ;/
+    ;/
+
+    /; if (res.count > 0)
+        tmp.data = res.as_cstr()
+        tmp._type = token_type(tmp.data)
+        out`.push(~tmp)
+    ;; else
+        res.end()
+    ;/
+;/
+
+~uint8 RES_LOL = "Reserved %c\n\0"
+~uint8 PUSH = "Pushing token %s\n\0"
+
+# Open fin and split its contents into a Vector of Token. Newlines become
+# TT_SPLITTR tokens, '#' comments are dropped, and fin is closed again before
+# returning. Release the result with free_token_list.
+/; tokenize (~utils.File fin) [utils.Vector]
+    # create a tmp token
     Token tok
+    tok._type = TT_INVALID
+
+    utils.Vector out, delims, str
+
+    # init vectors
 
-    utils.Vector out
     out.init(len tok)
+    delims.init(8) # A stack of delimiters
+    str.init(1)
+
+    # open file for reading
+    fin`.open()
+
+    # main counters for line and col (int, to match the ~int parser parameters)
+    int line = 1, col = 1
+
+    # main loop
+    uint8 char = fin`.read()
+    /; loop (fin`.at_end == false)
+        /; if (_is_space(char) == true)
+            # skip spaces
+            char = fin`.read()
+            col++
+
+        ;; else if (char == '#')
+            parse_comment(fin, ~char)
+
+        ;; else if (char == '\"' || char == '\'')
+            # Generate string literals
+            tok = parse_string_token(fin, ~char, ~line, ~col)
+
+        ;; else if (char !< '0' && char !> '9')
+            # handle numeric literals
+            tok = parse_numeric_token(fin, ~char, ~line, ~col)
+
+        ;; else if (is_reserved(char) == true)
+            parse_reserved_tokens(fin, ~char, ~line, ~col, ~out)
+
+        ;; else if (char !== '\n')
+            # word tokens
+            tok = parse_word_token(fin, ~char, ~line, ~col)
+        ;/
+
+        /; if (tok._type !== TT_INVALID)
+            out.push(~tok)
+            tok._type = TT_INVALID
+        ;/
+
+        /; if (char == '\n')
+            tok = parse_nl_token(~line, ~col)
+            char = fin`.read()
+            out.push(~tok)
+            tok._type = TT_INVALID
+        ;/
+    ;/
+
+    # NOTE(review): nothing in this function pushes to str, so only the else branch runs
+    /; if (str.count > 0)
+        tok.data = str.as_cstr()
+        tok._type = token_type(tok.data)
+        out.push(~tok)
+    ;; else
+        str.end()
+    ;/
+
+    delims.end()
+
+    # done with file
+    fin`.close()
 
     return out
 ;/
 
+
+# Release the data string of every Token in vec`, then the vector itself.
+# NOTE(review): assumes vec` holds Token elements, as produced by tokenize.
+/; free_token_list (~utils.Vector vec)
+    ~Token t
+
+    /; loop (int i = 0; i < vec`.count) [i++]
+        t = vec`.get(i)
+        _delete(t`.data)
+    ;/
+
+    vec`.end()
+;/
+
diff --git a/tnslc/test.tnsl b/tnslc/test.tnsl
new file mode 100644
index 0000000..7a51951
--- /dev/null
+++ b/tnslc/test.tnsl
@@ -0,0 +1,3 @@
+/; main [int]
+;/
+
diff --git a/tnslc/tnslc.tnsl b/tnslc/tnslc.tnsl
index 84fbdb5..ce68133 100644
--- a/tnslc/tnslc.tnsl
+++ b/tnslc/tnslc.tnsl
@@ -35,7 +35,7 @@ usage:
         fout.init(DEFAULT_FOUT)
     ;/
 
-    compile.generate(fin, fout)
+    compile.generate(~fin, ~fout)
 
     fin.end()
     fout.end()
-- 
cgit v1.2.3