# From 5d688b4da97da2c2f684940147478f12d1f2baba Mon Sep 17 00:00:00 2001
# From: Kyle Gunger
# Date: Fri, 19 Jul 2024 03:21:39 -0400
# Subject: switch tokenization scheme
#
# tnslc/compile/tokenizer.tnsl -- removed by the commit above (455 deletions,
# blob 30fc8e8, rendered by cgit v1.2.3).  The contents of that blob follow,
# re-flowed one statement per line, documented, and with the fixes that are
# called out inline as "FIX".
#
# Purpose: character-level tokenizer.  `tokenize` reads a utils.File one
# character at a time and produces a utils.Vector of Token structs.

# Set to true by parse_reserved_tokens when it reports an unexpected
# reserved character.
bool HAD_ERROR = false

# One lexical token.
#   data    - null-terminated text of the token
#   _type   - one of the TT_* constants below
#   line    - line the token started on (tokenize starts counting at 1)
#   col     - column the token started on (tokenize starts counting at 1)
#   closing - NOTE(review): never assigned anywhere in this file
struct Token {
    ~uint8 data,
    int
        _type,
        line,
        col,
    int
        closing # only has meaning for delimiters
}

/; method Token
    # Compare this token's text against another token's text.
    # Returns the result of utils.strcmp (presumably true when the two
    # strings are equal -- confirm against utils).
    /; eq (Token tok) [bool]
        return utils.strcmp(self.data, tok.data)
    ;/

    # Compare this token's text against a null-terminated string.
    # Returns the result of utils.strcmp, as in `eq`.
    /; eq_str(~uint8 str) [bool]
        return utils.strcmp(self.data, str)
    ;/

    # Render the token as "{<data>, <_type>, <line>, <col>}".
    # The string comes from Vector.as_cstr; the vector is not ended here,
    # so presumably the caller is expected to _delete the result.
    /; sprint [~uint8]
        utils.Vector out
        out.init(1)

        ~uint8 tmp

        out.push_char('{')

        out.push_cstr(self.data)

        out.push_char(',')
        out.push_char(' ')

        # each int_to_str result is released as soon as it has been copied
        tmp = utils.int_to_str(self._type)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char(',')
        out.push_char(' ')

        tmp = utils.int_to_str(self.line)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char(',')
        out.push_char(' ')

        tmp = utils.int_to_str(self.col)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char('}')

        return out.as_cstr()
    ;/
;/

# True for tab, carriage return and space.  Newline is deliberately not
# included: tokenize turns '\n' into its own token.
/; _is_space(uint8 char) [bool]
    /; if (char == '\t' || char == '\r' || char == ' ')
        return true
    ;/
    return false
;/

# True when `str` is exactly equal to one of the comma separated entries of
# `csv`.  Both arguments are null-terminated.
#
# `along` is the number of characters of the current entry matched so far,
# or -1 once the current entry is known not to match.
/; _in_csv (~uint8 csv, ~uint8 str) [bool]
    int along = 0

    /; loop (csv` !== 0) [csv++]
        /; if (csv` == ',')
            # end of an entry: it matches only if str ended here as well
            /; if (along !< 0 && str{along} == 0)
                return true
            ;/
            along = 0
        ;; else if (along !< 0 && str{along} == csv`)
            along++
        ;; else
            # mismatch: park `along` at -1 until the next comma
            along = 0
            along--
        ;/
    ;/

    # the last entry has no trailing comma, so check it here
    return along !< 0 && str{along} == 0
;/

# True when the null-terminated string `str` contains the character `ch`.
/; _str_contains (~uint8 str, uint8 ch) [bool]
    /; loop (str` !== 0) [str++]
        /; if (str` == ch)
            return true
        ;/
    ;/
    return false
;/

# Comma separated word lists searched with _in_csv.
~uint8 KEYWORDS = "module,export,asm,if,else,loop,label,goto,continue,break,return,import,as,using,struct,method,interface,enum,implements,operator,is\0"
~uint8 KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,bool,vect,void\0"
~uint8 LITERALS = "false,true\0"

# Characters that end a word token (searched with _str_contains).
~uint8 RESERVED = "~`!@#$%^&*()[]{}-+=\"\'\\|;:/?.>,<\0"

# Single character operators, and multi character operators (csv).
~uint8 OPS = "`~!%^&|*-=+./><\0"
~uint8 MULTI_OPS = "==,&&,||,^^,!==,!&&,!||,!^^,!<,!>,<<,>>,!&,!|,!^,++,--,>==,<==,len,!=\0"

# Single character delimiters, stored as open/close pairs (see
# opposite_delim), and the characters that can pair with '/' or with
# themselves to form two character delimiters (see is_delim).
~uint8 DELIMS = "()[]{}\0"
~uint8 MULTI_DELIMS = ";:#\0"

# Token type tags stored in Token._type.
int TT_DEFWORD = 0
int TT_KEYWORD = 1
int TT_KEYTYPE = 2
int TT_LITERAL = 3
int TT_AUGMENT = 4
int TT_DELIMIT = 5
int TT_SPLITTR = 6
int TT_INVALID = 7

# Return the partner of a single character delimiter: DELIMS holds the
# opener at even indices and its closer at the following odd index.
# Characters that are not in DELIMS are returned unchanged.
/; opposite_delim (uint8 c) [uint8]
    /; loop (int i = 0; DELIMS{i} !== 0) [i++]
        /; if (DELIMS{i} == c)
            /; if (i % 2 == 1)
                return DELIMS{i - 1}
            ;; else
                return DELIMS{i + 1}
            ;/
        ;/
    ;/
    return c
;/

# True when `str` is a delimiter:
#   - one character from DELIMS, or
#   - two characters: ";;" or "::", one of ; : # followed by '/',
#     or '/' followed by one of ; : #
/; is_delim (~uint8 str) [bool]
    int l = utils.strlen(str)

    /; if (l == 1 && _str_contains(DELIMS, str`) == true)
        return true
    ;; else if (l == 2)
        /; if (_str_contains(MULTI_DELIMS, str{0}) == true)
            # doubled ';' or ':' (but not "##"), or a closing form such as ";/"
            return (str{0} == str{1} && str{0} !== '#') || str{1} == '/'
        ;; else if (_str_contains(MULTI_DELIMS, str{1}) == true)
            # opening form such as "/;"
            return str{0} == '/'
        ;/
    ;/
    return false
;/

# Classify a null-terminated token string as one of the TT_* constants.
# An empty string is TT_INVALID.  Anything that matches none of the tables
# is TT_DEFWORD; that includes a single reserved character which is neither
# an operator nor one of , ; : (parse_reserved_tokens relies on this to
# detect the end of an operator/delimiter run).
/; token_type (~uint8 str) [int]
    int l = utils.strlen(str)

    /; if (l < 1)
        return TT_INVALID
    ;/

    /; if (is_delim(str) == true)
        return TT_DELIMIT
    ;; else if (l == 1 && is_reserved(str{0}) == true)
        /; if (_str_contains(OPS, str{0}) == true)
            return TT_AUGMENT
        ;; else if (str` == ',' || str` == ';' || str` == ':')
            return TT_SPLITTR
        ;/
    ;; else if (_in_csv(MULTI_OPS, str) == true)
        return TT_AUGMENT
    ;; else if (_in_csv(KEYTYPES, str) == true)
        return TT_KEYTYPE
    ;; else if (_in_csv(KEYWORDS, str) == true)
        return TT_KEYWORD
    ;; else if (_in_csv(LITERALS, str) == true)
        return TT_LITERAL
    ;/

    return TT_DEFWORD
;/

# True when `char` is one of the RESERVED characters.
/; is_reserved (uint8 char) [bool]
    return _str_contains(RESERVED, char)
;/

# Build the TT_SPLITTR token that represents a newline, positioned at the
# current line/col, then advance the counters to column 1 of the next line.
# The caller is responsible for reading the next character.
/; parse_nl_token (~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`
    out._type = TT_SPLITTR

    out.data = _alloc(2)
    out.data{0} = '\n'
    out.data{1} = 0

    col` = 1
    line` = line` + 1

    return out
;/

# Skip the rest of the current line.  On return `char` holds the '\n' that
# ended the comment (or the last character read if the file ended first).
# The column counter is not touched; the newline handling resets it.
/; parse_comment (~utils.File fin, ~uint8 char)
    /; loop (fin`.at_end == false && char` !== '\n')
        char` = fin`.read()
    ;/
;/

# Read a quoted literal.  On entry `char` is the opening quote (either
# kind); the token text runs up to the matching unescaped quote and
# includes both quotes.  A backslash makes the following character part of
# the literal unconditionally.  Newlines inside the literal advance `line`.
# If the file ends before the closing quote, the text read so far is
# returned with a closing quote appended and no error is reported.
/; parse_string_token(~utils.File fin, ~uint8 char, ~int line, col) [Token]
    utils.Vector str
    str.init(1)
    str.push_char(char`)

    uint8 first = char`

    Token out
    out.line = line`
    out.col = col`
    out._type = TT_LITERAL

    char` = fin`.read()
    col`++
    /; loop (char` !== first && fin`.at_end == false)
        /; if (char` == '\\')
            # keep the backslash, then take the escaped character verbatim
            str.push_char(char`)
            char` = fin`.read()
            col`++
            /; if (fin`.at_end == false)
                /; if (char` == '\n')
                    line`++
                    col` = 0
                ;/
                str.push_char(char`)
                char` = fin`.read()
                col`++
            ;/
        ;; else
            /; if (char` == '\n')
                line`++
                # 0, not 1: the increment after the read below brings it to 1
                col` = 0
            ;/
            str.push_char(char`)
            char` = fin`.read()
            col`++
        ;/
    ;/

    /; if (fin`.at_end == false)
        # step past the closing quote
        char` = fin`.read()
        # FIX: this was the only read in the file that did not advance the
        # column, which left every later token on the line one column short.
        col`++
    ;/

    str.push_char(first)

    out.data = str.as_cstr()

    return out
;/

# True for characters that may continue a numeric literal: decimal digits,
# hex digits in either case, and '.'.
/; in_num_range (uint8 char) [bool]
    bool dec = char !< '0' && char !> '9'

    bool hex = char !< 'a' && char !> 'f'
    bool HEX = char !< 'A' && char !> 'F'
    hex = hex || HEX

    return dec || hex || char == '.'
;/

# Read a numeric literal.  On entry `char` is its first digit.  Characters
# are consumed while in_num_range accepts them; a second '.' ends the
# literal.  On return `char` holds the first character after the literal.
/; parse_numeric_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`
    out._type = TT_LITERAL

    utils.Vector num
    num.init(1)
    num.push_char(char`)

    char` = fin`.read()
    col`++

    bool dec = false, ok = true

    /; loop (fin`.at_end == false && ok == true)
        /; if (char` == '.' && dec == true)
            # at most one decimal point per literal
            ok = false
        ;; else if (char` == '.')
            dec = true
        ;/

        /; if (ok == true && in_num_range(char`) == true)
            num.push_char(char`)
            char` = fin`.read()
            col`++
        ;; else
            ok = false
        ;/
    ;/

    out.data = num.as_cstr()

    return out
;/

# Read a word (identifier, keyword, key type, ...).  On entry `char` is its
# first character.  The word ends at a newline, a space character or a
# reserved character; `char` holds that terminator on return.  The token
# type is whatever token_type makes of the collected text.
/; parse_word_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`

    utils.Vector str
    str.init(1)

    bool ok = true

    /; loop (fin`.at_end == false && ok == true)
        str.push_char(char`)

        char` = fin`.read()
        col`++

        /; if (char` == '\n' || _is_space(char`) == true || is_reserved(char`) == true)
            ok = false
        ;/
    ;/

    out.data = str.as_cstr()
    out._type = token_type(out.data)
    return out
;/

~uint8 ERROR_RESERVED = "unexpected reserved token in file\0"

# Split a run of reserved characters into tokens and push them onto `out`.
#
# On entry `char` is the first reserved character of the run.  The run ends
# at the first character that is not reserved or that is a quote; `char`
# holds that character on return.
#
# The split is greedy: `res` grows one character at a time for as long as
# token_type still recognises it.  As soon as the grown text classifies as
# TT_DEFWORD the last character is taken back off, the remaining prefix is
# pushed as a token, and a new token is started from that last character.
# If the text is a single character that is already TT_DEFWORD there is no
# prefix to fall back on: the character is pushed as a token, HAD_ERROR is
# set and report_error is called.
#
# NOTE(review): because ">=" and "<=" are not in MULTI_OPS, the scan
# appears to cut ">==" and "<==" into two tokens (">" / "==") before the
# three character form can be seen -- confirm whether that is intended.
/; parse_reserved_tokens (~utils.File fin, ~uint8 char, ~int line, col, ~utils.Vector out)
    Token tmp
    tmp.line = line`
    tmp.col = col`

    utils.Vector res
    res.init(1)

    bool ok = true

    /; loop (fin`.at_end == false && ok == true)

        res.push_char(char`)
        int after = token_type(res.as_cstr())

        /; if (after == TT_DEFWORD)
            bool res_unexpected = true
            /; if (res.count > 1)
                # take the offending character back off; the prefix is valid
                res.pop()
                res_unexpected = false
            ;/

            tmp.data = res.as_cstr()
            tmp._type = token_type(tmp.data)

            /; if (res_unexpected == true)
                HAD_ERROR = true
                report_error(fin`, tmp, ERROR_RESERVED)
            ;/

            out`.push(~tmp)

            # tmp.data now belongs to the pushed token: start a new buffer
            res.init(1)
            /; if (res_unexpected == false)
                # the popped character starts the next token
                res.push_char(char`)
                tmp.col = col`
            ;; else
                # FIX: the current character has just been emitted as the
                # erroneous token.  It used to be pushed into `res` again
                # here, which made the same token appear twice in `out`.
                # The next token, if any, starts one column further on.
                tmp.col = col` + 1
            ;/
        ;/

        char` = fin`.read()
        col`++

        /; if (is_reserved(char`) == false || char` == '\"' || char` == '\'')
            ok = false
        ;/
    ;/

    /; if (res.count > 0)
        # flush whatever is left of the run
        tmp.data = res.as_cstr()
        tmp._type = token_type(tmp.data)
        out`.push(~tmp)
    ;; else
        # nothing pending: release the unused buffer
        res.end()
    ;/
;/

# Format strings; not referenced by any code in this file.
~uint8 RES_LOL = "Reserved %c\n\0"
~uint8 PUSH = "Pushing token %s\n\0"

# Tokenize a whole file.
#
# Opens `fin`, reads it character by character, and returns a vector of
# Token structs (element size `len tok`).  Every '\n' produces its own
# TT_SPLITTR token.  '#' starts a comment that runs to the end of the line
# and produces no token.  The file is closed before returning.  Token data
# can be released with free_token_list.
/; tokenize (~utils.File fin) [utils.Vector]
    # create a tmp token
    Token tok
    tok._type = TT_INVALID

    # FIX: an unused `delims` vector was initialised and ended here; removed.
    utils.Vector out

    # init vector
    out.init(len tok)

    # open file for reading
    fin`.open()

    # main counters for line and col
    # FIX: declared `int` (was `uint`) to match the `~int line, col`
    # parameters of the parse_* helpers and the int fields of Token.
    int line = 1, col = 1

    # main loop
    uint8 char = fin`.read()
    /; loop (fin`.at_end == false)
        /; if (_is_space(char) == true)
            # skip spaces
            char = fin`.read()
            col++

        ;; else if (char == '#')
            parse_comment(fin, ~char)

        ;; else if (char == '\"' || char == '\'')
            # Generate string literals
            tok = parse_string_token(fin, ~char, ~line, ~col)

        ;; else if (char !< '0' && char !> '9')
            # handle numeric literals
            tok = parse_numeric_token(fin, ~char, ~line, ~col)

        ;; else if (is_reserved(char) == true)
            # pushes its tokens onto `out` itself; `tok` stays TT_INVALID
            parse_reserved_tokens(fin, ~char, ~line, ~col, ~out)

        ;; else if (char !== '\n')
            # word tokens
            tok = parse_word_token(fin, ~char, ~line, ~col)
        ;/

        # push the token produced above, if any
        /; if (tok._type !== TT_INVALID)
            out.push(~tok)
            tok._type = TT_INVALID
        ;/

        # every branch above stops on (and does not consume) a newline
        /; if (char == '\n')
            tok = parse_nl_token(~line, ~col)
            char = fin`.read()
            out.push(~tok)
            tok._type = TT_INVALID
        ;/
    ;/

    # done with file
    fin`.close()

    return out
;/

# Release a token vector produced by tokenize: _delete the data string of
# every token, then end the vector itself.
/; free_token_list (~utils.Vector vec)
    ~Token t

    /; loop (int i = 0; i < vec`.count) [i++]
        t = vec`.get(i)
        _delete(t`.data)
    ;/

    vec`.end()
;/