diff options
Diffstat (limited to 'tnslc/compile/tokenizer.tnsl')
-rw-r--r-- | tnslc/compile/tokenizer.tnsl | 388 |
1 files changed, 382 insertions, 6 deletions
# tnslc/compile/tokenizer.tnsl
#
# Tokenizer for tnslc, reconstructed from the post-change side of the diff
# (index 722a5a0..e528e34). Regions the diff did not include are marked
# with "[not shown]" comments and must be taken from the real file.

# Global error flag. Nothing in the code visible here reads or writes it.
bool HAD_ERROR = false

# A single lexical token.
#   data    - NUL-terminated token text (compared with utils.strcmp in eq_str)
#   _type   - one of the TT_* constants below
#   line    - line on which the token starts
#   col     - column at which the token starts
#   closing - see inline note; no function visible here assigns it
struct Token {
    ~uint8 data,
    int
        _type,
        line,
        col,
    int
        closing # only has meaning for delimiters
}

/; method Token
    # [not shown] new-file lines 14-17 fall between two diff hunks.

    # True when the token text equals the C string `str`.
    /; eq_str(~uint8 str) [bool]
        return utils.strcmp(self.data, str)
    ;/

    # Render the token as "{data, type, line, col}" for debugging.
    # Returns the buffer built by utils.Vector.as_cstr().
    # NOTE(review): presumably the caller owns and must _delete the result - confirm.
    /; sprint [~uint8]
        utils.Vector out
        out.init(1)

        ~uint8 tmp

        out.push_char('{')

        out.push_cstr(self.data)

        out.push_char(',')
        out.push_char(' ')

        # int_to_str results are released here once copied into `out`
        tmp = utils.int_to_str(self._type)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char(',')
        out.push_char(' ')

        tmp = utils.int_to_str(self.line)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char(',')
        out.push_char(' ')

        tmp = utils.int_to_str(self.col)
        out.push_cstr(tmp)
        _delete(tmp)

        out.push_char('}')

        return out.as_cstr()
    ;/
;/

# True for horizontal whitespace (tab, carriage return, space).
# '\n' is deliberately NOT whitespace here: tokenize() turns every newline
# into its own TT_SPLITTR token.
/; _is_space(uint8 char) [bool]
    /; if (char == '\t' || char == '\r' || char == ' ')
        return true
    ;/
    return false
# [not shown] new-file lines 64-95 fall between two diff hunks: the closing
# ";/" of _is_space and the definitions that follow it (the code below uses
# _str_contains, _in_csv and KEYWORDS, none of which are defined in the
# visible text).

~uint8 KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,bool,vect,void\0"
~uint8 LITERALS = "false,true\0"

# Characters that always end a word token. '-' is included because OPS and
# MULTI_OPS list '-' and '--', and token_type() only classifies a single
# character as an operator when it is reserved; without it "a-b" would lex
# as one word.
~uint8 RESERVED = "~`!@#$%^&*()[]{}-+=\"\'\\|;:/?.>,<\0"

# Single-character operators.
# NOTE(review): '|' is absent although MULTI_OPS lists "||" and "!|";
# a lone '|' is therefore typed TT_DEFWORD - confirm whether that is intended.
~uint8 OPS = "`~!%^&*-=+./><\0"
# Multi-character operators, comma separated.
~uint8 MULTI_OPS = "==,&&,||,^^,!=,!&&,!||,!^^,!<,!>,<<,>>,!&,!|,!^,++,--,>=,<=,len\0"

# Single-character delimiters, stored as open/close pairs.
~uint8 DELIMS = "()[]{}\0"
# Characters that form two-character delimiters (see is_delim).
~uint8 MULTI_DELIMS = ";:#\0"

# Token type codes stored in Token._type.
int TT_DEFWORD = 0
int TT_KEYWORD = 1
int TT_KEYTYPE = 2
int TT_LITERAL = 3
int TT_AUGMENT = 4
int TT_DELIMIT = 5
int TT_SPLITTR = 6
int TT_INVALID = 7

# Return the partner of a single-character delimiter from DELIMS
# ('(' <-> ')', '[' <-> ']', '{' <-> '}'). Any other character is
# returned unchanged.
/; opposite_delim (uint8 c) [uint8]
    /; loop (int i = 0; DELIMS{i} !== 0) [i++]
        /; if (DELIMS{i} == c)
            # DELIMS is laid out in pairs: even index opens, odd index closes
            /; if (i % 2 == 1)
                return DELIMS{i - 1}
            ;; else
                return DELIMS{i + 1}
            ;/
        ;/
    ;/
    return c
;/

# True when `str` is a delimiter:
#   - one character contained in DELIMS, or
#   - two characters where
#       * the first is in MULTI_DELIMS and either both are equal (but not
#         "##") or the second is '/', or
#       * the first is '/' and the second is in MULTI_DELIMS.
/; is_delim (~uint8 str) [bool]
    int l = utils.strlen(str)

    /; if (l == 1 && _str_contains(DELIMS, str`) == true)
        return true
    ;; else if (l == 2)
        /; if (_str_contains(MULTI_DELIMS, str{0}) == true)
            return (str{0} == str{1} && str{0} !== '#') || str{1} == '/'
        ;; else if (_str_contains(MULTI_DELIMS, str{1}) == true)
            return str{0} == '/'
        ;/
    ;/
    return false
;/

# Classify token text and return one of the TT_* codes.
# The empty string is TT_INVALID. Anything that matches no table is
# TT_DEFWORD; this includes a single reserved character that is neither a
# delimiter, an operator nor one of ',' ';' ':' (that branch falls through).
/; token_type (~uint8 str) [int]
    int l = utils.strlen(str)

    /; if (l < 1)
        return TT_INVALID
    ;/

    /; if (is_delim(str) == true)
        return TT_DELIMIT
    ;; else if (l == 1 && is_reserved(str{0}) == true)
        /; if (_str_contains(OPS, str{0}) == true)
            return TT_AUGMENT
        ;; else if (str` == ',' || str` == ';' || str` == ':')
            return TT_SPLITTR
        ;/
    ;; else if (_in_csv(MULTI_OPS, str) == true)
        return TT_AUGMENT
    ;; else if (_in_csv(KEYTYPES, str) == true)
        return TT_KEYTYPE
    ;; else if (_in_csv(KEYWORDS, str) == true)
        return TT_KEYWORD
    ;; else if (_in_csv(LITERALS, str) == true)
        return TT_LITERAL
    ;/

    return TT_DEFWORD
;/

# True when `char` appears in RESERVED.
/; is_reserved (uint8 char) [bool]
    return _str_contains(RESERVED, char)
;/

# Build the TT_SPLITTR token for a newline located at (line`, col`), then
# move the counters to the start of the next line (col = 1, line + 1).
# The token text is a freshly allocated "\n". The newline character itself
# is not consumed here; the caller reads past it.
/; parse_nl_token (~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`
    out._type = TT_SPLITTR

    out.data = _alloc(2)
    out.data{0} = '\n'
    out.data{1} = 0

    col` = 1
    line` = line` + 1

    return out
;/

# Skip a line comment: read until `char` is '\n' or the file ends.
# The newline is left in `char` for the caller to handle.
/; parse_comment (~utils.File fin, ~uint8 char)
    /; loop (fin`.at_end == false && char` !== '\n')
        char` = fin`.read()
    ;/
;/

# Read a quoted literal. On entry `char` holds the opening quote (either
# '"' or '\''); the literal runs to the next unescaped occurrence of the
# same quote character or to end of file. The returned text includes both
# quotes (the closing one is appended even if the file ended first).
# A backslash and the character after it are copied verbatim. Newlines
# inside the literal advance `line` and reset `col`.
# On return `char` holds the first character after the closing quote.
/; parse_string_token(~utils.File fin, ~uint8 char, ~int line, col) [Token]
    utils.Vector str
    str.init(1)
    str.push_char(char`)

    uint8 first = char`

    Token out
    out.line = line`
    out.col = col`
    out._type = TT_LITERAL

    char` = fin`.read()
    col`++
    /; loop (char` !== first && fin`.at_end == false)
        # Keep the backslash itself, then fall through so the escaped
        # character is copied without being compared against the quote.
        /; if (char` == '\\')
            str.push_char(char`)
            char` = fin`.read()
            col`++
        ;/

        # at_end can only have become true here via the escape read above
        /; if (fin`.at_end == false)
            /; if (char` == '\n')
                line`++
                col` = 0
            ;/
            str.push_char(char`)
            char` = fin`.read()
            col`++
        ;/
    ;/

    # Step past the closing quote. `col` must advance with every read,
    # otherwise every later token on this line reports a column one too small.
    /; if (fin`.at_end == false)
        char` = fin`.read()
        col`++
    ;/

    str.push_char(first)

    out.data = str.as_cstr()

    return out
;/

# True for characters that may continue a numeric literal:
# '0'-'9', 'a'-'f', 'A'-'F' and '.'.
/; in_num_range (uint8 char) [bool]
    bool dec = char !< '0' && char !> '9'

    bool hex = char !< 'a' && char !> 'f'
    bool HEX = char !< 'A' && char !> 'F'
    hex = hex || HEX

    return dec || hex || char == '.'
;/

# Read a numeric literal. On entry `char` holds its first character
# (tokenize() only calls this for '0'-'9'). Characters accepted by
# in_num_range are consumed; a second '.' ends the literal.
# On return `char` holds the first character that is not part of it.
/; parse_numeric_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`
    out._type = TT_LITERAL

    utils.Vector num
    num.init(1)
    num.push_char(char`)

    char` = fin`.read()
    col`++

    bool dec = false, ok = true

    /; loop (fin`.at_end == false && ok == true)
        # only one decimal point is allowed per literal
        /; if (char` == '.' && dec == true)
            ok = false
        ;; else if (char` == '.')
            dec = true
        ;/

        /; if (ok == true && in_num_range(char`) == true)
            num.push_char(char`)
            char` = fin`.read()
            col`++
        ;; else
            ok = false
        ;/
    ;/

    out.data = num.as_cstr()

    return out
;/

# Read a word (identifier, keyword, key type, ...). On entry `char` holds
# its first character. The word ends at a newline, a space character or a
# reserved character, which is left in `char`. The token type comes from
# token_type().
/; parse_word_token (~utils.File fin, ~uint8 char, ~int line, col) [Token]
    Token out
    out.line = line`
    out.col = col`

    utils.Vector str
    str.init(1)

    bool ok = true

    /; loop (fin`.at_end == false && ok == true)
        str.push_char(char`)

        char` = fin`.read()
        col`++

        /; if (char` == '\n' || _is_space(char`) == true || is_reserved(char`) == true)
            ok = false
        ;/
    ;/

    out.data = str.as_cstr()
    out._type = token_type(out.data)
    return out
;/

# Split a run of reserved characters into tokens and push them onto `out`.
# The text collected so far is extended one character at a time for as long
# as token_type() still recognises it; when the extended text is no longer
# recognised (TT_DEFWORD), the text collected before that character is
# pushed as a token and a new one starts with it. The run ends at the first
# non-reserved character or quote, which is left in `char`.
/; parse_reserved_tokens (~utils.File fin, ~uint8 char, ~int line, col, ~utils.Vector out)
    Token tmp
    tmp.line = line`
    tmp.col = col`

    utils.Vector res
    res.init(1)

    bool ok = true

    /; loop (fin`.at_end == false && ok == true)

        res.push_char(char`)
        int after = token_type(res.as_cstr())

        /; if (after == TT_DEFWORD)
            res.pop()

            # Only flush when something was collected before this character.
            # A lone reserved character that token_type() types as
            # TT_DEFWORD (e.g. the first '|' of "||") would otherwise emit
            # an empty-string TT_INVALID token.
            /; if (res.count > 0)
                tmp.data = res.as_cstr()
                tmp._type = token_type(tmp.data)
                out`.push(~tmp)

                # the old buffer now belongs to the pushed token
                res.init(1)
            ;/

            res.push_char(char`)
            tmp.col = col`
        ;/

        char` = fin`.read()
        col`++

        /; if (is_reserved(char`) == false || char` == '\"' || char` == '\'')
            ok = false
        ;/
    ;/

    /; if (res.count > 0)
        tmp.data = res.as_cstr()
        tmp._type = token_type(tmp.data)
        out`.push(~tmp)
    ;; else
        res.end()
    ;/
;/

# Format strings; nothing in the code visible here references them.
~uint8 RES_LOL = "Reserved %c\n\0"
~uint8 PUSH = "Pushing token %s\n\0"

# Tokenize a whole file. Opens `fin`, reads it to the end, closes it and
# returns a vector of Token. Every newline becomes a TT_SPLITTR token and
# '#' starts a comment that runs to the end of the line.
# Token text is allocated per token; free_token_list() releases it.
/; tokenize (~utils.File fin) [utils.Vector]
    # create a tmp token
    Token tok
    tok._type = TT_INVALID

    utils.Vector out, delims, str

    # init vectors
    out.init(len tok)
    delims.init(8) # A stack of delimiters (not used yet in this function)
    str.init(1)    # never pushed to in this function

    # open file for reading
    fin`.open()

    # main counters for line and col; int to match Token.line / Token.col
    # and the ~int parameters of the parse_* helpers
    int line = 1, col = 1

    # main loop: `char` always holds the next unconsumed character and
    # `col` is its column
    uint8 char = fin`.read()
    /; loop (fin`.at_end == false)
        /; if (_is_space(char) == true)
            # skip spaces
            char = fin`.read()
            col++

        ;; else if (char == '#')
            parse_comment(fin, ~char)

        ;; else if (char == '\"' || char == '\'')
            # Generate string literals
            tok = parse_string_token(fin, ~char, ~line, ~col)

        ;; else if (char !< '0' && char !> '9')
            # handle numeric literals
            tok = parse_numeric_token(fin, ~char, ~line, ~col)

        ;; else if (is_reserved(char) == true)
            # pushes its tokens onto `out` directly
            parse_reserved_tokens(fin, ~char, ~line, ~col, ~out)

        ;; else if (char !== '\n')
            # word tokens
            tok = parse_word_token(fin, ~char, ~line, ~col)
        ;/

        # TT_INVALID doubles as the "no token produced" marker
        /; if (tok._type !== TT_INVALID)
            out.push(~tok)
            tok._type = TT_INVALID
        ;/

        # every branch above stops on '\n' without consuming it
        /; if (char == '\n')
            tok = parse_nl_token(~line, ~col)
            char = fin`.read()
            out.push(~tok)
            tok._type = TT_INVALID
        ;/
    ;/

    /; if (str.count > 0)
        tok.data = str.as_cstr()
        tok._type = token_type(tok.data)
        out.push(~tok)
    ;; else
        str.end()
    ;/

    delims.end()

    # done with file
    fin`.close()

    return out
;/

# Release a token list produced by tokenize(): delete the text of every
# token, then end the vector itself.
/; free_token_list (~utils.Vector vec)
    ~Token t

    /; loop (int i = 0; i < vec`.count) [i++]
        t = vec`.get(i)
        _delete(t`.data)
    ;/

    vec`.end()
;/