From f31ea957ae8be6a03f19474363d6e00b68de0532 Mon Sep 17 00:00:00 2001
From: Kyle Gunger
Date: Fri, 4 Aug 2023 02:51:35 -0400
Subject: Slightly better tokenization for tnsl_wrapped

---
 tnslc/tokenizer.tnsl | 188 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 102 insertions(+), 86 deletions(-)

diff --git a/tnslc/tokenizer.tnsl b/tnslc/tokenizer.tnsl
index 3de0182..a7017e3 100644
--- a/tnslc/tokenizer.tnsl
+++ b/tnslc/tokenizer.tnsl
@@ -80,16 +80,16 @@ struct Token {
     ;/
 ;/
 
-{}uint8 tkn_ok = "OK \0", tkn_no = "NO \0", tkn_nl = "\n\0"
+{}uint8 tkn_st = "{ \0", tkn_nd = " }\n\0", tkn_sp = " \0"
 
 /; print_token (Token tok, ~void file_out)
-    /; if (in_csv(~CSV_KEYWORDS, tok.data) == true)
-        write_to_file(file_out, ~tkn_ok{0})
-    ;; else
-        write_to_file(file_out, ~tkn_no{0})
-    ;/
+    write_to_file(file_out, ~tkn_st{0})
+
+    write_to_file(file_out, print_tok_type(tok))
+    write_to_file(file_out, ~tkn_sp{0})
     write_to_file(file_out, tok.data)
-    write_to_file(file_out, ~tkn_nl{0})
+
+    write_to_file(file_out, ~tkn_nd{0})
 ;/
 
 # Returns true if the character is whitespace
@@ -114,10 +114,31 @@ struct Token {
     return in_csv(~CSV_AUGMENTS, tok.data)
 ;/
 
-# Returns true if the token is a valid literal value
-/; tok_literal (Token tok) [bool]
-    # TODO: implement literals
-    return false
+# True if the token is a valid number (integer or float)
+/; is_numeric_literal(Token tok) [bool]
+    /; if (tok._len() < 1)
+        return false
+    ;; else if (tok.data{0} < '0' || tok.data{0} > '9')
+        return false
+    ;/
+
+    bool non_dec = false
+    /; if (tok._len() > 1 && tok.data{0} == '0')
+        non_dec = tok.data{1} > '9'
+        # TODO: non_dec not impl
+    ;/
+
+    bool dec_seen = false
+
+    /; loop (int i = 0; i < tok._len()) [i++]
+        /; if (dec_seen == false && tok.data{i} == '.')
+            dec_seen = true
+        ;; else if (tok.data{i} < '0' || tok.data{i} > '9')
+            return false
+        ;/
+    ;/
+
+    return true
 ;/
 
 /; get_tok_type(Token tok) [uint]
@@ -138,7 +159,7 @@ struct Token {
         return TOKEN_TYPE.KEYWORD
     ;; else if (in_csv(~CSV_KEYTYPES, tok.data) == true)
         return TOKEN_TYPE.KEYTYPE
-    ;; else if (tok_literal(tok) == true)
+    ;; else if (is_numeric_literal(tok) == true)
         return TOKEN_TYPE.LITERAL
     ;/
 
@@ -147,58 +168,107 @@ struct Token {
 
 /; break_token(~Token tok, uint8 c) [bool]
-    # return true
     uint type_before = get_tok_type(tok`)
     tok`.append(c)
     uint type_after = get_tok_type(tok`)
     tok`.pop()
-    bool a = true
-    return a
+
+    bool a = is_whitespace(c) && type_after !== TOKEN_TYPE.LITERAL
+    bool b = is_reserved(c) && type_before == TOKEN_TYPE.DEFWORD
+    bool c = type_before !== TOKEN_TYPE.DEFWORD && type_after == TOKEN_TYPE.DEFWORD
+
+    return a || b || c
 ;/
 
+
 /; tokenize_file (~void file_in, file_out)
     Token tmp
     tmp.start()
+    tmp.line = 1
+    tmp.column = 1
 
     uint8 buf = 0
     int read_count = 0
+    int line = 1
+    int column = 1
 
     # Start reading at beginning of file
     _read_byte(file_in, ~buf, ~read_count)
 
     # Read loop.
-    /; loop (_read_byte(file_in, ~buf, ~read_count))
+    /; loop (_read_byte(file_in, ~buf, ~read_count)) [column++]
         /; if (read_count == 0)
             break
         ;/
-
         /; if (buf == '#')
+
+            # Handle comment
             /; loop (_read_byte(file_in, ~buf, ~read_count))
                 /; if (buf == '\n' || read_count == 0)
                     break
                 ;/
             ;/
+        ;; else if (buf == '\'' || buf == '"')
+
+            # Handle char/string literal
+            uint8 first = buf
+            tmp._type = TOKEN_TYPE.LITERAL
+            tmp.append(buf)
+            /; loop (_read_byte(file_in, ~buf, ~read_count))
+                /; if (buf == '\\')
+                    tmp.append(buf)
+                    read_count = 0
+                    _read_byte(file_in, ~buf, ~read_count)
+                    column++
+                    tmp.append(buf)
+                ;; else if (buf == first)
+                    tmp.append(buf)
+                    break
+                ;; else
+                    tmp.append(buf)
+                ;/
+
+                /; if (buf == '\n')
+                    line++
+                    column = 1
+                ;; else
+                    column++
+                ;/
+
+                read_count = 0
+            ;/
+            print_token(tmp, file_out)
+            tmp._del()
+            tmp.start()
+
         ;; else if (break_token(~tmp, buf) == true)
+
+            # Handle token break
             /; if (tmp._len() > 0)
+                tmp._type = get_tok_type(tmp)
                 print_token(tmp, file_out)
             ;/
             tmp._del()
             tmp.start()
+            tmp.line = line
+            tmp.column = column
             /; if (is_whitespace(buf) == false)
                 tmp.append(buf)
-            ;; else if (buf == WHITESPACE{2})
-                tmp.append(WHITESPACE{2})
-                print_token(tmp, file_out)
-                tmp._del()
-                tmp.start()
             ;/
-        ;; else
+        ;; else if (is_whitespace(buf) == false)
+
+            # Add non-whitespace
            tmp.append(buf)
         ;/
 
+        /; if (buf == '\n')
+            line++
+            column = 0
+        ;/
+
         read_count = 0
     ;/
 
@@ -209,15 +279,15 @@ struct Token {
     tmp._del()
 ;/
 
-{}uint8 w_SEP = "SEPARATOR\n\0"
-{}uint8 w_DEL = "DELIMITER\n\0"
-{}uint8 w_AUG = "AUGMENT\n\0"
-{}uint8 w_KTP = "KEYTYPE\n\0"
-{}uint8 w_KWD = "KEYWORD\n\0"
-{}uint8 w_LIT = "LITERAL\n\0"
-{}uint8 w_DEF = "DEFWORD\n\0"
+{}uint8 w_SEP = "SEPARATOR\0"
+{}uint8 w_DEL = "DELIMITER\0"
+{}uint8 w_AUG = "AUGMENT\0"
+{}uint8 w_KTP = "KEYTYPE\0"
+{}uint8 w_KWD = "KEYWORD\0"
+{}uint8 w_LIT = "LITERAL\0"
+{}uint8 w_DEF = "DEFWORD\0"
 
-/; print_tok_type(uint tt)
+/; print_tok_type(uint tt) [~uint8]
 
     ~uint8 ptr = ~w_DEF{0}
 
@@ -237,61 +307,7 @@ struct Token {
         ptr = ~w_DEF{0}
     ;/
 
-    _printf(ptr)
+    return ptr
 ;/
-
-{}uint8 test_multi = "/;\0"
-{}uint8 test_paren = "(\0"
-{}uint8 test_seps = ",\0"
-{}uint8 test_aug = ".\0"
-{}uint8 test_maug = "++\0"
-{}uint8 test_mkw = "if\0"
-{}uint8 test_mkt = "bool\0"
-{}uint8 test_def = "main\0"
-{}uint8 space = " \0"
-
-/; tests
-    Token tk
-
-    # Delimiter
-    tk.data = ~test_multi{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_paren{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_seps{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_aug{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_maug{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_mkw{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_mkt{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-
-    tk.data = ~test_def{0}
-    _printf(tk.data)
-    _printf(~space{0})
-    print_tok_type(get_tok_type(tk))
-;/
-- 
cgit v1.2.3
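
For readers who do not follow TNSL syntax, the new is_numeric_literal check amounts to: the first byte must be a decimal digit, at most one '.' may appear, and any other byte rejects the token (the non-decimal 0x/0b path is still marked TODO in the patch, so such tokens are rejected too). Below is a minimal C sketch of that rule; the function name and sample inputs are made up for illustration and are not code from the repository.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Token text must start with a digit and may contain at most one '.'. */
static bool is_numeric_literal(const char *tok, size_t len)
{
    if (len < 1 || tok[0] < '0' || tok[0] > '9')
        return false;

    bool dec_seen = false;               /* at most one decimal point */
    for (size_t i = 0; i < len; i++) {
        if (!dec_seen && tok[i] == '.')
            dec_seen = true;
        else if (tok[i] < '0' || tok[i] > '9')
            return false;
    }
    return true;
}

int main(void)
{
    const char *samples[] = { "123", "3.14", "1.2.3", "x9", "0b01" };
    for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++)
        printf("%-6s -> %s\n", samples[i],
               is_numeric_literal(samples[i], strlen(samples[i]))
                   ? "numeric literal" : "rejected");
    return 0;
}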
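The read loop's new quoted-literal branch follows a common pattern: remember the opening quote byte, copy a backslash and the byte after it into the token unchanged, and stop when the matching quote reappears. The C sketch below mirrors that scan over an in-memory string instead of _read_byte; the function name, buffer handling, and sample input are assumptions made for the example, not part of the patch.

#include <stdio.h>

/* Copy a quoted literal starting at src[0] (must be ' or ") into out.
 * Returns the number of input bytes consumed, including both quotes. */
static size_t scan_quoted(const char *src, char *out, size_t out_cap)
{
    char first = src[0];                  /* opening quote: ' or " */
    size_t i = 0, n = 0;

    out[n++] = src[i++];                  /* keep the opening quote */
    while (src[i] != '\0' && n + 2 < out_cap) {
        if (src[i] == '\\' && src[i + 1] != '\0') {
            out[n++] = src[i++];          /* keep escapes verbatim: '\' ... */
            out[n++] = src[i++];          /* ... plus the escaped byte */
        } else if (src[i] == first) {
            out[n++] = src[i++];          /* matching quote ends the literal */
            break;
        } else {
            out[n++] = src[i++];
        }
    }
    out[n] = '\0';
    return i;
}

int main(void)
{
    const char *input = "\"a \\\"quoted\\\" word\" rest";
    char tok[64];
    size_t used = scan_quoted(input, tok, sizeof tok);
    printf("literal: %s (consumed %zu bytes)\n", tok, used);
    return 0;
}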