summary refs log tree commit diff
diff options
context:
space:
mode:
author Kyle Gunger <kgunger12@gmail.com> 2023-08-04 02:51:35 -0400
committer Kyle Gunger <kgunger12@gmail.com> 2023-08-04 02:51:35 -0400
commit f31ea957ae8be6a03f19474363d6e00b68de0532 (patch)
tree 1453e63da4fe7eeafe650d67426fad8bffac2024
parent c265215bc6f8a49d47f5bfa29962601302c0c2df (diff)
Slightly better tokenization for tnsl_wrapped
-rw-r--r-- tnslc/tokenizer.tnsl 188
1 files changed, 102 insertions, 86 deletions
diff --git a/tnslc/tokenizer.tnsl b/tnslc/tokenizer.tnsl
index 3de0182..a7017e3 100644
--- a/tnslc/tokenizer.tnsl
+++ b/tnslc/tokenizer.tnsl
@@ -80,16 +80,16 @@ struct Token {
;/
;/
-{}uint8 tkn_ok = "OK \0", tkn_no = "NO \0", tkn_nl = "\n\0"
+{}uint8 tkn_st = "{ \0", tkn_nd = " }\n\0", tkn_sp = " \0"
+# Writes one token to file_out in the form "{ <TYPE> <text> }" followed by
+# a newline: tkn_st, the type name, tkn_sp, tok.data, then tkn_nd.
/; print_token (Token tok, ~void file_out)
- /; if (in_csv(~CSV_KEYWORDS, tok.data) == true)
- write_to_file(file_out, ~tkn_ok{0})
- ;; else
- write_to_file(file_out, ~tkn_no{0})
- ;/
+ write_to_file(file_out, ~tkn_st{0})
+
+ # print_tok_type takes the numeric token type (uint), not the Token
+ # itself, so pass the stored _type field.
+ # NOTE(review): relies on the caller having assigned tok._type before
+ # printing - confirm for any call site outside this change.
+ write_to_file(file_out, print_tok_type(tok._type))
+ write_to_file(file_out, ~tkn_sp{0})
write_to_file(file_out, tok.data)
- write_to_file(file_out, ~tkn_nl{0})
+
+ write_to_file(file_out, ~tkn_nd{0})
;/
# Returns true if the character is whitespace
@@ -114,10 +114,31 @@ struct Token {
return in_csv(~CSV_AUGMENTS, tok.data)
;/
-# Returns true if the token is a valid literal value
-/; tok_literal (Token tok) [bool]
- # TODO: implement literals
- return false
+# True if the token is a valid number (integer or float).
+# Accepted shape: the first character is a decimal digit, every other
+# character is a decimal digit, and at most one '.' may appear anywhere
+# after the first character (so a trailing '.', as in "1.", still passes).
+/; is_numeric_literal(Token tok) [bool]
+ # Reject empty tokens and anything that does not start with 0-9
+ /; if (tok._len() < 1)
+ return false
+ ;; else if (tok.data{0} < '0' || tok.data{0} > '9')
+ return false
+ ;/
+
+ # Set when the token starts with '0' and its second character sorts
+ # above '9' - presumably a base prefix letter; TODO confirm.
+ # The flag is assigned here but never read in this function.
+ bool non_dec = false
+ /; if (tok._len() > 1 && tok.data{0} == '0')
+ non_dec = tok.data{1} > '9'
+ # TODO: non_dec not impl
+ ;/
+
+ # Becomes true once the single permitted '.' has been consumed
+ bool dec_seen = false
+
+ /; loop (int i = 0; i < tok._len()) [i++]
+ /; if (dec_seen == false && tok.data{i} == '.')
+ dec_seen = true
+ ;; else if (tok.data{i} < '0' || tok.data{i} > '9')
+ # Any non-digit (including a second '.') disqualifies the token
+ return false
+ ;/
+ ;/
+
+ return true
;/
/; get_tok_type(Token tok) [uint]
@@ -138,7 +159,7 @@ struct Token {
return TOKEN_TYPE.KEYWORD
;; else if (in_csv(~CSV_KEYTYPES, tok.data) == true)
return TOKEN_TYPE.KEYTYPE
- ;; else if (tok_literal(tok) == true)
+ ;; else if (is_numeric_literal(tok) == true)
return TOKEN_TYPE.LITERAL
;/
@@ -147,58 +168,107 @@ struct Token {
+# Decides whether byte c should end the token held in tok` instead of
+# being appended to it. The token is classified as-is and again with c
+# temporarily appended; c is popped straight afterwards.
+# Returns true when the current token should be broken before c.
/; break_token(~Token tok, uint8 c) [bool]
- # return true
uint type_before = get_tok_type(tok`)
tok`.append(c)
uint type_after = get_tok_type(tok`)
tok`.pop()
- bool a = true
- return a
+
+ # The locals below deliberately avoid the name 'c', which is already
+ # taken by the byte parameter.
+
+ # Whitespace breaks unless token + c classifies as a literal
+ bool ws_break = is_whitespace(c) && type_after !== TOKEN_TYPE.LITERAL
+ # A reserved character breaks a token that is currently a DEFWORD
+ bool reserved_break = is_reserved(c) && type_before == TOKEN_TYPE.DEFWORD
+ # Break when appending c would turn a non-DEFWORD token into a DEFWORD
+ bool type_break = type_before !== TOKEN_TYPE.DEFWORD && type_after == TOKEN_TYPE.DEFWORD
+
+ return ws_break || reserved_break || type_break
;/
+
/; tokenize_file (~void file_in, file_out)
Token tmp
tmp.start()
+ tmp.line = 1
+ tmp.column = 1
uint8 buf = 0
int read_count = 0
+ int line = 1
+ int column = 1
# Start reading at beginning of file
_read_byte(file_in, ~buf, ~read_count)
# Read loop.
- /; loop (_read_byte(file_in, ~buf, ~read_count))
+ /; loop (_read_byte(file_in, ~buf, ~read_count)) [column++]
/; if (read_count == 0)
break
;/
-
/; if (buf == '#')
+
+ # Handle comment
/; loop (_read_byte(file_in, ~buf, ~read_count))
/; if (buf == '\n' || read_count == 0)
break
;/
;/
+ ;; else if (buf == '\'' || buf == '"')
+
+ # Handle char/string literal
+ uint8 first = buf
+ tmp._type = TOKEN_TYPE.LITERAL
+ tmp.append(buf)
+ /; loop (_read_byte(file_in, ~buf, ~read_count))
+ /; if (buf == '\\')
+ tmp.append(buf)
+ read_count = 0
+ _read_byte(file_in, ~buf, ~read_count)
+ column++
+ tmp.append(buf)
+ ;; else if (buf == first)
+ tmp.append(buf)
+ break
+ ;; else
+ tmp.append(buf)
+ ;/
+
+ /; if (buf == '\n')
+ line++
+ column = 1
+ ;; else
+ column++
+ ;/
+
+ read_count = 0
+ ;/
+ print_token(tmp, file_out)
+ tmp._del()
+ tmp.start()
+
;; else if (break_token(~tmp, buf) == true)
+
+ # Handle token break
/; if (tmp._len() > 0)
+ tmp._type = get_tok_type(tmp)
print_token(tmp, file_out)
;/
tmp._del()
tmp.start()
+ tmp.line = line
+ tmp.column = column
/; if (is_whitespace(buf) == false)
tmp.append(buf)
- ;; else if (buf == WHITESPACE{2})
- tmp.append(WHITESPACE{2})
- print_token(tmp, file_out)
- tmp._del()
- tmp.start()
;/
- ;; else
+ ;; else if (is_whitespace(buf) == false)
+
+ # Add non-whitespace
tmp.append(buf)
;/
+ /; if (buf == '\n')
+ line++
+ column = 0
+ ;/
+
read_count = 0
;/
@@ -209,15 +279,15 @@ struct Token {
tmp._del()
;/
-{}uint8 w_SEP = "SEPARATOR\n\0"
-{}uint8 w_DEL = "DELIMITER\n\0"
-{}uint8 w_AUG = "AUGMENT\n\0"
-{}uint8 w_KTP = "KEYTYPE\n\0"
-{}uint8 w_KWD = "KEYWORD\n\0"
-{}uint8 w_LIT = "LITERAL\n\0"
-{}uint8 w_DEF = "DEFWORD\n\0"
+# Token type name strings handed back by print_tok_type. They no longer
+# end in a newline: print_token writes the name in the middle of a line
+# and supplies its own line terminator after the token text.
+{}uint8 w_SEP = "SEPARATOR\0"
+{}uint8 w_DEL = "DELIMITER\0"
+{}uint8 w_AUG = "AUGMENT\0"
+{}uint8 w_KTP = "KEYTYPE\0"
+{}uint8 w_KWD = "KEYWORD\0"
+{}uint8 w_LIT = "LITERAL\0"
+{}uint8 w_DEF = "DEFWORD\0"
-/; print_tok_type(uint tt)
+/; print_tok_type(uint tt) [~uint8]
~uint8 ptr = ~w_DEF{0}
@@ -237,61 +307,7 @@ struct Token {
ptr = ~w_DEF{0}
;/
- _printf(ptr)
+ return ptr
;/
-{}uint8 test_multi = "/;\0"
-{}uint8 test_paren = "(\0"
-{}uint8 test_seps = ",\0"
-{}uint8 test_aug = ".\0"
-{}uint8 test_maug = "++\0"
-{}uint8 test_mkw = "if\0"
-{}uint8 test_mkt = "bool\0"
-{}uint8 test_def = "main\0"
-{}uint8 space = " \0"
-
-/; tests
- Token tk
-
- # Delimiter
- tk.data = ~test_multi{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_paren{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_seps{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_aug{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_maug{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_mkw{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_mkt{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-
- tk.data = ~test_def{0}
- _printf(tk.data)
- _printf(~space{0})
- print_tok_type(get_tok_type(tk))
-;/