Basic tokenizer

author: Kyle Gunger <kgunger12@gmail.com> 2024-07-21 00:15:20 -0400
committer: Kyle Gunger <kgunger12@gmail.com> 2024-07-21 00:15:20 -0400
commit: 308a427f3cdb2c7f618b0d48640d064b88bbbceb (patch)
tree: 439ea530d05ae31ef6a92bf9b5d16483c3bcd270 /tnslc
parent: 2f282dd62b9019b6e6613f4af5f50448089497ad (diff)
6 files changed, 233 insertions, 94 deletions
diff --git a/tnslc/parse/ast.tnsl b/tnslc/parse/ast.tnsl
index e69de29..554aac2 100644
--- a/tnslc/parse/ast.tnsl
+++ b/tnslc/parse/ast.tnsl
@@ -0,0 +1,21 @@
+
+# Node type tags, one uint16 value per kind of AST node.
+# NOTE(review): presumably the values stored in Node._type (also uint16) -
+# confirm once generate_ast actually populates nodes.
+uint16 NTYPE_MOD = 0
+uint16 NTYPE_STRUCT = 1
+uint16 NTYPE_ID = 2
+uint16 NTYPE_BINOP = 3
+uint16 NTYPE_PREOP = 4
+uint16 NTYPE_POSTOP = 5
+uint16 NTYPE_FUNCTION = 6
+
+
+# A single AST node.
+#   _type - uint16 tag; NOTE(review): presumably one of the NTYPE_* values - confirm
+#   data  - pointer to uint8; NOTE(review): payload meaning is not shown here - confirm
+#   sub   - utils.Vector; NOTE(review): presumably the child nodes - confirm
+struct Node {
+	uint16 _type,
+	~uint8 data,
+	utils.Vector sub
+}
+
+# Placeholder AST generator: declares a Node and returns it as-is.
+# fin is not read, and no field of the returned Node is assigned here.
+/; generate_ast (~utils.File fin) [Node]
+	Node out
+	return out
+;/
+
diff --git a/tnslc/parse/tokenizer.tnsl b/tnslc/parse/tokenizer.tnsl
index fcc3c5c..0df0ef8 100644
--- a/tnslc/parse/tokenizer.tnsl
+++ b/tnslc/parse/tokenizer.tnsl
@@ -6,7 +6,9 @@ uint TTYPE_KEYTP = 3
 uint TTYPE_LITRL = 4
 uint TTYPE_AUG   = 5
 uint TTYPE_USRWD = 6
+uint TTYPE_COMNT = 7
 
+uint TTYPE_UNKNOWN = 998
 uint TTYPE_ERR   = 999
 
 struct Token {
@@ -75,36 +77,31 @@ uint   MAX_MULTI  = 3
 	Token out
 	out.line = prev.line
 	out.col = prev.col
+	out._type = TTYPE_USRWD
 
 	utils.Vector tmp
 	tmp.init(1)
-	
+
 	uint8 ch = fin`.read()
-	tmp.push(~ch)
 
-	/; loop (bool run = true) [run == true]
+	/; loop (fin`.at_end == false && is_reserved(ch) == false && is_whitespace(ch) == false)
+		tmp.push(~ch)
 		ch = fin`.read()
-		/; if (ch == 0)
-			run = false
-		;; else if (is_reserved(ch) == true || is_whitespace(ch) == true)
-			fin`.unread()
-			run = false
-		;; else
-			tmp.push(~ch)
-		;/
 	;/
 
-	~uint8 str = tmp.as_cstr()
-	/; if (_in_csv(KEYWORDS, str) == true)
+	/; if (fin`.at_end == false)
+		fin`.unread()
+	;/
+
+	out.data = tmp.as_cstr()
+	# Re-tag the word when it appears in one of the reserved-word lists.
+	# Every lookup takes the word (out.data) as its second argument.
+	/; if (_in_csv(KEYWORDS, out.data) == true)
 		out._type = TTYPE_KEYWD
-	;; else if (_in_csv(KEYTYPES, str) == true)
-		out._type == TTYPE_KEYTP
-	;; else if (_in_csv(LITERALS, str) == true)
-		out._type == TTYPE_LITRL
-	;; else if (_in_csv(MULTI_OP_W, str) == true)
+	;; else if (_in_csv(KEYTYPES, out.data) == true)
+		out._type = TTYPE_KEYTP
+	;; else if (_in_csv(LITERALS, out.data) == true)
+		out._type = TTYPE_LITRL
+	;; else if (_in_csv(MULTI_OP_W, out.data) == true)
 		out._type = TTYPE_AUG
-	;; else
-		out._type = TTYPE_USRWD
 	;/
 
 	return out
@@ -112,79 +109,135 @@ uint   MAX_MULTI  = 3
 
 /; produce_string_token (~utils.File fin, Token prev) [Token]
 	Token out
-	out._type = TTYPE_LITRL
 	out.line = prev.line
 	out.col = prev.col
+	out._type = TTYPE_LITRL
+
+	utils.Vector tmp
+	tmp.init(1)
 
-	utils.Vector store
-	store.init(1)
 	uint8 delim = fin`.read()
-	store.push(~delim)
+	tmp.push(~delim)
 
 	/; loop (fin`.at_end == false && delim !== 0)
-		uint8 tmp = fin`.read()
-		store.push(~tmp)
-		/; if(tmp == '\\')
-			tmp = fin`.read()
-			store.push(~tmp)
-		;; else if (tmp == delim)
-			delim = 0
-		;; else if (tmp == '\n')
+		uint8 ch = fin`.read()
+		/; if (ch == '\\')
+			tmp.push(~ch)
+			ch = fin`.read()
+		;; else if (ch == '\n')
 			out.line++
+		;; else if (ch == delim)
+			delim = 0
+		;/
+		
+		/; if (ch !== 0)
+			tmp.push(~ch)
 		;/
 	;/
 
-	out.data = store.as_cstr()
-
+	out.data = tmp.as_cstr()
 	return out
 ;/
 
+# Consumes characters from fin up to the end of the current line.
+#   fin - file being read
+# Reads until a '\n' has been read or fin reports at_end. If fin is not
+# at_end afterwards, the last character read is pushed back with unread(),
+# which leaves the '\n' for the caller.
+/; comment_line (~utils.File fin)
+	uint8 ch = fin`.read()
+
+	/; loop (fin`.at_end == false && ch !== '\n')
+		ch = fin`.read()
+	;/
+
+	# Hand the newline back so the caller can see it.
+	/; if (fin`.at_end == false)
+		fin`.unread()
+	;/
+;/
+
+# Consumes the body of a block comment from fin, through the closing "#/".
+#   fin - file being read
+#   out - token whose line counter is incremented for each '\n' read here
+# A '#' that is followed by neither '/' nor a newline skips the rest of that
+# line via comment_line. The loop also stops once fin reports at_end.
+/; comment_block (~utils.File fin, ~Token out)
+	uint8 ch = 1
+	/; loop (fin`.at_end == false && ch !== 0)
+		ch = fin`.read()
+		/; if (ch == '#')
+			ch = fin`.read()
+			/; if (ch == '/')
+				# "#/" closes the comment; 0 ends the loop.
+				ch = 0
+			;; else if (ch !== '\n')
+				# Only skip when something is left on this line. If the '#'
+				# was the last character, comment_line would swallow the
+				# whole next line, including a "#/" terminator on it.
+				comment_line(fin)
+			;/
+		;/
+
+		/; if (ch == '\n')
+			out`.line++
+		;/
+	;/
+;/
+
+# Reports whether utils.strcmp considers str equal to the block-comment
+# opener "/#".
+/; is_comment_block (~uint8 str) [bool]
+	/; if (utils.strcmp(str, "/#\0") == true)
+		return true
+	;/
+	return false
+;/
+
+# Reports whether str is one of the two-character block delimiters
+# "/;", ";;" or ";/", as judged by utils.strcmp.
+/; is_multi_delim(~uint8 str) [bool]
+	/; if (utils.strcmp(str, "/;\0") == true || utils.strcmp(str, ";;\0") == true || utils.strcmp(str, ";/\0") == true)
+		return true
+	;/
+	return false
+;/
+
 /; produce_reserved_token (~utils.File fin, Token prev) [Token]
 	Token out
+	out.line = prev.line
+	out.col = prev.col
+	out._type = TTYPE_USRWD
+
 	utils.Vector tmp
 	tmp.init(1)
 
-	out.line = prev.line
-	out.col = prev.col
+	uint8 ch = fin`.read()
 
-	/; loop (int i = 0; i < MAX_MULTI) [i++]
-		uint8 ch = fin`.read()
-		/; if (is_reserved(ch) == true)
-			tmp.push(~ch)
-		;; else
-			fin`.unread()
+	/; if (ch == '#')
+		tmp.push(~ch)
+		out._type = TTYPE_COMNT
+		out.data = tmp.as_cstr()
+		comment_line(fin)
+		return out
+	;/
+
+	tmp.push(~ch)
+	/; loop (int i = 1; i < MAX_MULTI) [i++]
+		ch = fin`.read()
+		/; if (is_reserved(ch) == false)
 			i = MAX_MULTI
+			fin`.unread()
+		;; else
+			tmp.push(~ch)
 		;/
 	;/
-	
-	/; loop (bool run = true) [run == true]
-		/; if (tmp.count < 2)
-			run = false
-			~uint8 ch = tmp.get(0)
-			/; if (ch` == ';' || ch` == ',')
+
+	/; loop (bool run = true; run == true)
+		~uint8 str = tmp.as_cstr()
+		/; if (tmp.count == 1)
+			/; if (str` == ',' || str` == ';')
 				out._type = TTYPE_SEP
-			;; else if (_str_contains(DELIMS, ch`) == true)
-				out._type = TTYPE_DELIM
-			;; else if (_str_contains(OP, ch`) == true)
+			;; else if (_str_contains(OP, str`))
 				out._type = TTYPE_AUG
+			;; else if (_str_contains(DELIMS, str`))
+				out._type = TTYPE_DELIM
+			;; else
+				out._type = TTYPE_UNKNOWN
 			;/
-		;; else if (_in_csv(MULTI_OP, tmp.as_cstr()) == true)
 			run = false
+		;; else if (_in_csv(MULTI_OP, str) == true)
 			out._type = TTYPE_AUG
-		;; else if (tmp.count == 2)
-			~uint8 cha = tmp.get(0)
-			~uint8 chb = tmp.get(0)
-			/; if (cha` == ';' && chb` == ';')
-				run = false
-			;; else if (cha` == '/' && chb` == ';')
-				run = false
-			;; else if (cha` == ';' && chb` == '/')
-				run = false
-			;/
-
-			/; if (run == false)
-				out._type = TTYPE_DELIM
-			;/
+			run = false
+		;; else if (is_comment_block(str) == true)
+			out._type = TTYPE_COMNT
+			comment_block(fin, ~out)
+			run = false
+		;; else if (is_multi_delim(str) == true)
+			out._type = TTYPE_DELIM
+			run = false
 		;; else
 			tmp.pop()
 			fin`.unread()
@@ -192,50 +245,42 @@ uint   MAX_MULTI  = 3
 	;/
 
 	out.data = tmp.as_cstr()
-
 	return out
 ;/
 
 /; produce_numeric_token (~utils.File fin, Token prev) [Token]
 	Token out
-	out._type = TTYPE_LITRL
 	out.line = prev.line
 	out.col = prev.col
+	out._type = TTYPE_LITRL
 
 	utils.Vector tmp
 	tmp.init(1)
+	
 	uint8 ch = fin`.read()
 	tmp.push(~ch)
-
-	bool alt_base = false
+	bool base = false
 	/; if (ch == '0')
 		ch = fin`.read()
-		/; if (ch !< 'a' && ch !> 'z')
-			alt_base = true
-		;; else if (ch !< 'A' && ch !> 'Z')
-			alt_base = true
-		;; else if (is_reserved(ch) == true)
-			fin`.unread()
-			out.data = tmp.as_cstr()
-			return out
-		;; else if (ch == 0)
-			out.data = tmp.as_cstr()
-			return out
+		/; if (is_reserved(ch) == false && is_whitespace(ch) == false && is_numeric(ch) == false)
+			base = true
+			tmp.push(~ch)
+		;; else if (fin`.at_end == false)
+			# Not a base prefix: hand the character back so the loop below
+			# sees it. Otherwise a digit, reserved character or whitespace
+			# read here is silently dropped (e.g. "05" would lose its '5').
+			fin`.unread()
 		;/
-		tmp.push(~ch)
 	;/
 
-	/; loop (bool run = true) [run == true]
+	bool decimal = false
+	/; loop (bool run = true; run == true && fin`.at_end == false)
 		ch = fin`.read()
-		/; if (is_numeric(ch) == false && alt_base == false)
+		/; if (decimal == false && ch == '.')
+			decimal = true
+			tmp.push(~ch)
+		;; else if (is_reserved(ch) == true || is_whitespace(ch) == true)
 			fin`.unread()
 			run = false
-		;; else if (is_reserved(ch) == true)
+		;; else if (is_numeric(ch) == false && base == false)
 			fin`.unread()
 			run = false
-		;; else if (ch == 0 || fin`.at_end == true)
-			run = false
-		;; else
+		;; else if (ch !== 0)
 			tmp.push(~ch)
 		;/
 	;/
@@ -254,7 +299,7 @@ uint   MAX_MULTI  = 3
 ;/
 
 /; is_reserved (uint8 ch) [bool]
-	return _str_contains(RESERVED, ch)
+	return _str_contains(RESERVED, ch) == true
 ;/
 
 /; is_numeric (uint8 ch) [bool]
@@ -306,3 +351,73 @@ uint   MAX_MULTI  = 3
 	return produce_next_token(fin, tmp)
 ;/
 
+# Tokenizes fin and returns a Vector holding every non-comment Token.
+#   fin - file to tokenize; it is opened here with fin`.open()
+# Tokens are produced until one of type TTYPE_ERR comes back; that final
+# token is not pushed. Comment tokens are ended instead of being stored.
+# NOTE(review): the terminating TTYPE_ERR token is never passed to end() -
+# confirm that it owns no data.
+/; gen_token_list (~utils.File fin) [utils.Vector]
+	utils.Vector out
+	Token tmp
+	# NOTE(review): presumably sets the element size to the size of a Token - confirm.
+	out.init(len tmp)
+	
+	fin`.open()
+	tmp = produce_first_token(fin)
+	/; loop (tmp._type !== TTYPE_ERR)
+		/; if (tmp._type !== TTYPE_COMNT)
+			out.push(~tmp)
+			tmp = produce_next_token(fin, tmp)
+		;; else
+			# Keep the comment token only long enough to serve as "prev"
+			# for the next call, then release it.
+			Token com = tmp
+			tmp = produce_next_token(fin, com)
+			com.end()
+		;/
+	;/
+
+	return out
+;/
+
+# Prints the name of t's token type with _printf.
+#   t - token whose _type is compared against the TTYPE_* constants
+# Nothing is printed when t._type matches none of the constants tested below.
+/; print_token_type(Token t)
+	
+	/; if (t._type == TTYPE_DELIM)
+		_printf("DELIM\0")
+	;; else if (t._type == TTYPE_SEP)
+		_printf("SEP\0")
+	;; else if (t._type == TTYPE_KEYWD)
+		_printf("KEYWD\0")
+	;; else if (t._type ==TTYPE_KEYTP)
+		_printf("KEYTP\0")
+	;; else if (t._type == TTYPE_LITRL)
+		_printf("LITRL\0")
+	;; else if (t._type == TTYPE_AUG)
+		_printf("AUG\0")
+	;; else if (t._type == TTYPE_USRWD)
+		_printf("USRWD\0")
+	;; else if (t._type == TTYPE_COMNT)
+		_printf("COMNT\0")
+	;; else if (t._type == TTYPE_UNKNOWN)
+		_printf("UNKNOWN\0")
+	;; else if (t._type == TTYPE_ERR)
+		_printf("ERR\0")
+	;/
+
+;/
+
+# Prints one "Token {...}" line per entry of vec: data, line, col and type name.
+#   vec - pointer to a Vector; each element is read through a ~Token
+# NOTE(review): tok`.data is handed to _printf as its first argument. If
+# _printf forwards to C printf, a '%' inside a token would be parsed as a
+# format directive - confirm.
+/; print_token_list (~utils.Vector vec)
+	~Token tok
+	/; loop (uint i = 0; i < vec`.count) [i++]
+		# vec is a pointer, so dereference it before calling get
+		# (same form as vec`.count in the loop condition).
+		tok = vec`.get(i)
+		_printf("Token {\0")
+		_printf(tok`.data)
+		_print_num(", line: %u\0", tok`.line)
+		_print_num(", col: %u, type: \0", tok`.col)
+		print_token_type(tok`)
+		_printf("}\n\0")
+	;/
+;/
+
+# Releases a token list: calls end() on every Token in vec, then on vec itself.
+#   vec - pointer to a Vector; each element is accessed through a ~Token
+/; end_token_list (~utils.Vector vec)
+	~Token tok
+	
+	/; loop (uint i = 0; i < vec`.count) [i++]
+		tok = vec`.get(i)
+		tok`.end()
+	;/
+	# The elements are ended first; the vector is ended last.
+	vec`.end()
+;/
+
diff --git a/tnslc/tests/simple/comments.tnsl b/tnslc/tests/simple/comments.tnsl
index dbece20..36079e4 100644
--- a/tnslc/tests/simple/comments.tnsl
+++ b/tnslc/tests/simple/comments.tnsl
@@ -24,7 +24,8 @@
 #    It is a doc comment of a code block because it starts with '/##' instead of '/#'
 #    and ends with '# ;' which ends the comment and opens a block.
 #    This doc comment is on the main function
-#; main /# Comment inside function declaration #/ [int /# Comment inside this list of outputs #/ ]
+#/
+/; main /# Comment inside function declaration #/ [int /# Comment inside this list of outputs #/ ]
     return 0 # line comment inside a function
     /# Block comment inside function #/
 ;/
diff --git a/tnslc/tnslc.tnsl b/tnslc/tnslc.tnsl
index e95a63b..bb7992c 100644
--- a/tnslc/tnslc.tnsl
+++ b/tnslc/tnslc.tnsl
@@ -35,7 +35,9 @@ usage:
 		fout.init(DEFAULT_FOUT)
 	;/
 
-	compile.generate(~fin, ~fout)
+	utils.Vector v = parse.gen_token_list(~fin)
+	parse.print_token_list(~v)
+	parse.end_token_list(~v)
 
 	fin.end()
 	fout.end()
diff --git a/tnslc/utils/c_wrap_linux.tnsl b/tnslc/utils/c_wrap_linux.tnsl
index 1e3155e..62c3962 100644
--- a/tnslc/utils/c_wrap_linux.tnsl
+++ b/tnslc/utils/c_wrap_linux.tnsl
@@ -1,5 +1,5 @@
 # Must be included at the top of the file
-asm "extern malloc, realloc, free, printf, open, close, read, write, fseek"
+asm "extern malloc, realloc, free, printf, open, close, read, write, lseek"
 
 {}uint8 _alert = "Alert!\n\0"
 {}uint8 _dec = "%d\n\0"
@@ -218,7 +218,7 @@ asm "extern malloc, realloc, free, printf, open, close, read, write, fseek"
 	asm "mov rdi, r10"
 	asm "mov rsi, r11"
 	asm "mov rdx, 0" # standard value for SEEK_SET as per GNU libc
-    asm "call fseek wrt ..plt"
+    asm "call lseek wrt ..plt"
 
 	# get return value
 	asm "mov r12, rax"
diff --git a/tnslc/utils/file.tnsl b/tnslc/utils/file.tnsl
index 22b11f1..1d8a1e9 100644
--- a/tnslc/utils/file.tnsl
+++ b/tnslc/utils/file.tnsl
@@ -98,8 +98,8 @@ struct File {
 			return
 		;/
 
-		_fseek(self.handle, self.pos - 1)
 		self.pos = self.pos - 1
+		_fseek(self.handle, self.pos)
 
 		/; if (self.at_end == true)
 			self.at_end = false
author	Kyle Gunger <kgunger12@gmail.com>	2024-07-21 00:15:20 -0400
committer	Kyle Gunger <kgunger12@gmail.com>	2024-07-21 00:15:20 -0400
commit	308a427f3cdb2c7f618b0d48640d064b88bbbceb (patch)
tree	439ea530d05ae31ef6a92bf9b5d16483c3bcd270 /tnslc
parent	2f282dd62b9019b6e6613f4af5f50448089497ad (diff)