Begin port of tnsl-parse code to native TNSL

author: Kyle Gunger <kgunger12@gmail.com> 2021-08-22 15:25:54 -0400
committer: Kyle Gunger <kgunger12@gmail.com> 2021-08-22 15:25:54 -0400
commit: 3add402da9fc5b574f34e37e951779212ce28ed1 (patch)
tree: a081a2331923dce6ee29c87285be901d58b0bc4b /tnslc/src/parse
parent: 11f9c56ae3861e32ac45785e9f30ed5f4c19ea32 (diff)
3 files changed, 286 insertions, 3 deletions
diff --git a/tnslc/src/parse/parse.tnsl b/tnslc/src/parse/parse.tnsl
index dc6b9a2..e10ab4c 100644
--- a/tnslc/src/parse/parse.tnsl
+++ b/tnslc/src/parse/parse.tnsl
@@ -15,5 +15,8 @@
 #/
 
 /; export module parse
-	:import 'token.tnsl'
+	/:import
+		"token.tnsl"
+		"tokenizer.tnsl"
+	:/
 ;/
 \ No newline at end of file
diff --git a/tnslc/src/parse/token.tnsl b/tnslc/src/parse/token.tnsl
index 92b2ca4..a841f58 100644
--- a/tnslc/src/parse/token.tnsl
+++ b/tnslc/src/parse/token.tnsl
@@ -14,13 +14,27 @@
 	EXPRESS OR IMPLIED
 #/
 
-;struct Token {
+/# The various types of tokens (get_token_type returns one of these values) #/
+; enum TOKEN_TYPE [uint] {
+	LINESEP = 0,
+	INLNSEP = 1,
+	DELIMIT = 2,
+	AUGMENT = 3,
+	LITERAL = 4,
+	KEYTYPE = 5,
+	PREWORD = 6,
+	KEYWORD = 7,
+	DEFWORD = 8
+}
+
+/# Token struct definition, type presumably holds a TOKEN_TYPE value (confirm with callers) #/
+;raw struct Token {
 	uint
 		type,
 		line,
 		char,
 
-	~{}char
+	~{}charp
 		data
 }
 
@@ -29,4 +43,245 @@
 	/; operator delete
 		;delete this.data
 	;/
+;/
+
+/#
+	Reserved words and characters, as well as
+	helper funcs for checking their token types.
+#/
+/# Strings get_token_type reports as TOKEN_TYPE.PREWORD, this list is checked before all others #/
+;const {}{}charp PREWORDS = {
+	"include",
+	"define",
+	"extern",
+	"size",
+	"align",
+	"address",
+	"rootfile",
+	"if",
+	"else",
+	"abi"
+}
+/# Strings get_token_type reports as TOKEN_TYPE.KEYTYPE #/
+;const {}{}charp KEYTYPES = {
+	"bool",
+	"char",
+	"charp",
+
+	"int8",
+	"int16",
+	"int32",
+	"int64",
+	"int",
+	"uint8",
+	"uint16",
+	"uint32",
+	"uint64",
+	"uint",
+
+	"float32",
+	"float64",
+	"float",
+
+	"void",
+	"type"
+}
+/# Strings get_token_type reports as TOKEN_TYPE.KEYWORD ("if" and "else" are also in PREWORDS, which is checked first) #/
+;const {}{}charp KEYWORDS = {
+	"struct",
+	"interface",
+	"enum",
+	"is",
+	"extends",
+	
+	"loop",
+	"continue",
+	"break",
+	
+	"match",
+	"case",
+	"default",
+	
+	"label",
+	"goto",
+	
+	"if",
+	"else",
+	
+	"const",
+	"static",
+	"volatile",
+	
+	"method",
+	"override",
+	"self",
+	"super",
+	"operator",
+	
+	"raw",
+	"asm",
+	"inline",
+
+	"delete",
+	
+	"module",
+	"export",
+}
+/# Strings get_token_type reports as TOKEN_TYPE.LITERAL #/
+;const {}{}charp LITERALS = {
+	"true",
+	"false"
+}
+/# Reserved single-character sets, searched with is_in_string when get_token_type gets a length-1 string #/
+;const {}charp DELIMITS = "()[]{}"
+;const {}charp LINESEPS = ";:#"
+;const {}charp INLNSEPS = ","
+;const {}charp AUGMENTS = "~`.&|^><!+-*/%"
+/# Multi-character strings get_token_type reports as TOKEN_TYPE.DELIMIT #/
+;const {}{}charp MDELIMITS = {
+	# Code block
+	"/;",
+	";/",
+	
+	# Comment block
+	"/#",
+	"#/",
+	
+	# Preproc block
+	"/:",
+	":/",
+
+	# Redef blocks
+	";;",
+	"::",
+	";#",
+	":#",
+	"#;",
+	"#:"
+}
+/# Multi-character strings get_token_type reports as TOKEN_TYPE.AUGMENT #/
+;const {}{}charp MAUGMENTS = {
+	# Boolean
+	"==",
+	"&&",
+	"||",
+
+	# Bitwise shifts
+	"<<",
+	">>",
+
+	# PREaugmented augmentors
+	"&=",
+	"|=",
+	"^=",
+	"+=",
+	"-=",
+	"*=",
+	"/=",
+	"%=",
+	"~=",
+	"`=",
+
+	# POSTaugmented augmentors
+	"!&",
+	"!|",
+	"!^",
+	"!==",
+	"!&&",
+	"!||",
+	"!>",
+	"!<",
+	">==",
+	"<==",
+
+	# Increment and decrement
+	"++",
+	"--"
+}
+/# NOTE(review): presumably the longest multi-character reserved token (longest MDELIMITS and MAUGMENTS entries are 3 long), confirm #/
+; const uint MAX_MRESERVED = 3
+
+/##
+	Checks if the character point p is in the string cmp.
+	Returns true on the first element of cmp equal to p, false if none is.
+#; is_in_string (`const {}charp cmp, charp p) [bool]
+
+	/; for (int i = 0; i < len cmp) [i++]
+		# Compare against the parameter p, the character being searched for
+		/; if (p == cmp{i})
+			;return true
+		;/
+	;/
+
+	;return false
+;/
+
+
+/##
+	Checks if the string s is in the list cmp
+	An entry matches when it has the same length as s and every element is equal.
+#; is_in_string_list (`const {}{}charp cmp, `{}charp s) [bool]
+
+	/; for (int i = 0; i < len cmp) [i++]
+
+		/; if (len s == len cmp{i})
+			# Lengths agree, so compare element by element
+			/; for (int j = 0; j < len s) [j++]
+
+				/; if (s{j} !== cmp{i}{j})
+					;goto cont_outer
+				;/
+			;/
+
+			;return true
+		;/
+		# Mismatch target, execution continues with the next entry of cmp
+		;label cont_outer
+	;/
+
+	;return false
+;/
+
+/#
+	Get the token_type value for a given string of character points
+	Lists are checked in order, so "if" and "else" (in both PREWORDS and KEYWORDS) yield PREWORD.
+#; get_token_type (`{}charp s) [int]
+
+	/; if (len s > 1)
+		# Multi-character strings are looked up in the reserved string lists
+		/; if (is_in_string_list(~PREWORDS, ~s))
+			;return TOKEN_TYPE.PREWORD
+		;; else if (is_in_string_list(~KEYTYPES, ~s))
+			;return TOKEN_TYPE.KEYTYPE
+		;; else if (is_in_string_list(~KEYWORDS, ~s))
+			;return TOKEN_TYPE.KEYWORD
+		;; else if (is_in_string_list(~LITERALS, ~s))
+			;return TOKEN_TYPE.LITERAL
+		;; else if (is_in_string_list(~MDELIMITS, ~s))
+			;return TOKEN_TYPE.DELIMIT
+		;; else if (is_in_string_list(~MAUGMENTS, ~s))
+			;return TOKEN_TYPE.AUGMENT
+		;/
+		# No list matched
+		;return TOKEN_TYPE.DEFWORD
+
+	;;else if (len s == 1)
+		# Single characters are looked up in the reserved character sets
+		/; if (is_in_string(~DELIMITS, s{0}))
+			;return TOKEN_TYPE.DELIMIT
+		;; else if (is_in_string(~LINESEPS, s{0}))
+			;return TOKEN_TYPE.LINESEP
+		;; else if (is_in_string(~INLNSEPS, s{0}))
+			;return TOKEN_TYPE.INLNSEP
+		;; else if (is_in_string(~AUGMENTS, s{0}))
+			;return TOKEN_TYPE.AUGMENT
+		;/
+
+		;return TOKEN_TYPE.DEFWORD
+	;/
+
+	# What, we just produce vacant tokens now?
+	# Something has gone wrong.
+	# Reached only when len s is neither > 1 nor == 1, and -1 is not a TOKEN_TYPE value
+	;return -1
 ;/
 \ No newline at end of file
diff --git a/tnslc/src/parse/tokenizer.tnsl b/tnslc/src/parse/tokenizer.tnsl
new file mode 100644
index 0000000..ec34d83
--- /dev/null
+++ b/tnslc/src/parse/tokenizer.tnsl
@@ -0,0 +1,25 @@
+/#
+	Copyright 2021 Kyle Gunger
+
+	This file is licensed under the CDDL 1.0 (the License)
+	and may only be used in accordance with the License.
+	You should have received a copy of the License with this
+	software/source code. If you did not, a copy can be found
+	at the following URL:
+
+	https://opensource.org/licenses/CDDL-1.0
+
+	THIS SOFTWARE/SOURCE CODE IS PROVIDED "AS IS" WITH NO
+	WARRANTY, GUARANTEE, OR CLAIM OF FITNESS FOR ANY PURPOSE
+	EXPRESS OR IMPLIED
+#/
+
+
+/## 
+	parse.numeric_literal tokenizes the next numeric literal value in a file.
+	Returns a token with the proper data as well as the number of characters read
+
+#; numeric_literal () [Token, uint]
+
+	# TODO: body not written yet, nothing is tokenized or returned
+;/
+\ No newline at end of file
author	Kyle Gunger <kgunger12@gmail.com>	2021-08-22 15:25:54 -0400
committer	Kyle Gunger <kgunger12@gmail.com>	2021-08-22 15:25:54 -0400
commit	3add402da9fc5b574f34e37e951779212ce28ed1 (patch)
tree	a081a2331923dce6ee29c87285be901d58b0bc4b /tnslc/src/parse
parent	11f9c56ae3861e32ac45785e9f30ed5f4c19ea32 (diff)