summaryrefslogtreecommitdiff
path: root/tnslc/tokenizer.tnsl
diff options
context:
space:
mode:
Diffstat (limited to 'tnslc/tokenizer.tnsl')
-rw-r--r--tnslc/tokenizer.tnsl215
1 files changed, 215 insertions, 0 deletions
diff --git a/tnslc/tokenizer.tnsl b/tnslc/tokenizer.tnsl
new file mode 100644
index 0000000..ada1b8e
--- /dev/null
+++ b/tnslc/tokenizer.tnsl
@@ -0,0 +1,215 @@
+# Character classes used by the tokenizer; each is a flat null-terminated
+# uint8 array scanned with contains_char.
+# Characters that pair up to form two-character block delimiters
+# (e.g. "/;", ";;", ";/") — see tok_reserved / get_tok_type
+{}uint8 MULTI_PARENS = "/;:#"
+# Single-character grouping delimiters
+{}uint8 PARENS = "()[]{}"
+# Statement / list separator characters
+{}uint8 SEPS = "\n;:,"
+# All single reserved characters
+{}uint8 RESERVED = "`~!%^&*()-+=[]{}|;:/?<>.,"
+# Characters that may start or extend an augmented (multi-char) operator
+{}uint8 AUGMENTS = "=~!<>&|^+-*/%`."
+
+# Characters treated as whitespace between tokens
+{}uint8 WHITESPACE = " \r\n\t"
+
+# All lists of keywords are comma delim because the compiler does not yet support arrays of strings.
+# Entries are matched against whole token text via in_csv.
+# Multi-character operators and word operators (len, is)
+{}uint8 CSV_AUGMENTS = "++,--,==,!==,&&,||,^^,<==,>==,!>,!<,~=,`=,%=,^=,&=,*=,!=,|=,/=,<<,>>,!&,!|,!^,len,is"
+
+# Reserved statement keywords
+{}uint8 CSV_KEYWORDS = "if,else,loop,continue,break,return,method,struct,enum,interface,export,module,const,static,volatile,raw,extends,override,asm"
+
+# Built-in type names
+{}uint8 CSV_KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,comp32,comp64,comp,vect,bool,type,void"
+
+# Types of tokens
+enum TOKEN_TYPE [uint] {
+ SEPARATOR = 0,
+ DELIMITER = 1,
+ AUGMENT = 2,
+ KEYTYPE = 3,
+ KEYWORD = 4,
+ LITERAL = 5,
+ DEFWORD = 6
+}
+
+# Token structure represents a single token in the program.
+# _type  — TOKEN_TYPE category of this token
+# line   — source line number (copied by Token.copy; never set from the
+#          read loop in this file — TODO confirm where it is assigned)
+# column — source column number (same caveat as line)
+# data   — heap-allocated, null-terminated character buffer owned by this
+#          token (allocated in start/copy, freed in _del)
+struct Token {
+	uint
+		_type,
+		line,
+		column,
+	~uint8
+		data
+}
+
+# Shortcut methods on the token struct
+/; method Token
+
+	# Initialize the data buffer to an empty (null-terminated) string.
+	# Must be called before append/pop/_len; the buffer is owned by this
+	# token until _del is called.
+	/; start
+		self.data = _alloc(1)
+		self.data{0} = 0
+	;/
+
+	# Append a character to the end of the token, growing the buffer by
+	# one byte and keeping it null-terminated.
+	/; append (uint8 ch)
+		int ln = cstr_len(self.data)
+		self.data = _realloc(self.data, ln + 2)
+		self.data{ln} = ch
+		self.data{ln + 1} = 0
+	;/
+
+	# Remove the last character from this token.
+	# Guarded so popping an already-empty token is a no-op: without the
+	# guard, ln == 0 would shrink the buffer to zero bytes and write the
+	# terminator at index -1 (out-of-bounds).
+	/; pop
+		int ln = cstr_len(self.data)
+		/; if (ln > 0)
+			self.data = _realloc(self.data, ln)
+			self.data{ln - 1} = 0
+		;/
+	;/
+
+	# Copy another token's type, position, and text into this token.
+	# Allocates a fresh buffer; does NOT free any buffer this token
+	# previously owned — caller must _del first if one exists.
+	/; copy (Token other)
+		self._type = other._type
+		self.line = other.line
+		self.column = other.column
+
+		self.data = _alloc(cstr_len(other.data) + 1)
+
+		cstr_copy(other.data, self.data)
+	;/
+
+	# Delete the memory associated with this token
+	/; _del
+		_delete(self.data)
+	;/
+
+	# Length of the string that this token encodes
+	/; _len [int]
+		return cstr_len(self.data)
+	;/
+;/
+
+{}uint8 tkn_ok = "OK \0", tkn_no = "NO \0", tkn_nl = "\n\0"
+
+# Debug dump of a token: writes "OK " or "NO " depending on whether the
+# token text appears in CSV_KEYWORDS, then the token text and a newline,
+# all to file_out.
+/; print_token (Token tok, ~void file_out)
+	/; if (in_csv(~CSV_KEYWORDS, tok.data) == true)
+		write_to_file(file_out, ~tkn_ok{0})
+	;; else
+		write_to_file(file_out, ~tkn_no{0})
+	;/
+	write_to_file(file_out, tok.data)
+	write_to_file(file_out, ~tkn_nl{0})
+;/
+
+# Returns true if the character is whitespace (space, \r, \n, or \t —
+# see WHITESPACE above)
+/; is_whitespace(uint8 c) [bool]
+	return contains_char(~WHITESPACE, c)
+;/
+
+
+# Returns true if the character is one of the single reserved characters
+# in RESERVED
+/; is_reserved (uint8 c) [bool]
+	return contains_char(~RESERVED, c)
+;/
+
+# Returns true if the token text forms a valid reserved token:
+# - a single reserved character, or
+# - a two-character delimiter whose chars are both in MULTI_PARENS
+#   (e.g. "/;" or ";;"), or
+# - a multi-character operator listed in CSV_AUGMENTS.
+/; tok_reserved (Token tok) [bool]
+	# NOTE(review): debug trace left in — remove once the tokenizer is stable
+	log_one_nl('i')
+	/; if (tok._len() == 1)
+		return is_reserved(tok.data{0})
+	;; else if (tok._len() == 2)
+		bool a = contains_char(~MULTI_PARENS, tok.data{0})
+		bool b = contains_char(~MULTI_PARENS, tok.data{1})
+		return a && b
+	;/
+
+	# 2-char tokens that are not MULTI_PARENS pairs, and all longer tokens,
+	# are checked against the augmented-operator list
+	return in_csv(~CSV_AUGMENTS, tok.data)
+;/
+
+# Returns true if the token is a valid literal value (numeric, char, or
+# string literal). Currently a stub that always returns false, so
+# get_tok_type never yields TOKEN_TYPE.LITERAL.
+/; tok_literal (Token tok) [bool]
+	# TODO: implement literals
+	return false
+;/
+
+# Classify a token's text into a TOKEN_TYPE value.
+# Reserved tokens are checked first: multi-char MULTI_PARENS pairs are
+# DELIMITERs, other multi-char reserved tokens are AUGMENTs, and single
+# chars are dispatched through PARENS / SEPS / AUGMENTS in that order.
+# NOTE(review): KEYWORD and KEYTYPE are never returned here — presumably
+# classified later against CSV_KEYWORDS/CSV_KEYTYPES; confirm. A 1-char
+# reserved token not in PARENS/SEPS/AUGMENTS (e.g. '?') falls through to
+# DEFWORD — confirm that is intended.
+/; get_tok_type(Token tok) [uint]
+	# NOTE(review): debug trace left in — remove once the tokenizer is stable
+	log_one_nl('h')
+	/; if (tok_reserved(tok) == true)
+		/; if (tok._len() > 1)
+			bool a = contains_char(~MULTI_PARENS, tok.data{0})
+			bool b = contains_char(~MULTI_PARENS, tok.data{1})
+			/; if (a && b)
+				return TOKEN_TYPE.DELIMITER
+			;/
+			return TOKEN_TYPE.AUGMENT
+		;; else if (contains_char(~PARENS, tok.data{0}))
+			return TOKEN_TYPE.DELIMITER
+		;; else if (contains_char(~SEPS, tok.data{0}))
+			return TOKEN_TYPE.SEPARATOR
+		;; else if (contains_char(~AUGMENTS, tok.data{0}))
+			return TOKEN_TYPE.AUGMENT
+		;/
+	;; else if (tok_literal(tok) == true)
+		return TOKEN_TYPE.LITERAL
+	;/
+
+	return TOKEN_TYPE.DEFWORD
+;/
+
+
+/; break_token(Token tok, uint8 c) [bool]
+ log_one('g')
+ log_one(' ')
+ log_one_nl(c)
+ # return true
+ uint type_before = get_tok_type(tok)
+ tok.append(c)
+ uint type_after = get_tok_type(tok)
+ tok.pop()
+ log_one_nl('g')
+ bool a = type_before !== TOKEN_TYPE.DEFWORD && type_after == TOKEN_TYPE.DEFWORD
+ bool b = type_after !== TOKEN_TYPE.LITERAL && is_whitespace(c) == true
+ return a || b
+;/
+
+# Read file_in byte-by-byte and emit tokens to file_out via print_token.
+# NOTE(review): work in progress — the token-splitting logic (break_token
+# and whitespace handling) is commented out below, so currently every byte
+# is appended to a single token that is printed once at end of file.
+/; tokenize_file (~void file_in, file_out)
+
+	Token tmp
+	tmp.start()
+
+	uint8 buf = 0
+	int read_count = 0
+	# Start reading at beginning of file
+	# NOTE(review): the byte read here appears to be discarded before the
+	# loop reads again — confirm whether this call is needed to prime the
+	# file handle or drops the file's first byte.
+	_read_byte(file_in, ~buf, ~read_count)
+	# Read loop.
+	/; loop (_read_byte(file_in, ~buf, ~read_count))
+		# read_count == 0 signals end of file
+		/; if (read_count == 0)
+			break
+		;/
+		log_one_nl('b')
+		# /; if (break_token(tmp, buf) == true)
+			# log_one_nl('c')
+			# /; if (tmp._len() > 0)
+				# log_one_nl('d')
+				# print_token(tmp, file_out)
+			# ;/
+	#
+			# tmp._del()
+			# tmp.start()
+#
+			# /; if (is_whitespace(buf) == false)
+				# log_one_nl('e')
+				# tmp.append(buf)
+			# ;; else if (buf == WHITESPACE{2})
+				# log_one_nl('f')
+				# tmp.append(WHITESPACE{2})
+				# print_token(tmp, file_out)
+				# tmp._del()
+				# tmp.start()
+			# ;/
+			# log_one_nl('c')
+		# ;; else
+			tmp.append(buf)
+		# ;/
+		log_one_nl('b')
+		# reset so the next _read_byte's count is unambiguous
+		read_count = 0
+	;/
+
+	# Flush whatever is left in the buffer as the final token
+	/; if (tmp._len() > 0)
+		print_token(tmp, file_out)
+	;/
+
+	tmp._del()
+;/