From ceaeb8df4d9bf9b518239ea623d813add5a71072 Mon Sep 17 00:00:00 2001
From: Kyle Gunger
Date: Fri, 7 Jul 2023 15:50:30 -0400
Subject: Returns inside of complex operations

---
 tnslc/tokenizer.tnsl | 245 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 245 insertions(+)
 create mode 100644 tnslc/tokenizer.tnsl

(limited to 'tnslc/tokenizer.tnsl')

diff --git a/tnslc/tokenizer.tnsl b/tnslc/tokenizer.tnsl
new file mode 100644
index 0000000..ada1b8e
--- /dev/null
+++ b/tnslc/tokenizer.tnsl
@@ -0,0 +1,245 @@
+# All single reserved characters
+{}uint8 MULTI_PARENS = "/;:#"
+{}uint8 PARENS = "()[]{}"
+{}uint8 SEPS = "\n;:,"
+{}uint8 RESERVED = "`~!%^&*()-+=[]{}|;:/?<>.,"
+{}uint8 AUGMENTS = "=~!<>&|^+-*/%`."
+
+{}uint8 WHITESPACE = " \r\n\t"
+
+# All lists of keywords are comma delim because the compiler does not yet support arrays of strings
+{}uint8 CSV_AUGMENTS = "++,--,==,!==,&&,||,^^,<==,>==,!>,!<,~=,`=,%=,^=,&=,*=,!=,|=,/=,<<,>>,!&,!|,!^,len,is"
+
+{}uint8 CSV_KEYWORDS = "if,else,loop,continue,break,return,method,struct,enum,interface,export,module,const,static,volatile,raw,extends,override,asm"
+
+{}uint8 CSV_KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,comp32,comp64,comp,vect,bool,type,void"
+
+# Types of tokens
+enum TOKEN_TYPE [uint] {
+    SEPARATOR = 0,
+    DELIMITER = 1,
+    AUGMENT = 2,
+    KEYTYPE = 3,
+    KEYWORD = 4,
+    LITERAL = 5,
+    DEFWORD = 6
+}
+
+# Token structure represents a single token in the program
+struct Token {
+    uint
+        _type,
+        line,
+        column,
+    ~uint8
+        data
+}
+
+# Shortcut methods on the token struct
+/; method Token
+
+    # Initialize the data buffer to an empty, zero-terminated string
+    /; start
+        self.data = _alloc(1)
+        self.data{0} = 0
+    ;/
+
+    # Append a character to the end of the token
+    /; append (uint8 ch)
+        int ln = cstr_len(self.data)
+        self.data = _realloc(self.data, ln + 2)
+        self.data{ln} = ch
+        self.data{ln + 1} = 0
+    ;/
+
+    # Remove the last character from this token.
+    # An empty token is left untouched: with ln == 0 the unguarded code would
+    # resize the buffer to zero bytes and write the terminator at index -1.
+    /; pop
+        int ln = cstr_len(self.data)
+        /; if (ln > 0)
+            self.data = _realloc(self.data, ln)
+            self.data{ln - 1} = 0
+        ;/
+    ;/
+
+    # Copy another token to this token
+    # NOTE(review): a buffer already held in self.data is not released here;
+    # confirm callers only copy into tokens that do not own one.
+    /; copy (Token other)
+        self._type = other._type
+        self.line = other.line
+        self.column = other.column
+
+        self.data = _alloc(cstr_len(other.data) + 1)
+
+        cstr_copy(other.data, self.data)
+    ;/
+
+    # Delete the memory associated with this token
+    /; _del
+        _delete(self.data)
+    ;/
+
+    # length of the string that this token encodes
+    /; _len [int]
+        return cstr_len(self.data)
+    ;/
+;/
+
+# Fixed strings written by print_token
+{}uint8 tkn_ok = "OK \0", tkn_no = "NO \0", tkn_nl = "\n\0"
+
+# Debug output for one token: writes tkn_ok when in_csv finds tok.data in
+# CSV_KEYWORDS and tkn_no otherwise, then the token text and a newline.
+/; print_token (Token tok, ~void file_out)
+    /; if (in_csv(~CSV_KEYWORDS, tok.data) == true)
+        write_to_file(file_out, ~tkn_ok{0})
+    ;; else
+        write_to_file(file_out, ~tkn_no{0})
+    ;/
+    write_to_file(file_out, tok.data)
+    write_to_file(file_out, ~tkn_nl{0})
+;/
+
+# Returns true if the character is whitespace
+/; is_whitespace(uint8 c) [bool]
+    return contains_char(~WHITESPACE, c)
+;/
+
+
+# Returns true if the character is reserved
+/; is_reserved (uint8 c) [bool]
+    return contains_char(~RESERVED, c)
+;/
+
+# Returns true if the token is a valid reserved token
+# One character: any RESERVED character.
+# Two characters: a pair drawn from MULTI_PARENS, or an entry of CSV_AUGMENTS.
+# Anything else: an entry of CSV_AUGMENTS.
+/; tok_reserved (Token tok) [bool]
+    log_one_nl('i')
+    /; if (tok._len() == 1)
+        return is_reserved(tok.data{0})
+    ;; else if (tok._len() == 2)
+        bool a = contains_char(~MULTI_PARENS, tok.data{0})
+        bool b = contains_char(~MULTI_PARENS, tok.data{1})
+        # Only a matching pair is decided here; otherwise fall through so that
+        # two character augments such as "++" and "==" reach the CSV lookup.
+        /; if (a && b)
+            return true
+        ;/
+    ;/
+
+    return in_csv(~CSV_AUGMENTS, tok.data)
+;/
+
+# Returns true if the token is a valid literal value
+/; tok_literal (Token tok) [bool]
+    # TODO: implement literals
+    return false
+;/
+
+# Classifies a token as one of the TOKEN_TYPE values.
+# Reserved tokens become DELIMITER, SEPARATOR or AUGMENT; literals become
+# LITERAL; everything else (including a reserved single character found in
+# none of PARENS, SEPS or AUGMENTS) is reported as DEFWORD.
+/; get_tok_type(Token tok) [uint]
+    log_one_nl('h')
+    /; if (tok_reserved(tok) == true)
+        /; if (tok._len() > 1)
+            bool a = contains_char(~MULTI_PARENS, tok.data{0})
+            bool b = contains_char(~MULTI_PARENS, tok.data{1})
+            /; if (a && b)
+                return TOKEN_TYPE.DELIMITER
+            ;/
+            return TOKEN_TYPE.AUGMENT
+        ;; else if (contains_char(~PARENS, tok.data{0}))
+            return TOKEN_TYPE.DELIMITER
+        ;; else if (contains_char(~SEPS, tok.data{0}))
+            return TOKEN_TYPE.SEPARATOR
+        ;; else if (contains_char(~AUGMENTS, tok.data{0}))
+            return TOKEN_TYPE.AUGMENT
+        ;/
+    ;; else if (tok_literal(tok) == true)
+        return TOKEN_TYPE.LITERAL
+    ;/
+
+    return TOKEN_TYPE.DEFWORD
+;/
+
+
+# Returns true if the token should be ended before the character c:
+# either appending c turns a non-DEFWORD token into a DEFWORD, or c is
+# whitespace and the token with c appended is not a LITERAL.
+# NOTE(review): tok is received by value while append and pop reassign
+# tok.data through _realloc; if _realloc can move the buffer the caller's
+# pointer may be left stale - confirm.
+/; break_token(Token tok, uint8 c) [bool]
+    log_one('g')
+    log_one(' ')
+    log_one_nl(c)
+    # return true
+    uint type_before = get_tok_type(tok)
+    tok.append(c)
+    uint type_after = get_tok_type(tok)
+    tok.pop()
+    log_one_nl('g')
+    bool a = type_before !== TOKEN_TYPE.DEFWORD && type_after == TOKEN_TYPE.DEFWORD
+    bool b = type_after !== TOKEN_TYPE.LITERAL && is_whitespace(c) == true
+    return a || b
+;/
+
+# Appends every byte obtained from _read_byte to a single token and, once the
+# loop ends, prints that token to file_out if it is not empty.
+/; tokenize_file (~void file_in, file_out)
+
+    Token tmp
+    tmp.start()
+
+    uint8 buf = 0
+    int read_count = 0
+    # Start reading at beginning of file
+    # NOTE(review): buf is not used before the loop condition reads into it
+    # again, so the byte read here looks like it is discarded - confirm.
+    _read_byte(file_in, ~buf, ~read_count)
+    # Read loop.
+    /; loop (_read_byte(file_in, ~buf, ~read_count))
+        /; if (read_count == 0)
+            break
+        ;/
+        log_one_nl('b')
+        # /; if (break_token(tmp, buf) == true)
+        #     log_one_nl('c')
+        #     /; if (tmp._len() > 0)
+        #         log_one_nl('d')
+        #         print_token(tmp, file_out)
+        #     ;/
+        #
+        #     tmp._del()
+        #     tmp.start()
+#
+        #     /; if (is_whitespace(buf) == false)
+        #         log_one_nl('e')
+        #         tmp.append(buf)
+        #     ;; else if (buf == WHITESPACE{2})
+        #         log_one_nl('f')
+        #         tmp.append(WHITESPACE{2})
+        #         print_token(tmp, file_out)
+        #         tmp._del()
+        #         tmp.start()
+        #     ;/
+        #     log_one_nl('c')
+        # ;; else
+            tmp.append(buf)
+        # ;/
+        log_one_nl('b')
+        read_count = 0
+    ;/
+
+    /; if (tmp._len() > 0)
+        print_token(tmp, file_out)
+    ;/
+
+    tmp._del()
+;/
-- 
cgit v1.2.3