# All single reserved characters

{}uint8 MULTI_PARENS = "/;:#"
{}uint8 PARENS = "()[]{}"
{}uint8 SEPS = "\n;:,"
{}uint8 RESERVED = "`~!%^&*()-+=[]{}|;:/?<>.,"
{}uint8 AUGMENTS = "=~!<>&|^+-*/%`."
{}uint8 WHITESPACE = " \r\n\t"

# All lists of keywords are comma delimited because the compiler does not yet support arrays of strings

{}uint8 CSV_AUGMENTS = "++,--,==,!==,&&,||,^^,<==,>==,!>,!<,~=,`=,%=,^=,&=,*=,!=,|=,/=,<<,>>,!&,!|,!^,len,is"
{}uint8 CSV_KEYWORDS = "if,else,loop,continue,break,return,method,struct,enum,interface,export,module,const,static,volatile,raw,extends,override,asm"
{}uint8 CSV_KEYTYPES = "uint8,uint16,uint32,uint64,uint,int8,int16,int32,int64,int,float32,float64,float,comp32,comp64,comp,vect,bool,type,void"

# Types of tokens
enum TOKEN_TYPE [uint] {
    SEPARATOR = 0,
    DELIMITER = 1,
    AUGMENT = 2,
    KEYTYPE = 3,
    KEYWORD = 4,
    LITERAL = 5,
    DEFWORD = 6
}

# Token structure represents a single token in the program
struct Token {
    uint _type, line, column,
    ~uint8 data
}

# Shortcut methods on the token struct
/; method Token

    # Initialize the data buffer
    /; start
        self.data = _alloc(1)
        self.data{0} = 0
    ;/

    # Append a character to the end of the token
    /; append (uint8 ch)
        int ln = cstr_len(self.data)
        self.data = _realloc(self.data, ln + 2)
        self.data{ln} = ch
        self.data{ln + 1} = 0
    ;/

    # Remove the last character from this token
    /; pop
        int ln = cstr_len(self.data)
        self.data{ln - 1} = 0
    ;/

    # Copy another token to this token
    /; copy (Token other)
        self._type = other._type
        self.line = other.line
        self.column = other.column
        self.data = _alloc(cstr_len(other.data) + 1)
        cstr_copy(other.data, self.data)
    ;/

    # Delete the memory associated with this token
    /; _del
        _realloc(self.data, 0)
    ;/

    # Length of the string that this token encodes
    /; _len [int]
        return cstr_len(self.data)
    ;/
;/

{}uint8 tkn_st = "{ \0", tkn_nd = " }\n\0", tkn_sp = " \0"

# Write a human-readable representation of the token to the given file
/; print_token (Token tok, ~void file_out)
    write_to_file(file_out, ~tkn_st{0})
    write_to_file(file_out, print_tok_type(tok._type))
    write_to_file(file_out, ~tkn_sp{0})
    write_to_file(file_out, tok.data)
    write_to_file(file_out, ~tkn_nd{0})
;/

# Returns true if the character is whitespace
/; is_whitespace (uint8 c) [bool]
    return contains_char(~WHITESPACE, c)
;/

# Returns true if the character is reserved
/; is_reserved (uint8 c) [bool]
    return contains_char(~RESERVED, c)
;/

# Returns true if the token is a valid reserved token
/; tok_reserved (Token tok) [bool]
    /; if (tok._len() == 1)
        return is_reserved(tok.data{0})
    ;; else if (tok._len() == 2)
        /; if (contains_char(~MULTI_PARENS, tok.data{0}) && contains_char(~MULTI_PARENS, tok.data{1}))
            return true
        ;/
    ;/

    return in_csv(~CSV_AUGMENTS, tok.data) !< 0
;/

# True if the token is a valid number (integer or float)
/; is_numeric_literal (Token tok) [bool]
    /; if (tok._len() < 1)
        return false
    ;; else if (tok.data{0} < '0' || tok.data{0} > '9')
        return false
    ;/

    bool non_dec = false
    /; if (tok._len() > 1 && tok.data{0} == '0')
        non_dec = tok.data{1} > '9'
        # TODO: non_dec not impl
    ;/

    bool dec_seen = false
    /; loop (int i = 0; i < tok._len()) [i++]
        /; if (dec_seen == false && tok.data{i} == '.')
            dec_seen = true
        ;; else if (tok.data{i} < '0' || tok.data{i} > '9')
            return false
        ;/
    ;/

    return true
;/

# Classify a token as one of the TOKEN_TYPE values
/; get_tok_type (Token tok) [uint]
    /; if (tok_reserved(tok) == true)
        /; if (tok._len() > 1)
            /; if (contains_char(~MULTI_PARENS, tok.data{0}) && contains_char(~MULTI_PARENS, tok.data{1}))
                return TOKEN_TYPE.DELIMITER
            ;/
            return TOKEN_TYPE.AUGMENT
        ;; else if (contains_char(~PARENS, tok.data{0}) == true)
            return TOKEN_TYPE.DELIMITER
        ;; else if (contains_char(~SEPS, tok.data{0}) == true)
            return TOKEN_TYPE.SEPARATOR
        ;; else if (contains_char(~AUGMENTS, tok.data{0}) == true)
            return TOKEN_TYPE.AUGMENT
        ;/
    ;; else if (in_csv(~CSV_KEYWORDS, tok.data) !< 0)
        return TOKEN_TYPE.KEYWORD
    ;; else if (in_csv(~CSV_KEYTYPES, tok.data) !< 0)
        return TOKEN_TYPE.KEYTYPE
    ;; else if (is_numeric_literal(tok) == true)
        return TOKEN_TYPE.LITERAL
    ;/

    return TOKEN_TYPE.DEFWORD
;/
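# Decide whether appending the character c to the in-progress token tok should
# instead start a new token. A break is reported when:
#   - c is whitespace and the token with c appended would not read as a literal,
#   - c is reserved but would not extend a valid operator/delimiter token, or the
#     current token starts with a reserved character and c is not reserved,
#   - a numeric literal would turn into a DEFWORD (e.g. "123abc" splits after "123").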
/; break_token (~Token tok, uint8 c) [bool]
    uint type_before = get_tok_type(tok`)
    tok`.append(c)
    uint type_after = get_tok_type(tok`)
    tok`.pop()

    bool a = is_whitespace(c) && type_after !== TOKEN_TYPE.LITERAL

    bool b = false
    /; if (is_reserved(c) == true)
        b = type_after == TOKEN_TYPE.DEFWORD
    ;; else if (tok`._len() > 0)
        b = is_reserved(tok`.data{0})
    ;/

    bool c = type_before == TOKEN_TYPE.LITERAL && type_after == TOKEN_TYPE.DEFWORD

    return a || b || c
;/

# Consume characters up to and including the end of the line (or end of file)
/; handle_comment (~void file_in)
    uint8 buf = 0
    int read_count = 0

    /; loop _read_byte(file_in, ~buf, ~read_count)
        /; if (buf == '\n' || read_count == 0)
            break
        ;/
        read_count = 0
    ;/
;/

# Read a character or string literal (delimited by first) into tmp,
# honoring backslash escapes and tracking line/column as it goes
/; handle_str (~void file_in, Token tmp, ~int line, column, uint8 first) [Token]
    uint8 buf = first
    int read_count = 0

    tmp._type = TOKEN_TYPE.LITERAL
    tmp.append(buf)

    read_count = 0
    /; loop _read_byte(file_in, ~buf, ~read_count)
        /; if (read_count == 0)
            break
        ;/

        /; if (buf == '\\')
            tmp.append(buf)
            read_count = 0
            _read_byte(file_in, ~buf, ~read_count)
            column`++
            tmp.append(buf)
        ;; else if (buf == first)
            tmp.append(buf)
            break
        ;; else
            tmp.append(buf)
        ;/

        /; if (buf == '\n')
            line`++
            column` = 1
        ;; else
            column`++
        ;/

        read_count = 0
    ;/

    return tmp
;/

{}uint8 w_tkn_gen = "%d Tokens generated from file.\n\0"
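# Read the whole file and split it into a Vector of Token structs, recording the
# line and column where each token starts. For example, the input
#     int x = 5
# produces four tokens, which print_token would render as:
#     { KEYTYPE int } { DEFWORD x } { AUGMENT = } { LITERAL 5 }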
/; tokenize_file (~void file_in) [Vector]
    # This vector is going to store all of our tokens as we generate them
    Vector out_vect
    # The size of a token struct is 3 uint + pointer = 4*8 = 32 bytes
    out_vect.start(32)

    Token tmp
    tmp.start()
    tmp.line = 1
    tmp.column = 1

    uint8 buf = 0
    int read_count = 0
    int line = 1
    int column = 1

    # Read loop
    /; loop [column++] _read_byte(file_in, ~buf, ~read_count)
        /; if (read_count == 0)
            break
        ;/

        /; if (buf == '#')
            # Handle comment
            handle_comment(file_in)
            line++
        ;; else if (buf == '\'' || buf == '"')
            # Don't rope the last token into this
            /; if (tmp._len() > 0)
                tmp._type = get_tok_type(tmp)
                out_vect.push(~tmp)
                tmp.start()
            ;/

            # Handle char/string literal
            tmp = handle_str(file_in, tmp, ~line, ~column, buf)
            out_vect.push(~tmp)
            tmp.start()
            tmp.line = line
            tmp.column = column
        ;; else if (break_token(~tmp, buf) == true)
            # Handle token break
            /; if (tmp._len() > 0)
                tmp._type = get_tok_type(tmp)
                out_vect.push(~tmp)
                tmp.start()
            ;/

            tmp.line = line
            tmp.column = column
            /; if (is_whitespace(buf) == false)
                tmp.append(buf)
            ;/
        ;; else if (is_whitespace(buf) == false)
            # Add non-whitespace
            tmp.append(buf)
        ;/

        /; if (buf == '\n')
            line++
            column = 0
        ;/

        read_count = 0
    ;/

    /; if (tmp._len() > 0)
        tmp._type = get_tok_type(tmp)
        out_vect.push(~tmp)
    ;/

    _print_num(~w_tkn_gen{0}, out_vect._len())

    return out_vect
;/

{}uint8 w_SEP = "SEPARATOR\0"
{}uint8 w_DEL = "DELIMITER\0"
{}uint8 w_AUG = "AUGMENT\0"
{}uint8 w_KTP = "KEYTYPE\0"
{}uint8 w_KWD = "KEYWORD\0"
{}uint8 w_LIT = "LITERAL\0"
{}uint8 w_DEF = "DEFWORD\0"

/; print_tok_type (uint tt) [~uint8]
    ~uint8 ptr = ~w_DEF{0}

    /; if (tt == TOKEN_TYPE.SEPARATOR)
        ptr = ~w_SEP{0}
    ;; else if (tt == TOKEN_TYPE.DELIMITER)
        ptr = ~w_DEL{0}
    ;; else if (tt == TOKEN_TYPE.AUGMENT)
        ptr = ~w_AUG{0}
    ;; else if (tt == TOKEN_TYPE.KEYTYPE)
        ptr = ~w_KTP{0}
    ;; else if (tt == TOKEN_TYPE.KEYWORD)
        ptr = ~w_KWD{0}
    ;; else if (tt == TOKEN_TYPE.LITERAL)
        ptr = ~w_LIT{0}
    ;; else if (tt == TOKEN_TYPE.DEFWORD)
        ptr = ~w_DEF{0}
    ;/

    return ptr
;/
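# Hedged sketch (not part of the tokenizer itself): one way a caller could probe
# break_token by hand. Token, break_token, and TOKEN_TYPE come from this file;
# the function name example_breaks and its local names are illustrative only.
/; example_breaks [bool]
    Token t
    t.start()
    t.append('x')

    # "x" is a DEFWORD and '=' is reserved, so break_token reports that a new
    # token should start before the '=' is appended
    bool split = break_token(~t, '=')

    t._del()
    return split
;/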