diff options
| -rw-r--r-- | tnslc/tokenizer.tnsl | 188 | 
1 files changed, 102 insertions, 86 deletions
| diff --git a/tnslc/tokenizer.tnsl b/tnslc/tokenizer.tnsl index 3de0182..a7017e3 100644 --- a/tnslc/tokenizer.tnsl +++ b/tnslc/tokenizer.tnsl @@ -80,16 +80,16 @@ struct Token {      ;/  ;/ -{}uint8 tkn_ok = "OK \0", tkn_no = "NO \0", tkn_nl = "\n\0" +{}uint8 tkn_st = "{ \0", tkn_nd = " }\n\0", tkn_sp = " \0"  /; print_token (Token tok, ~void file_out) -    /; if (in_csv(~CSV_KEYWORDS, tok.data) == true) -        write_to_file(file_out, ~tkn_ok{0}) -    ;; else -        write_to_file(file_out, ~tkn_no{0}) -    ;/ +    write_to_file(file_out, ~tkn_st{0}) +     +    write_to_file(file_out, print_tok_type(tok)) +    write_to_file(file_out, ~tkn_sp{0})      write_to_file(file_out, tok.data) -    write_to_file(file_out, ~tkn_nl{0}) + +    write_to_file(file_out, ~tkn_nd{0})  ;/  # Returns true if the character is whitespace @@ -114,10 +114,31 @@ struct Token {      return in_csv(~CSV_AUGMENTS, tok.data)  ;/ -# Returns true if the token is a valid literal value -/; tok_literal (Token tok) [bool] -    # TODO: implement literals -    return false +# True if the token is a valid number (integer or float) +/; is_numeric_literal(Token tok) [bool] +    /; if (tok._len() < 1) +        return false +    ;; else if (tok.data{0} < '0' || tok.data{0} > '9') +        return false +    ;/ + +    bool non_dec = false +    /; if (tok._len() > 1 && tok.data{0} == '0') +        non_dec = tok.data{1} > '9' +        # TODO: non_dec not impl +    ;/ + +    bool dec_seen = false + +    /; loop (int i = 0; i < tok._len()) [i++] +        /; if (dec_seen == false && tok.data{i} == '.') +            dec_seen = true +        ;; else if (tok.data{i} < '0' || tok.data{i} > '9') +            return false +        ;/ +    ;/ + +    return true  ;/  /; get_tok_type(Token tok) [uint] @@ -138,7 +159,7 @@ struct Token {          return TOKEN_TYPE.KEYWORD      ;; else if (in_csv(~CSV_KEYTYPES, tok.data) == true)          return TOKEN_TYPE.KEYTYPE -    ;; else if (tok_literal(tok) == true) +    ;; else if (is_numeric_literal(tok) == true)          return TOKEN_TYPE.LITERAL      ;/ @@ -147,58 +168,107 @@ struct Token {  /; break_token(~Token tok, uint8 c) [bool] -    # return true      uint type_before = get_tok_type(tok`)      tok`.append(c)      uint type_after = get_tok_type(tok`)      tok`.pop() -    bool a = true -    return a + +    bool a = is_whitespace(c) && type_after !== TOKEN_TYPE.LITERAL +    bool b = is_reserved(c) && type_before == TOKEN_TYPE.DEFWORD +    bool c = type_before !== TOKEN_TYPE.DEFWORD && type_after == TOKEN_TYPE.DEFWORD + +    return a || b || c  ;/ +  /; tokenize_file (~void file_in, file_out)      Token tmp      tmp.start() +    tmp.line = 1 +    tmp.column = 1      uint8 buf = 0      int read_count = 0 +    int line = 1 +    int column = 1      # Start reading at beginning of file      _read_byte(file_in, ~buf, ~read_count)      # Read loop. -    /; loop (_read_byte(file_in, ~buf, ~read_count)) +    /; loop (_read_byte(file_in, ~buf, ~read_count)) [column++]          /; if (read_count == 0)              break          ;/ -          /; if (buf == '#') + +            # Handle comment              /; loop (_read_byte(file_in, ~buf, ~read_count))                  /; if (buf == '\n' || read_count == 0)                      break                  ;/              ;/ +        ;; else if (buf == '\'' || buf == '"') + +            # Handle char/string literal +            uint8 first = buf +            tmp._type = TOKEN_TYPE.LITERAL +            tmp.append(buf) +            /; loop (_read_byte(file_in, ~buf, ~read_count)) +                /; if (buf == '\\') +                    tmp.append(buf) +                    read_count = 0 +                    _read_byte(file_in, ~buf, ~read_count) +                    column++ +                    tmp.append(buf) +                ;; else if (buf == first) +                    tmp.append(buf) +                    break +                ;; else +                    tmp.append(buf) +                ;/ + +                /; if (buf == '\n') +                    line++ +                    column = 1 +                ;; else +                    column++ +                ;/ + +                read_count = 0 +            ;/ +            print_token(tmp, file_out) +            tmp._del() +            tmp.start() +          ;; else if (break_token(~tmp, buf) == true) + +            # Handle token break              /; if (tmp._len() > 0) +                tmp._type = get_tok_type(tmp)                  print_token(tmp, file_out)              ;/              tmp._del()              tmp.start() +            tmp.line = line +            tmp.column = column              /; if (is_whitespace(buf) == false)                  tmp.append(buf) -            ;; else if (buf == WHITESPACE{2}) -                tmp.append(WHITESPACE{2}) -                print_token(tmp, file_out) -                tmp._del() -                tmp.start()              ;/ -        ;; else +        ;; else if (is_whitespace(buf) == false) + +            # Add non-whitespace              tmp.append(buf)          ;/ +        /; if (buf == '\n') +            line++ +            column = 0 +        ;/ +          read_count = 0      ;/ @@ -209,15 +279,15 @@ struct Token {      tmp._del()  ;/ -{}uint8 w_SEP = "SEPARATOR\n\0" -{}uint8 w_DEL = "DELIMITER\n\0" -{}uint8 w_AUG = "AUGMENT\n\0" -{}uint8 w_KTP = "KEYTYPE\n\0" -{}uint8 w_KWD = "KEYWORD\n\0" -{}uint8 w_LIT = "LITERAL\n\0" -{}uint8 w_DEF = "DEFWORD\n\0" +{}uint8 w_SEP = "SEPARATOR\0" +{}uint8 w_DEL = "DELIMITER\0" +{}uint8 w_AUG = "AUGMENT\0" +{}uint8 w_KTP = "KEYTYPE\0" +{}uint8 w_KWD = "KEYWORD\0" +{}uint8 w_LIT = "LITERAL\0" +{}uint8 w_DEF = "DEFWORD\0" -/; print_tok_type(uint tt) +/; print_tok_type(uint tt) [~uint8]      ~uint8 ptr = ~w_DEF{0} @@ -237,61 +307,7 @@ struct Token {          ptr = ~w_DEF{0}      ;/ -    _printf(ptr) +    return ptr  ;/ -{}uint8 test_multi = "/;\0" -{}uint8 test_paren = "(\0" -{}uint8 test_seps = ",\0" -{}uint8 test_aug = ".\0" -{}uint8 test_maug = "++\0" -{}uint8 test_mkw = "if\0" -{}uint8 test_mkt = "bool\0" -{}uint8 test_def = "main\0" -{}uint8 space = " \0" - -/; tests -    Token tk - -    # Delimiter -    tk.data = ~test_multi{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_paren{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_seps{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_aug{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_maug{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_mkw{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_mkt{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) - -    tk.data = ~test_def{0} -    _printf(tk.data) -    _printf(~space{0}) -    print_tok_type(get_tok_type(tk)) -;/ |