From 3add402da9fc5b574f34e37e951779212ce28ed1 Mon Sep 17 00:00:00 2001 From: Kyle Gunger Date: Sun, 22 Aug 2021 15:25:54 -0400 Subject: Begin port of tnsl-parse code to native TNSL --- tnslc/src/parse/parse.tnsl | 5 +- tnslc/src/parse/token.tnsl | 259 ++++++++++++++++++++++++++++++++++++++++- tnslc/src/parse/tokenizer.tnsl | 25 ++++ 3 files changed, 286 insertions(+), 3 deletions(-) create mode 100644 tnslc/src/parse/tokenizer.tnsl (limited to 'tnslc/src/parse') diff --git a/tnslc/src/parse/parse.tnsl b/tnslc/src/parse/parse.tnsl index dc6b9a2..e10ab4c 100644 --- a/tnslc/src/parse/parse.tnsl +++ b/tnslc/src/parse/parse.tnsl @@ -15,5 +15,8 @@ #/ /; export module parse - :import 'token.tnsl' + /:import + "token.tnsl" + "tokenizer.tnsl" + :/ ;/ \ No newline at end of file diff --git a/tnslc/src/parse/token.tnsl b/tnslc/src/parse/token.tnsl index 92b2ca4..a841f58 100644 --- a/tnslc/src/parse/token.tnsl +++ b/tnslc/src/parse/token.tnsl @@ -14,13 +14,27 @@ EXPRESS OR IMPLIED #/ -;struct Token { +/# The various types of tokens #/ +; enum TOKEN_TYPE [uint] { + LINESEP = 0, + INLNSEP = 1, + DELIMIT = 2, + AUGMENT = 3, + LITERAL = 4, + KEYTYPE = 5, + PREWORD = 6, + KEYWORD = 7, + DEFWORD = 8 +} + +/# Token struct definition #/ +;raw struct Token { uint type, line, char, - ~{}char + ~{}charp data } @@ -29,4 +43,245 @@ /; operator delete ;delete this.data ;/ +;/ + +/# + Reserved words and characters, as well as + helper funcs for checking their token types. +#/ + +;const {}{}charp PREWORDS = { + "include", + "define", + "extern", + "size", + "align", + "address", + "rootfile", + "if", + "else", + "abi" +} + +;const {}{}charp KEYTYPES = { + "bool", + "char", + "charp", + + "int8", + "int16", + "int32", + "int64", + "int", + "uint8", + "uint16", + "uint32", + "uint64", + "uint", + + "float32", + "float64", + "float", + + "void", + "type" +} + +;const {}{}charp KEYWORDS = { + "struct", + "interface", + "enum", + "is", + "extends", + + "loop", + "continue", + "break", + + "match", + "case", + "default", + + "label", + "goto", + + "if", + "else", + + "const", + "static", + "volatile", + + "method", + "override", + "self", + "super", + "operator", + + "raw", + "asm", + "inline", + + "delete", + + "module", + "export", +} + +;const {}{}charp LITERALS = { + "true", + "false" +} + +;const {}charp DELIMITS = "()[]{}" +;const {}charp LINESEPS = ";:#" +;const {}charp INLNSEPS = "," +;const {}charp AUGMENTS = "~`.&|^>>", + + # PREaugmented augmentors + "&=", + "|=", + "^=", + "+=", + "-=", + "*=", + "/=", + "%=", + "~=", + "`=", + + # POSTaugmented augmentors + "!&", + "!|", + "!^", + "!==", + "!&&", + "!||", + "!>", + "!<", + ">==", + "<==", + + # Increment and De-increment + "++", + "--" +} + +; const uint MAX_MRESERVED = 3 + +/## + Checks if the character point p is in the string cmp + +#; is_in_string (`const {}charp cmp, charp p) [bool] + + /; for (int i = 0; i < len cmp) [i++] + + /; if (s == cmp{i}) + ;return true + ;/ + ;/ + + ;return false +;/ + + +/## + Checks if the string s is in the list cmp + +#; is_in_string_list (`const {}{}charp cmp, `{}charp s) [bool] + + /; for (int i = 0; i < len cmp) [i++] + + /; if (len s == len cmp{i}) + + /; for (int j = 0; j < len s) [j++] + + /; if (s{j} !== cmp{i}{j}) + ;goto cont_outer + ;/ + ;/ + + ;return true + ;/ + + ;label cont_outer + ;/ + + ;return false +;/ + +/# + Get the token_type value for a given string of character points + +#; get_token_type (`{}charp s) [int] + + /; if (len s > 1) + + /; if (is_in_string_list(~PREWORDS, ~s)) + ;return TOKEN_TYPE.PREWORD + ;; else if (is_in_string_list(~KEYTYPES, ~s)) + ;return TOKEN_TYPE.KEYTYPE + ;; else if (is_in_string_list(~KEYWORDS, ~s)) + ;return TOKEN_TYPE.KEYWORD + ;; else if (is_in_string_list(~LITERALS, ~s)) + ;return TOKEN_TYPE.LITERAL + ;; else if (is_in_string_list(~MDELIMITS, ~s)) + ;return TOKEN_TYPE.DELIMIT + ;; else if (is_in_string_list(~MAUGMENTS, ~s)) + ;return TOKEN_TYPE.AUGMENT + ;/ + + ;return TOKEN_TYPE.DEFWORD + + ;;else if (len s == 1) + + /; if (is_in_string(~DELIMITS, s{0})) + ;return TOKEN_TYPE.DELIMIT + ;; else if (is_in_string(~LINESEPS, s{0})) + ;return TOKEN_TYPE.LINESEP + ;; else if (is_in_string(~INLNSEPS, s{0})) + ;return TOKEN_TYPE.INLNSEP + ;; else if (is_in_string(~AUGMENTS, s{0})) + ;return TOKEN_TYPE.AUGMENT + ;/ + + ;return TOKEN_TYPE.DEFWORD + ;/ + + # What, we just produce vacant tokens now? + # Something has gone wrong. + + ;return -1 ;/ \ No newline at end of file diff --git a/tnslc/src/parse/tokenizer.tnsl b/tnslc/src/parse/tokenizer.tnsl new file mode 100644 index 0000000..ec34d83 --- /dev/null +++ b/tnslc/src/parse/tokenizer.tnsl @@ -0,0 +1,25 @@ +/# + Copyright 2021 Kyle Gunger + + This file is licensed under the CDDL 1.0 (the License) + and may only be used in accordance with the License. + You should have received a copy of the License with this + software/source code. If you did not, a copy can be found + at the following URL: + + https://opensource.org/licenses/CDDL-1.0 + + THIS SOFTWARE/SOURCE CODE IS PROVIDED "AS IS" WITH NO + WARRANTY, GUARANTEE, OR CLAIM OF FITNESS FOR ANY PURPOSE + EXPRESS OR IMPLIED +#/ + + +/## + parse.numeric_literal tokenizes the next numeric literal value in a file. + Returns a token with the proper data as well as the number of characters read + +#; numeric_literal () [Token, uint] + + +;/ \ No newline at end of file -- cgit v1.2.3