From 0ae7d495935d91f495eb804fdd0701c22a82a4a0 Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Fri, 23 Feb 2018 14:01:52 -0800 Subject: [PATCH] misc/kforth: Finish parsing. --- Makefile | 2 +- defs.h | 1 + doc/index.rst | 1 + doc/part-0x01.rst | 10 ++ doc/part-0x02.rst | 10 +- doc/part-0x03.rst | 293 ++++++++++++++++++++++++++++++++++++++++++++++ kforth.cc | 49 +++++++- parser.cc | 70 +++++++++-- parser.h | 10 +- 9 files changed, 432 insertions(+), 14 deletions(-) create mode 100644 doc/part-0x03.rst diff --git a/Makefile b/Makefile index 6c72769..16f194a 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CXXSTD := c++11 +CXXSTD := c++14 CXXFLAGS := -std=$(CXXSTD) -Wall -Werror -g -O0 OBJS := linux/io.o \ parser.o \ diff --git a/defs.h b/defs.h index 4197b6b..4dcc540 100644 --- a/defs.h +++ b/defs.h @@ -5,5 +5,6 @@ #include "linux/defs.h" #endif +constexpr size_t MAX_TOKEN_LENGTH = 16; #endif // __KF_DEFS_H__ \ No newline at end of file diff --git a/doc/index.rst b/doc/index.rst index bee3448..bd9aa42 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -8,6 +8,7 @@ Contents: part-0x01 part-0x02 + part-0x03 Indices and tables ================== diff --git a/doc/part-0x01.rst b/doc/part-0x01.rst index c609537..788698a 100644 --- a/doc/part-0x01.rst +++ b/doc/part-0x01.rst @@ -87,5 +87,15 @@ Stage 4 Next steps ^^^^^^^^^^ +I've decided to use C++ for two reasons: it's supported by all the targets I +want (amd64, arm/arm64, msp430, avr), and I know it well enough (and +importantly, I know the tooling) to get by. Typically, the TI compilers lag +behind the others in supporting newer C++ standards, so those will be the +limiting factor. Fortunately, just a few days before I started this, the TI +wiki was updated_ to note that the latest compilers now support C++11 and +C++14, so I'll target C++14. + +.. 
_updated: http://processors.wiki.ti.com/index.php/C%2B%2B_Support_in_TI_Compilers#Status_as_of_February_2018 + I don't really know what I'm doing, so in the next section, I'll build out the basic framework and set up the build. \ No newline at end of file diff --git a/doc/part-0x02.rst b/doc/part-0x02.rst index bb1259b..d829f69 100644 --- a/doc/part-0x02.rst +++ b/doc/part-0x02.rst @@ -20,7 +20,7 @@ The project will also need a build system. For simplicity, I'll at least start with a basic Makefile:: # Makefile - CXXSTD := c++11 + CXXSTD := c++14 CXXFLAGS := -std=$(CXXSTD) -Werror -Wall -g -O0 OBJS := linux/io.o \ kforth.o @@ -271,4 +271,10 @@ definitions from the standard library:: Next steps ^^^^^^^^^^ -I guess the next thing to do will be to start parsing. \ No newline at end of file +I guess the next thing to do will be to start parsing. + +Some housekeeping: I'll keep the state of the code at each part in +the tag ``part-$PART``; this part, for example is in the tag +`part-0x02`_. + +.. _part-0x02: https://github.com/kisom/kforth/tree/part-0x02 \ No newline at end of file diff --git a/doc/part-0x03.rst b/doc/part-0x03.rst new file mode 100644 index 0000000..b88b0c0 --- /dev/null +++ b/doc/part-0x03.rst @@ -0,0 +1,293 @@ +Write You a Forth, 0x03 +----------------------- + +:date: 2018-02-23 09:36 +:tags: wyaf, forth + +Today, I'm working on parsing. I was talking to `steveo +`_ yesterday, and he mentioned string interning, and +it sounded like a fun thing to do (and then I started thinking about ropes and +so on). + +However, I'm not going to intern strings --- at least, not yet. I'm going to do +something way more primitive:: + + bool match_token(const char *a, const size_t alen, + const char *b, const size_t blen) + { + if (alen != blen) { + return false; + } + + return memcmp(a, b, alen) == 0; + } + +I'd also like to operate on a buffer without having to store a bunch of copies +of strings. 
Performance may not be the number one concern here, but I think +it'll be more fun to implement, and it will be a little easier. The parser +should return the next token that we can push off to the rest of the process. +It seems like we'll want a structure for that. + +``parser.h`` +^^^^^^^^^^^^ + +The parser seems like it really only needs a few things, so time to take a stab at +``parser.h``:: + + #ifndef __KF_PARSER_H__ + #define __KF_PARSER_H__ + + #include "defs.h" + +A ``Token`` can be defined as just the pointer to the start of the token and +its length. There's a limit to the maximum size of the buffer, and it'll be +important to check the length of the token. For simplicity, I'm going to define +the maximum length of a token as 16, and I'll put this as a ``constexpr`` in the +``defs.h`` file. +:: + + struct Token { + char *token; + uint8_t length; + }; + +Next up is to define the function from before for matching tokens. +:: + + bool match_token(const char *, const size_t, const char *, const size_t); + +The meat of the parser is ``parse_next``, for which we'll also need some return codes. +:: + + typedef enum _PARSE_RESULT_ : uint8_t { + PARSE_OK = 0, // token now has a valid token. + PARSE_EOB = 1, // end of buffer, parsing a line should stop. 
+ PARSE_LEN = 2, // token is too long + PARSE_FAIL = 3 // catch-all error + } PARSE_RESULT; + + PARSE_RESULT parse_next(const char *, const size_t, size_t *, struct Token *); + + #endif // __KF_PARSER_H__ + +``parser.cc`` +^^^^^^^^^^^^^^ + +``parser.cc`` will open with a helper to reset tokens and the same +matching code I mentioned before:: + + #include "defs.h" + #include "parser.h" + + #include <string.h> + + static void + reset(struct Token *t) + { + t->token = nullptr; + t->length = 0; + } + + bool + match_token(const char *a, const size_t alen, + const char *b, const size_t blen) + { + if (alen != blen) { + return false; + } + + return memcmp(a, b, alen) == 0; + } + +At the start of the parser, I'm going to reset the token; if there's a failure, +there shouldn't be a valid token anyhow. +:: + + PARSE_RESULT + parse_next(const char *buf, const size_t length, size_t *offset, + struct Token *token) + { + size_t cursor = *offset; + + // Clear the token. + reset(token); + +If the offset is already at the end of the buffer, there's no more work to do +on this buffer, so I'll cut out early with ``PARSE_EOB``. If I were doing a more +careful job of programming this, I'd *generally* try to avoid multiple returns, +but in this case, having working code is more important than awesome code. +:: + + if (cursor == length) { + return PARSE_EOB; + } + +I'm going to assume that tokens are separated by spaces or tabs. I wasn't going +to support tabs at first, but it's easy enough to do that I just included it. +:: + + while (cursor <= length) { + if (buf[cursor] != ' ') { + if (buf[cursor] != '\t') { + break; + } + } + cursor++; + } + +This part might seem superfluous, but it's important in case there's trailing +whitespace in the buffer. I haven't touched the token yet, so no need to reset +it. 
+:: + + if (cursor == length) { + return PARSE_EOB; + } + +Now I can point the token to the buffer at the start of the next token and walk +through the buffer until the end of the buffer or the first whitespace +character:: + + token->token = (char *)buf + cursor; + while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) { + if (buf[cursor] != ' ') { + if (buf[cursor] != '\t') { + cursor++; + token->length++; + continue; + } + } + +This got me at first and took me a few minutes to figure out. If the cursor +isn't updated at the end, the next run of the parser is going to be stuck on +this word as the cursor doesn't point to whitespace anymore. +:: + + cursor++; + break; + } + +Finally, if the token length hasn't been exceeded, the offset can be updated +and the token returned:: + + if (token->length > MAX_TOKEN_LENGTH) { + reset(token); + return PARSE_LEN; + } + + *offset = cursor; + return PARSE_OK; + } + +``kforth.cc`` +^^^^^^^^^^^^^ + +That's all of ``parser.cc`` (at least for now), but this needs to be integrated +into the frontend. ``kforth.cc`` now starts off with:: + + #include "io.h" + #include "parser.h" + + #include <stdlib.h> + + #ifdef __linux__ + #include "linux.h" + #endif // __linux__ + + static char ok[] = "ok.\n"; + static char bye[] = "bye"; + + static bool + parser(IO &interface, const char *buf, const size_t buflen) + { + static size_t offset = 0; + static struct Token token; + static PARSE_RESULT result = PARSE_FAIL; + + offset = 0; + + // reset token + token.token = nullptr; + token.length = 0; + + while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) { + interface.wrbuf((char *)"token: ", 7); + interface.wrbuf(token.token, token.length); + interface.wrln((char *)".", 1); + +There's no command parser right now, so I've added in this hack so it starts to +feel a little like a Forth. 
+:: + + if (match_token(token.token, token.length, bye, 3)) { + interface.wrln((char *)"Goodbye!", 8); + exit(0); + } + } + + switch (result) { + case PARSE_EOB: + interface.wrbuf(ok, 4); + return true; + case PARSE_LEN: + interface.wrln((char *)"parse error: token too long", 27); + return false; + case PARSE_FAIL: + interface.wrln((char *)"parser failure", 14); + return false; + default: + interface.wrln((char *)"*** the world is broken ***", 27); + exit(1); + } + } + + static void + interpreter(IO &interface) + { + static size_t buflen = 0; + static char linebuf[81]; + + while (true) { + interface.wrch('?'); + interface.wrch(' '); + buflen = interface.rdbuf(linebuf, 80, true, '\n'); + +The return value is being ignored right now, but later on it might be useful. +:: + + parser(interface, linebuf, buflen); + } + } + +But does it work? +:: + + ~/code/kforth (0) $ make + g++ -std=c++14 -Wall -Werror -g -O0 -c -o linux/io.o linux/io.cc + g++ -std=c++14 -Wall -Werror -g -O0 -c -o parser.o parser.cc + g++ -std=c++14 -Wall -Werror -g -O0 -c -o kforth.o kforth.cc + g++ -o kforth linux/io.o parser.o kforth.o + ~/code/kforth (0) $ ./kforth + kforth interpreter + ? 2 3 4 + * 1 SWAP + token: 2. + token: 3. + token: 4. + token: +. + token: *. + token: 1. + token: SWAP. + ok. + ? thistokenistoolong! + parse error: token too long + bye + token: bye. + Goodbye! + ~/code/kforth (0) $ + +Heyo! Now I'm getting somewhere. The next logical step (to me) is to add in a +command parser and a standard vocabulary. + +The snapshot of the code from here is in the tag part-0x03_. + +.. 
_part-0x03: https://github.com/kisom/kforth/tree/part-0x03 \ No newline at end of file diff --git a/kforth.cc b/kforth.cc index 0c0eab2..ebdec1c 100644 --- a/kforth.cc +++ b/kforth.cc @@ -1,11 +1,55 @@ #include "io.h" #include "parser.h" +#include <stdlib.h> + #ifdef __linux__ #include "linux.h" #endif // __linux__ static char ok[] = "ok.\n"; +static char bye[] = "bye"; + +static bool +parser(IO &interface, const char *buf, const size_t buflen) +{ + static size_t offset = 0; + static struct Token token; + static PARSE_RESULT result = PARSE_FAIL; + + offset = 0; + + // reset token + token.token = nullptr; + token.length = 0; + + while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) { + interface.wrbuf((char *)"token: ", 7); + interface.wrbuf(token.token, token.length); + interface.wrln((char *)".", 1); + + // Temporary hack until the interpreter is working further. + if (match_token(token.token, token.length, bye, 3)) { + interface.wrln((char *)"Goodbye!", 8); + exit(0); + } + } + + switch (result) { + case PARSE_EOB: + interface.wrbuf(ok, 4); + return true; + case PARSE_LEN: + interface.wrln((char *)"parse error: token too long", 27); + return false; + case PARSE_FAIL: + interface.wrln((char *)"parser failure", 14); + return false; + default: + interface.wrln((char *)"*** the world is broken ***", 27); + exit(1); + } +} static void interpreter(IO &interface) @@ -14,9 +58,10 @@ interpreter(IO &interface) static char linebuf[81]; while (true) { + interface.wrch('?'); + interface.wrch(' '); buflen = interface.rdbuf(linebuf, 80, true, '\n'); - interface.wrln(linebuf, buflen); - interface.wrbuf(ok, 4); + parser(interface, linebuf, buflen); } } diff --git a/parser.cc b/parser.cc index 08ee351..c18af83 100644 --- a/parser.cc +++ b/parser.cc @@ -1,17 +1,71 @@ #include "defs.h" #include "parser.h" -int +#include <string.h> + +static void +reset(struct Token *t) +{ + t->token = nullptr; + t->length = 0; +} + +bool +match_token(const char *a, const size_t alen, + const char *b, 
const size_t blen) +{ + if (alen != blen) { + return false; + } + + return memcmp(a, b, alen) == 0; +} + +PARSE_RESULT parse_next(const char *buf, const size_t length, size_t *offset, struct Token *token) { - size_t start = *offset; - bool ok = false; + size_t cursor = *offset; - // TODO(skip past whitespace) - // TODO(find next EOC) - if (!ok) { - *offset = start; + // Clear the token. + reset(token); + + if (cursor == length) { + return PARSE_EOB; } - return -1; + + while (cursor <= length) { + if (buf[cursor] != ' ') { + if (buf[cursor] != '\t') { + break; + } + } + + cursor++; + } + + if (cursor == length) { + return PARSE_EOB; + } + + token->token = (char *)buf + cursor; + while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) { + if (buf[cursor] != ' ') { + if (buf[cursor] != '\t') { + cursor++; + token->length++; + continue; + } + } + cursor++; + break; + } + + if (token->length > MAX_TOKEN_LENGTH) { + reset(token); + return PARSE_LEN; + } + + *offset = cursor; + return PARSE_OK; } \ No newline at end of file diff --git a/parser.h b/parser.h index dfb7e21..1af2adf 100644 --- a/parser.h +++ b/parser.h @@ -8,7 +8,15 @@ struct Token { uint8_t length; }; -int parse_next(const char *, const size_t, size_t *, struct Token *); +typedef enum _PARSE_RESULT_ : uint8_t { + PARSE_OK = 0, + PARSE_EOB = 1, // end of buffer + PARSE_LEN = 2, // token is too long + PARSE_FAIL = 3 // catch-all error +} PARSE_RESULT; + +bool match_token(const char *, const size_t, const char *, const size_t); +PARSE_RESULT parse_next(const char *, const size_t, size_t *, struct Token *); #endif // __KF_PARSER_H__