From 63e9a0a3338755ed2582231eee4aeea1a74a32bd Mon Sep 17 00:00:00 2001 From: Kyle Isom Date: Tue, 27 Feb 2018 08:04:17 -0800 Subject: [PATCH] misc/kforth: Interpreter writeup, code cleanups. --- dict.cc | 57 +++++-- dict.h | 4 +- doc/part-0x01.rst | 11 +- doc/part-0x05.rst | 381 +++++++++++++++++++++++++++++++++++++++++++++- io.cc | 4 + kforth.cc | 9 +- parser.cc | 5 + system.h | 1 + word.h | 2 - 9 files changed, 443 insertions(+), 31 deletions(-) diff --git a/dict.cc b/dict.cc index 53ad8cd..96f18b7 100644 --- a/dict.cc +++ b/dict.cc @@ -4,6 +4,8 @@ #include "system.h" #include "word.h" +#include + static bool add(System *sys) { @@ -68,7 +70,7 @@ div(System *sys) return false; } - b *= a; + b /= a; return sys->dstack.push(b); } @@ -125,36 +127,61 @@ rot(System *sys) static bool definitions(System *sys) { - Word *cursor = dict; + Word *cursor = sys->dict; char buf[MAX_TOKEN_LENGTH]; - size_t buflen = 0; - + char line[72]; + size_t buflen = 0, linelen = 0; + bool ready = false; + while (cursor != nullptr) { + if (ready) { + ready = false; + sys->interface->wrln(line, linelen); + linelen = 0; + continue; + } + cursor->getname(buf, &buflen); - sys->interface->wrln(buf, buflen); + + // TODO: get rid of magic numbers + if ((buflen + linelen) > 72) { + ready = true; + continue; + } + memcpy(line + linelen, buf, buflen); + linelen += buflen; + + if (linelen < 71) { + line[linelen++] = ' '; + } + else { + ready = true; + } cursor = cursor->next(); } - + + sys->interface->wrln(line, linelen); return true; } void -init_dict() +init_dict(System *sys) { - dict = new Builtin((const char *)"DEFINITIONS", 11, dict, definitions); - dict = new Builtin((const char *)"+", 1, dict, add); - dict = new Builtin((const char *)"-", 1, dict, sub); - dict = new Builtin((const char *)"*", 1, dict, mul); - dict = new Builtin((const char *)"/", 1, dict, div); - dict = new Builtin((const char *)"SWAP", 4, dict, swap); - dict = new Builtin((const char *)"ROT", 3, dict, rot); + sys->dict = nullptr; + sys->dict = new Builtin((const char *)"DEFINITIONS", 11, sys->dict, definitions); + sys->dict = new Builtin((const char *)"+", 1, sys->dict, add); + sys->dict = new Builtin((const char *)"-", 1, sys->dict, sub); + sys->dict = new Builtin((const char *)"*", 1, sys->dict, mul); + sys->dict = new Builtin((const char *)"/", 1, sys->dict, div); + sys->dict = new Builtin((const char *)"SWAP", 4, sys->dict, swap); + sys->dict = new Builtin((const char *)"ROT", 3, sys->dict, rot); } LOOKUP lookup(struct Token *token, System *sys) { - Word *cursor = dict; + Word *cursor = sys->dict; KF_INT n; if (parse_num(token, &n)) { diff --git a/dict.h b/dict.h index c63181b..e9b5677 100644 --- a/dict.h +++ b/dict.h @@ -6,15 +6,13 @@ #include "system.h" #include "word.h" -static Word *dict = nullptr; - typedef enum _LOOKUP_ : uint8_t { LOOKUP_OK = 0, // Lookup executed properly. LOOKUP_NOTFOUND = 1, // The token isn't in the dictionary. LOOKUP_FAILED = 2 // The word failed to execute. } LOOKUP; -void init_dict(void); +void init_dict(System *); LOOKUP lookup(struct Token *, System *); diff --git a/doc/part-0x01.rst b/doc/part-0x01.rst index 788698a..eb5feb4 100644 --- a/doc/part-0x01.rst +++ b/doc/part-0x01.rst @@ -55,7 +55,7 @@ won't be a target at first, but something to keep in mind as I progress. Eventually, I'd like to build a zero-allocation Forth that can run on an STM-32 or an MSP430, but the first goal is going to get a minimal Forth -working. I'll define the stages as +working. I'll define the stages tentatively as Stage 1 ~~~~~~~ @@ -95,6 +95,15 @@ limiting factor. Fortunately, just a few days before I started this, the TI wiki was updated_ to note that the latest compilers now support C++11 and C++14, so I'll target C++14. +As a reminder to myself: this is not going to be the prettiest or best or most +secure or production ready code. The goal is to have fun writing some software +again and to rekindle some of the joy of computing that I had before. Once I +have something working, I can go back and make an exercise of cleaning it up +and refactoring it. The prose in this series is also not going to be my finest +writing ever --- again, it suffices just to do it. The goal is to have +something to show, not to achieve perfection; it'll mostly going to be hacked +on while I'm on the bus or when I have a bit of downtime here and there. + .. _updated: http://processors.wiki.ti.com/index.php/C%2B%2B_Support_in_TI_Compilers#Status_as_of_February_2018 I don't really know what I'm doing, so in the next section, I'll build out the diff --git a/doc/part-0x05.rst b/doc/part-0x05.rst index f2e0193..2813336 100644 --- a/doc/part-0x05.rst +++ b/doc/part-0x05.rst @@ -4,6 +4,9 @@ Write You a Forth, 0x05 :date: 2018-02-24 12:23 :tags: wyaf, forth +NB: Today's update was pretty large, so I don't show all of the code; this is +what ``git`` is for. + Today I need to start actually doing things with tokens. This requires two things: @@ -30,6 +33,7 @@ stack-aware, so what I've done is define a ``System`` struct in ``system.h``:: typedef struct _System { Stack dstack; + IO *interface; } System; @@ -38,9 +42,382 @@ stack-aware, so what I've done is define a ``System`` struct in ``system.h``:: This will let me later add in support for the return stack and other things that might be useful. Other ideas: adding something like an ``errno`` equivalent to indicate the last error, and storing a dictionary of words. This -will need some restructuring, though. Anyways, this works for now. +will need some restructuring, though. I've already moved the I/O into the +system as well. This took some finangling in ``kforth.cc``; I'm eliding the +diff here because it's so long, but it's basically a ``sed -i -e +'s/interface./sys->interface.``. The Word interface ^^^^^^^^^^^^^^^^^^ -Now I can start defining a Word.h. \ No newline at end of file +Now I can start defining a Word.h. Maybe this is a case of when you have an +object-oriented language, everything looks like a class, but I decided to +design an abstract class for a Word and implement the first concrete class, +**Builtin**. What I came up with was:: + + class Word { + public: + virtual ~Word() {}; + +The *eval* method takes a ``System`` structure and executes some function. + + virtual bool eval(System *) = 0; + +The dictionary is a linked list, so next is used to traverse the list. + + virtual Word *next(void) = 0; + +The ``match`` method is used to determine whether this is the word being +referred to. + + virtual bool match(struct Token *) = 0; + +Finally, ``getname`` will fill in a ``char[MAX_TOKEN_SIZE]`` buffer with the +word's name. + + virtual void getname(char *, size_t *) = 0; + }; + +With the interface defined, I can implement ``Builtins`` (I've elided the +common interface from the listing below to focus on the implementation):: + + class Builtin : public Word { + public: + ~Builtin() {}; + Builtin(const char *name, size_t namelen, Word *head, bool (*fun)(System *)); + + private: + char name[MAX_TOKEN_LENGTH]; + size_t namelen; + Word *prev; + bool (*fun)(System *); + }; + +The ``bool`` works as a first pass, but I think I'm going to have add some +notion of system conditions later on to denote why execution failed. One thing +that both ``pforth`` and ``gforth`` do that I don't yet do is to clear the +stack when there's an execution failure. At least, they clear the stack with an +unrecognised word. The implementation is pretty trivial:: + + #include "defs.h" + #include "parser.h" + #include "system.h" + #include "word.h" + + #include + + + Builtin::Builtin(const char *name, size_t namelen, Word *head, bool (*target)(System *)) + : prev(head), fun(target) + { + memcpy(this->name, name, namelen); + this->namelen = namelen; + } + + bool + Builtin::eval(System *sys) + { + return this->fun(sys); + } + + Word * + Builtin::next() + { + return this->prev; + } + + bool + Builtin::match(struct Token *token) + { + return match_token(this->name, this->namelen, token->token, token->length); + } + + void + Builtin::getname(char *buf, size_t *buflen) + { + memcpy(buf, this->name, this->namelen); + *buflen = namelen; + } + +Right. Now to do something with this. + +The system dictionary +^^^^^^^^^^^^^^^^^^^^^ + +The dictionary's interface is minimal:: + + // dict.h + #ifndef __KF_DICT_H__ + #define __KF_DICT_H__ + + #include "defs.h" + #include "parser.h" + #include "system.h" + #include "word.h" + + typedef enum _LOOKUP_ : uint8_t { + LOOKUP_OK = 0, // Lookup executed properly. + LOOKUP_NOTFOUND = 1, // The token isn't in the dictionary. + LOOKUP_FAILED = 2 // The word failed to execute. + } LOOKUP; + + void init_dict(System *); + LOOKUP lookup(struct Token *, System *); + + #endif // __KF_DICT_H__ + +There's a modicum of differentiation between "everything worked" and "no it +didn't," and by that I mean the lookup can tell you if the word wasn't found +or if there was a problem executing it. + +I added a ``struct Word *dict`` field to the ``System`` struct, since we're +operating on these anyways. I guess it's best to start with the lookup function +first so that when I started adding builtins later it'll be easy to just +recompile and use them. +:: + LOOKUP + lookup(struct Token *token, System *sys) + { + Word *cursor = sys->dict; + KF_INT n; + +I seem to recall from *Programming a Problem-Oriented Language* that Chuck +Moore advocated checking whether a token was a number before looking it up +in the dictionary, so that's the approach I'll take:: + + if (parse_num(token, &n)) { + if (sys->dstack.push(n)) { + return LOOKUP_OK; + } + return LOOKUP_FAILED; + } + +The remainder is pretty much bog-standard linked list traversal:: + + while (cursor != nullptr) { + if (cursor->match(token)) { + if (cursor->eval(sys)) { + return LOOKUP_OK; + } + return LOOKUP_FAILED; + } + cursor = cursor->next(); + } + + return LOOKUP_NOTFOUND; + } + +This needs to get hooked up into the interpreter now; this is going to require +some reworking of the ``parser`` function:: + + static bool + parser(const char *buf, const size_t buflen) + { + static size_t offset = 0; + static struct Token token; + static PARSE_RESULT result = PARSE_FAIL; + static LOOKUP lresult = LOOKUP_FAILED; + static bool stop = false; + + offset = 0; + + // reset token + token.token = nullptr; + token.length = 0; + + while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) { + lresult = lookup(&token, &sys); + switch (lresult) { + case LOOKUP_OK: + continue; + case LOOKUP_NOTFOUND: + sys.interface->wrln((char *)"word not found", 15); + stop = true; + break; + case LOOKUP_FAILED: + sys.interface->wrln((char *)"execution failed", 17); + stop = true; + break; + default: + sys.interface->wrln((char *)"*** the world is broken ***", 27); + exit(1); + } + + if (stop) { + stop = false; + break; + } + } + + switch (result) { + case PARSE_OK: + return false; + case PARSE_EOB: + sys.interface->wrbuf(ok, 4); + return true; + case PARSE_LEN: + sys.interface->wrln((char *)"parse error: token too long", 27); + return false; + case PARSE_FAIL: + sys.interface->wrln((char *)"parser failure", 14); + return false; + default: + sys.interface->wrln((char *)"*** the world is broken ***", 27); + exit(1); + } + } + + +Now I feel like I'm at the part where I can start adding in functionality. The +easiest first builtin: addition. Almost impossible to screw this up, right? +:: + + static bool + add(System *sys) + { + KF_INT a = 0; + KF_INT b = 0; + if (!sys->dstack.pop(&a)) { + return false; + } + + if (!sys->dstack.pop(&b)) { + return false; + } + + b += a; + return sys->dstack.push(b); + } + +Now this needs to go into the ``init_dict`` function:: + + void + init_dict(System *sys) + { + sys->dict = nullptr; + sys->dict = new Builtin((const char *)"+", 1, sys->dict, add); + } + +And this needs to get added into the ``main`` function:: + + int + main(void) + { + init_dict(&sys); + #ifdef __linux__ + Console interface; + sys.interface = &interface; + #endif + sys.interface->wrbuf(banner, bannerlen); + interpreter(); + return 0; + } + +The moment of truth +^^^^^^^^^^^^^^^^^^^ + +Hold on to your pants, let's see what breaks:: + + $ ./kforth + kforth interpreter + <> + ? 2 3 + + ok. + <5> + +Oh hey, look, it actually works. Time to add a few more definitions for good +measure: + ++ the basic arithmetic operators `-`, `*`, `/` ++ the classic `SWAP` and `ROT` words ++ `DEFINITIONS` to look at all the definitions in the language + +These are all pretty simple, fortunately. A few things that tripped me up, +though: + ++ The *a* and *b* names kind of threw me off because I fill *a* first. This + means it's the last number on the stack; this didn't matter for addition, + but in subtraction, it means I had to be careful to do ``b -= a`` rather + than the other way around. + ++ pforth and gfortran both support case insensitive matching, so I had to + modify the token matcher:: + + bool + match_token(const char *a, const size_t alen, + const char *b, const size_t blen) + { + if (alen != blen) { + return false; + } + + for (size_t i = 0; i < alen; i++) { + if (a[i] == b[i]) { + continue; + } + + if (!isalpha(a[i]) || !isalpha(b[i])) { + return false; + } + +The XOR by 0x20 is just a neat trick for inverting the case of a letter. + + if ((a[i] ^ 0x20) == b[i]) { + continue; + } + + if (a[i] == (b[i] ^ 0x20)) { + continue; + } + + return false; + } + return true; + } + ++ I forgot to include the case for ``PARSE_OK`` in the result switch statement + in the ``parser`` function, so I could get a line of code evaluated but then + it'd die with "the world is broken." + ++ When I tried doing some division, I ran into some weird issues:: + + $ ./kforth + kforth interpreter + <> + ? 2 5040 / + ok. + <��> + +It turns out that in ``write_num``, the case where *n = 0* results in nothing +happening, and therefore the buffer just being written. This is a dirty thing, +but I edge cased this:: + + $ git diff io.cc + diff --git a/io.cc b/io.cc + index 77e0e2a..a86156b 100644 + --- a/io.cc + +++ b/io.cc + @@ -24,6 +24,10 @@ write_num(IO *interface, KF_INT n) + n++; + } + } + + else if (n == 0) { + + interface->wrch('0'); + + return; + + } + + while (n != 0) { + char ch = (n % 10) + '0'; + +May the compiler have mercy on my soul and whatnot. + +For you sports fans keeping track at home, here's the classic bugs I've +introduced so far: + +1. bounds overrun +2. missing case in a switch statement + +But now here I am with the interpreter in good shape. Now I can start +implementing the builtins in earnest! + +As before, see the tag `part-0x05 `_. \ No newline at end of file diff --git a/io.cc b/io.cc index 77e0e2a..a86156b 100644 --- a/io.cc +++ b/io.cc @@ -24,6 +24,10 @@ write_num(IO *interface, KF_INT n) n++; } } + else if (n == 0) { + interface->wrch('0'); + return; + } while (n != 0) { char ch = (n % 10) + '0'; diff --git a/kforth.cc b/kforth.cc index 10d8aa2..565b5b0 100644 --- a/kforth.cc +++ b/kforth.cc @@ -11,7 +11,6 @@ #endif // __linux__ static char ok[] = "ok.\n"; -static char bye[] = "bye"; static System sys; @@ -68,12 +67,6 @@ parser(const char *buf, const size_t buflen) stop = false; break; } - - // Temporary hack until the interpreter is working further. - if (match_token(token.token, token.length, bye, 3)) { - sys.interface->wrln((char *)"Goodbye!", 8); - exit(0); - } } switch (result) { @@ -116,7 +109,7 @@ const size_t bannerlen = 19; int main(void) { - init_dict(); + init_dict(&sys); #ifdef __linux__ Console interface; sys.interface = &interface; diff --git a/parser.cc b/parser.cc index f7e35f4..5d26ba4 100644 --- a/parser.cc +++ b/parser.cc @@ -2,6 +2,7 @@ #include "parser.h" #include "stack.h" +#include #include static void @@ -23,6 +24,10 @@ match_token(const char *a, const size_t alen, if (a[i] == b[i]) { continue; } + + if (!isalpha(a[i]) || !isalpha(b[i])) { + return false; + } if ((a[i] ^ 0x20) == b[i]) { continue; diff --git a/system.h b/system.h index 77413d3..00f4a34 100644 --- a/system.h +++ b/system.h @@ -8,6 +8,7 @@ typedef struct _System { Stack dstack; IO *interface; + struct Word *dict; } System; diff --git a/word.h b/word.h index 84b7684..1989e88 100644 --- a/word.h +++ b/word.h @@ -25,7 +25,6 @@ public: Word *next(void); bool match(struct Token *); void getname(char *, size_t *); - private: char name[MAX_TOKEN_LENGTH]; size_t namelen; @@ -33,5 +32,4 @@ private: bool (*fun)(System *); }; - #endif // __KF_WORD_H__ \ No newline at end of file