misc/kforth: Interpreter writeup, code cleanups.

2018-02-27 08:04:17 -08:00
parent 14dc525084
commit 63e9a0a333
9 changed files with 443 additions and 31 deletions
--- a/dict.cc
+++ b/dict.cc
@@ -4,6 +4,8 @@
 #include "system.h"
 #include "word.h"
 #include <string.h>
 static bool
 add(System *sys)
 {
@@ -68,7 +70,7 @@ div(System *sys)
 		return false;
 	}
-	b *= a;
+	b /= a;
 	return sys->dstack.push(b);
 }
@@ -125,36 +127,61 @@ rot(System *sys)
 static bool
 definitions(System *sys)
 {
-	Word	*cursor = dict;
+	Word	*cursor = sys->dict;
 	char	 buf[MAX_TOKEN_LENGTH];
-	size_t	 buflen = 0;
+	char	 line[72];
 	size_t	 buflen = 0, linelen = 0;
 	bool	 ready = false;
 	while (cursor != nullptr) {
 		if (ready) {
 			ready = false;
 			sys->interface->wrln(line, linelen);
 			linelen = 0;
 			continue;
 		}
 		cursor->getname(buf, &buflen);
-		sys->interface->wrln(buf, buflen);
+
 		// TODO: get rid of magic numbers
 		if ((buflen + linelen) > 72) {
 			ready = true;
 			continue;
 		}
 		memcpy(line + linelen, buf, buflen);
 		linelen += buflen;
 		if (linelen < 71) {
 			line[linelen++] = ' ';
 		}
 		else {
 			ready = true;
 		}
 		cursor = cursor->next();
 	}
 	sys->interface->wrln(line, linelen);
 	return true;
 }
 void
-init_dict()
+init_dict(System *sys)
 {
-	dict = new Builtin((const char *)"DEFINITIONS", 11, dict, definitions);
+	sys->dict = nullptr;
-	dict = new Builtin((const char *)"+", 1, dict, add);
+	sys->dict = new Builtin((const char *)"DEFINITIONS", 11, sys->dict, definitions);
-	dict = new Builtin((const char *)"-", 1, dict, sub);
+	sys->dict = new Builtin((const char *)"+", 1, sys->dict, add);
-	dict = new Builtin((const char *)"*", 1, dict, mul);
+	sys->dict = new Builtin((const char *)"-", 1, sys->dict, sub);
-	dict = new Builtin((const char *)"/", 1, dict, div);
+	sys->dict = new Builtin((const char *)"*", 1, sys->dict, mul);
-	dict = new Builtin((const char *)"SWAP", 4, dict, swap);
+	sys->dict = new Builtin((const char *)"/", 1, sys->dict, div);
-	dict = new Builtin((const char *)"ROT", 3, dict, rot);
+	sys->dict = new Builtin((const char *)"SWAP", 4, sys->dict, swap);
 	sys->dict = new Builtin((const char *)"ROT", 3, sys->dict, rot);
 }
 LOOKUP
 lookup(struct Token *token, System *sys)
 {
-	Word	*cursor = dict;
+	Word	*cursor = sys->dict;
 	KF_INT	 n;
 	if (parse_num(token, &n)) {
--- a/dict.h
+++ b/dict.h
@@ -6,15 +6,13 @@
 #include "system.h"
 #include "word.h"
 static	Word	*dict = nullptr;
 typedef enum _LOOKUP_ : uint8_t {
 	LOOKUP_OK = 0,	     // Lookup executed properly.
 	LOOKUP_NOTFOUND = 1, // The token isn't in the dictionary.
 	LOOKUP_FAILED = 2    // The word failed to execute.
 } LOOKUP;
-void	init_dict(void);
+void	init_dict(System *);
 LOOKUP	lookup(struct Token *, System *);
--- a/doc/part-0x01.rst
+++ b/doc/part-0x01.rst
@@ -55,7 +55,7 @@ won't be a target at first, but something to keep in mind as I progress.
 Eventually, I'd like to build a zero-allocation Forth that can run on an
 STM-32 or an MSP430, but the first goal is going to get a minimal Forth
-working. I'll define the stages as
+working. I'll define the stages tentatively as
 Stage 1
 ~~~~~~~
@@ -95,6 +95,15 @@ limiting factor. Fortunately, just a few days before I started this, the TI
 wiki was updated_ to note that the latest compilers now support C++11 and
 C++14, so I'll target C++14.
 As a reminder to myself: this is not going to be the prettiest or best or most
 secure or production ready code. The goal is to have fun writing some software
 again and to rekindle some of the joy of computing that I had before. Once I
 have something working, I can go back and make an exercise of cleaning it up
 and refactoring it. The prose in this series is also not going to be my finest
 writing ever --- again, it suffices just to do it. The goal is to have
 something to show, not to achieve perfection; it'll mostly going to be hacked
 on while I'm on the bus or when I have a bit of downtime here and there.
 .. _updated: http://processors.wiki.ti.com/index.php/C%2B%2B_Support_in_TI_Compilers#Status_as_of_February_2018
 I don't really know what I'm doing, so in the next section, I'll build out the
--- a/doc/part-0x05.rst
+++ b/doc/part-0x05.rst
@@ -4,6 +4,9 @@ Write You a Forth, 0x05
 :date: 2018-02-24 12:23
 :tags: wyaf, forth
 NB: Today's update was pretty large, so I don't show all of the code; this is
 what ``git`` is for.
 Today I need to start actually doing things with tokens. This requires two
 things:
@@ -30,6 +33,7 @@ stack-aware, so what I've done is define a ``System`` struct in ``system.h``::
        typedef struct _System {
                Stack<KF_INT>        dstack;
                IO                  *interface;
        } System;
@@ -38,9 +42,382 @@ stack-aware, so what I've done is define a ``System`` struct in ``system.h``::
 This will let me later add in support for the return stack and other things
 that might be useful. Other ideas: adding something like an ``errno``
 equivalent to indicate the last error, and storing a dictionary of words. This
-will need some restructuring, though. Anyways, this works for now.
+will need some restructuring, though. I've already moved the I/O into the
 system as well. This took some finangling in ``kforth.cc``; I'm eliding the
 diff here because it's so long, but it's basically a ``sed -i -e
 's/interface./sys->interface.``.
 The Word interface
 ^^^^^^^^^^^^^^^^^^
-Now I can start defining a Word.h.
+Now I can start defining a Word.h. Maybe this is a case of when you have an
 object-oriented language, everything looks like a class, but I decided to
 design an abstract class for a Word and implement the first concrete class,
 **Builtin**. What I came up with was::
    class Word {
    public:
            virtual ~Word() {};
 The *eval* method takes a ``System`` structure and executes some function.
            virtual bool  eval(System *) = 0;
 The dictionary is a linked list, so next is used to traverse the list.
            virtual Word *next(void) = 0;
 The ``match`` method is used to determine whether this is the word being
 referred to.
            virtual bool  match(struct Token *) = 0;
 Finally, ``getname`` will fill in a ``char[MAX_TOKEN_SIZE]`` buffer with the
 word's name.
            virtual void  getname(char *, size_t *) = 0;
    };
 With the interface defined, I can implement ``Builtins`` (I've elided the
 common interface from the listing below to focus on the implementation)::
        class Builtin : public Word {
        public:
                ~Builtin() {};
                Builtin(const char *name, size_t namelen, Word *head, bool (*fun)(System *));
        private:
                char		 name[MAX_TOKEN_LENGTH];
                size_t		 namelen;
                Word		*prev;
                bool		(*fun)(System *);
        };
 The ``bool`` works as a first pass, but I think I'm going to have add some
 notion of system conditions later on to denote why execution failed. One thing
 that both ``pforth`` and ``gforth`` do that I don't yet do is to clear the
 stack when there's an execution failure. At least, they clear the stack with an
 unrecognised word. The implementation is pretty trivial::
        #include "defs.h"
        #include "parser.h"
        #include "system.h"
        #include "word.h"
        #include <string.h>
        Builtin::Builtin(const char *name, size_t namelen, Word *head, bool (*target)(System *))
                : prev(head), fun(target)
        {
                memcpy(this->name, name, namelen);
                this->namelen = namelen;
        }
        bool
        Builtin::eval(System *sys)
        {
                return this->fun(sys);
        }
        Word *
        Builtin::next()
        {
                return this->prev;
        }
        bool
        Builtin::match(struct Token *token)
        {
                return match_token(this->name, this->namelen, token->token, token->length);
        }
        void
        Builtin::getname(char *buf, size_t *buflen)
        {
                memcpy(buf, this->name, this->namelen);
                *buflen = namelen;
        }
 Right. Now to do something with this.
 The system dictionary
 ^^^^^^^^^^^^^^^^^^^^^
 The dictionary's interface is minimal::
        // dict.h
        #ifndef __KF_DICT_H__
        #define __KF_DICT_H__
        #include "defs.h"
        #include "parser.h"
        #include "system.h"
        #include "word.h"
        typedef enum _LOOKUP_ : uint8_t {
                LOOKUP_OK = 0,	     // Lookup executed properly.
                LOOKUP_NOTFOUND = 1, // The token isn't in the dictionary.
                LOOKUP_FAILED = 2    // The word failed to execute.
        } LOOKUP;
        void	init_dict(System *);
        LOOKUP	lookup(struct Token *, System *);
        #endif // __KF_DICT_H__
 There's a modicum of differentiation between "everything worked" and "no it
 didn't," and by that I mean the lookup can tell you if the word wasn't found
 or if there was a problem executing it.
 I added a ``struct Word *dict`` field to the ``System`` struct, since we're
 operating on these anyways. I guess it's best to start with the lookup function
 first so that when I started adding builtins later it'll be easy to just
 recompile and use them.
 ::
        LOOKUP
        lookup(struct Token *token, System *sys)
        {
                Word	*cursor = sys->dict;
                KF_INT	 n;
 I seem to recall from *Programming a Problem-Oriented Language* that Chuck
 Moore advocated checking whether a token was a number before looking it up
 in the dictionary, so that's the approach I'll take:: 
                if (parse_num(token, &n)) {
                        if (sys->dstack.push(n)) {
                                return LOOKUP_OK;
                        }
                        return LOOKUP_FAILED;
                }
 The remainder is pretty much bog-standard linked list traversal::
                while (cursor != nullptr) {
                        if (cursor->match(token)) {
                                if (cursor->eval(sys)) {
                                        return LOOKUP_OK;
                                }
                                return LOOKUP_FAILED;
                        }
                        cursor = cursor->next();
                }
                return LOOKUP_NOTFOUND;
        }
 This needs to get hooked up into the interpreter now; this is going to require
 some reworking of the ``parser`` function::
        static bool
        parser(const char *buf, const size_t buflen)
        {
                static size_t		offset = 0;
                static struct Token	token;
                static PARSE_RESULT	result = PARSE_FAIL;
                static LOOKUP		lresult = LOOKUP_FAILED;
                static bool		stop = false;
                offset = 0;
                // reset token
                token.token = nullptr;
                token.length = 0;
                while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) {
                        lresult = lookup(&token, &sys);
                        switch (lresult) {
                        case LOOKUP_OK:
                                continue;
                        case LOOKUP_NOTFOUND:
                                sys.interface->wrln((char *)"word not found", 15);
                                stop = true;
                                break;
                        case LOOKUP_FAILED:
                                sys.interface->wrln((char *)"execution failed", 17);
                                stop = true;
                                break;
                        default:
                                sys.interface->wrln((char *)"*** the world is broken ***", 27);
                                exit(1);
                        }
                        if (stop) {
                                stop = false;
                                break;
                        }
                }
                switch (result) {
                case PARSE_OK:
                        return false;
                case PARSE_EOB:
                        sys.interface->wrbuf(ok, 4);
                        return true;
                case PARSE_LEN:
                        sys.interface->wrln((char *)"parse error: token too long", 27);
                        return false;
                case PARSE_FAIL:
                        sys.interface->wrln((char *)"parser failure", 14);
                        return false;
                default:
                        sys.interface->wrln((char *)"*** the world is broken ***", 27);
                        exit(1);
                }
        }
 Now I feel like I'm at the part where I can start adding in functionality. The
 easiest first builtin: addition. Almost impossible to screw this up, right?
 ::
        static bool
        add(System *sys)
        {
                KF_INT	a = 0;
                KF_INT	b = 0;
                if (!sys->dstack.pop(&a)) {
                        return false;
                }
                if (!sys->dstack.pop(&b)) {
                        return false;
                }
                b += a;
                return sys->dstack.push(b);
        }
 Now this needs to go into the ``init_dict`` function::
        void
        init_dict(System *sys)
        {
                sys->dict = nullptr;
                sys->dict = new Builtin((const char *)"+", 1, sys->dict, add);
        }
 And this needs to get added into the ``main`` function::
        int
        main(void)
        {
                init_dict(&sys);
        #ifdef __linux__
                Console interface;
                sys.interface = &interface;
        #endif
                sys.interface->wrbuf(banner, bannerlen);
                interpreter();
                return 0;
        }
 The moment of truth
 ^^^^^^^^^^^^^^^^^^^
 Hold on to your pants, let's see what breaks::
        $ ./kforth
        kforth interpreter
        <>
        ? 2 3 +
        ok.
        <5>
 Oh hey, look, it actually works. Time to add a few more definitions for good
 measure:
 + the basic arithmetic operators `-`, `*`, `/`
 + the classic `SWAP` and `ROT` words
 + `DEFINITIONS` to look at all the definitions in the language
 These are all pretty simple, fortunately. A few things that tripped me up,
 though:
 + The *a* and *b* names kind of threw me off because I fill *a* first. This
  means it's the last number on the stack; this didn't matter for addition,
  but in subtraction, it means I had to be careful to do ``b -= a`` rather
  than the other way around.
 + pforth and gfortran both support case insensitive matching, so I had to
  modify the token matcher::
        bool
        match_token(const char *a, const size_t alen,
                const char *b, const size_t blen)
        {
                if (alen != blen) {
                        return false;
                }
                for (size_t i = 0; i < alen; i++) {
                        if (a[i] == b[i]) {
                                continue;
                        }
                        if (!isalpha(a[i]) || !isalpha(b[i])) {
                                return false;
                        }
 The XOR by 0x20 is just a neat trick for inverting the case of a letter.
                        if ((a[i] ^ 0x20) == b[i]) {
                                continue;
                        }
                        if (a[i] == (b[i] ^ 0x20)) {
                                continue;
                        }
                        return false;
                }
                return true;
        }
 + I forgot to include the case for ``PARSE_OK`` in the result switch statement
  in the ``parser`` function, so I could get a line of code evaluated but then
  it'd die with "the world is broken."
 + When I tried doing some division, I ran into some weird issues::
        $ ./kforth 
        kforth interpreter
        <>
        ? 2 5040 /
        ok.
        <<3C><>>
 It turns out that in ``write_num``, the case where *n = 0* results in nothing
 happening, and therefore the buffer just being written. This is a dirty thing,
 but I edge cased this::
        $ git diff io.cc 
        diff --git a/io.cc b/io.cc
        index 77e0e2a..a86156b 100644
        --- a/io.cc
        +++ b/io.cc
        @@ -24,6 +24,10 @@ write_num(IO *interface, KF_INT n)
                                n++;
                        }
                }
        +       else if (n == 0) {
        +               interface->wrch('0');
        +               return;
        +       }
                while (n != 0) {
                        char ch = (n % 10) + '0';
 May the compiler have mercy on my soul and whatnot.
 For you sports fans keeping track at home, here's the classic bugs I've
 introduced so far:
 1. bounds overrun
 2. missing case in a switch statement
 But now here I am with the interpreter in good shape. Now I can start
 implementing the builtins in earnest!
 As before, see the tag `part-0x05 <https://github.com/kisom/kforth/tree/part-0x05>`_.
--- a/io.cc
+++ b/io.cc
@@ -24,6 +24,10 @@ write_num(IO *interface, KF_INT n)
 			n++;
 		}
 	}
 	else if (n == 0) {
 		interface->wrch('0');
 		return;
 	}
 	while (n != 0) {
 		char ch = (n % 10) + '0';
--- a/kforth.cc
+++ b/kforth.cc
@@ -11,7 +11,6 @@
 #endif // __linux__
 static char     ok[] = "ok.\n";
 static char	bye[] = "bye";
 static System	sys;
@@ -68,12 +67,6 @@ parser(const char *buf, const size_t buflen)
 			stop = false;
 			break;
 		}
 		// Temporary hack until the interpreter is working further.
 		if (match_token(token.token, token.length, bye, 3)) {
 			sys.interface->wrln((char *)"Goodbye!", 8);
 			exit(0);
 		}
 	}
 	switch (result) {
@@ -116,7 +109,7 @@ const size_t	bannerlen = 19;
 int
 main(void)
 {
-	init_dict();
+	init_dict(&sys);
 #ifdef __linux__
 	Console interface;
 	sys.interface = &interface;
--- a/parser.cc
+++ b/parser.cc
@@ -2,6 +2,7 @@
 #include "parser.h"
 #include "stack.h"
 #include <ctype.h>
 #include <string.h>
 static void
@@ -24,6 +25,10 @@ match_token(const char *a, const size_t alen,
 			continue;
 		}
 		if (!isalpha(a[i]) || !isalpha(b[i])) {
 			return false;
 		}
 		if ((a[i] ^ 0x20) == b[i]) {
 			continue;
 		}
--- a/system.h
+++ b/system.h
@@ -8,6 +8,7 @@
 typedef struct _System {
 	Stack<KF_INT>	 dstack;
 	IO		*interface;
 	struct Word	*dict;
 } System;
--- a/word.h
+++ b/word.h
@@ -25,7 +25,6 @@ public:
 	Word *next(void);
 	bool  match(struct Token *);
 	void  getname(char *, size_t *);
 private:
 	char		 name[MAX_TOKEN_LENGTH];
 	size_t		 namelen;
@@ -33,5 +32,4 @@ private:
 	bool		(*fun)(System *);
 };
 #endif // __KF_WORD_H__