misc/kforth: Finish parsing.

2018-02-23 14:01:52 -08:00 · 2018-02-23 14:01:52 -08:00 · 0ae7d49593
parent 5bcc016246
commit 0ae7d49593
9 changed files with 432 additions and 14 deletions
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-CXXSTD :=	c++11
+CXXSTD :=	c++14
 CXXFLAGS :=	-std=$(CXXSTD) -Wall -Werror -g -O0
 OBJS :=		linux/io.o	\
 		parser.o	\
--- a/defs.h
+++ b/defs.h
@ -5,5 +5,6 @@
 #include "linux/defs.h"
 #endif

+constexpr size_t	MAX_TOKEN_LENGTH = 16;

 #endif // __KF_DEFS_H__
--- a/doc/index.rst
+++ b/doc/index.rst
@ -8,6 +8,7 @@ Contents:

   part-0x01
   part-0x02
+   part-0x03

 Indices and tables
 ==================
--- a/doc/part-0x01.rst
+++ b/doc/part-0x01.rst
@ -87,5 +87,15 @@ Stage 4
 Next steps
 ^^^^^^^^^^

+I've decided to use C++ for two reasons: it's supported by all the targets I
+want (amd64, arm/arm64, msp430, avr), and I know it well enough (and
+importantly, I know the tooling) to get by. Typically, the TI compilers lag
+behind the others in supporting newer C++ standards, so those will be the
+limiting factor. Fortunately, just a few days before I started this, the TI
+wiki was updated_ to note that the latest compilers now support C++11 and
+C++14, so I'll target C++14.
+
+.. _updated: http://processors.wiki.ti.com/index.php/C%2B%2B_Support_in_TI_Compilers#Status_as_of_February_2018
+
 I don't really know what I'm doing, so in the next section, I'll build out the
 basic framework and set up the build.
--- a/doc/part-0x02.rst
+++ b/doc/part-0x02.rst
@ -20,7 +20,7 @@ The project will also need a build system. For simplicity, I'll at least start
 with a basic Makefile::

  # Makefile
-  CXXSTD :=     c++11
+  CXXSTD :=     c++14
  CXXFLAGS :=   -std=$(CXXSTD) -Werror -Wall -g -O0
  OBJS :=       linux/io.o     \
                kforth.o
@ -271,4 +271,10 @@ definitions from the standard library::
 Next steps
 ^^^^^^^^^^

-I guess the next thing to do will be to start parsing.
+I guess the next thing to do will be to start parsing.
+
+Some housekeeping: I'll keep the state of the code at each part in
+the tag ``part-$PART``; this part, for example, is in the tag
+`part-0x02`_.
+
+.. _part-0x02: https://github.com/kisom/kforth/tree/part-0x02
--- a/doc/part-0x03.rst
+++ b/doc/part-0x03.rst
@ -0,0 +1,293 @@
+Write You a Forth, 0x03
+-----------------------
+
+:date: 2018-02-23 09:36
+:tags: wyaf, forth
+
+Today, I'm working on parsing. I was talking to `steveo
+<https://github.com/steveo>`_ yesterday, and he mentioned string interning, and
+it sounded like a fun thing to do (and then I started thinking about ropes and
+so on).
+
+However, I'm not going to intern strings --- at least, not yet. I'm going to do
+something way more primitive::
+
+  bool match_token(const char *a, const size_t alen,
+                   const char *b, const size_t blen)
+  {
+          if (alen != blen) {
+                  return false;
+          }
+
+          return memcmp(a, b, alen) == 0;
+  }
+
+I'd also like to operate on a buffer without having to store a bunch of copies
+of strings. Performance may not be the number one concern here, but I think
+it'll be more fun to implement, and it will be a little easier. The parser
+should return the next token that we can push off to the rest of the process.
+It seems like we'll want a structure for that.
+
+``parser.h``
+^^^^^^^^^^^^
+
+The parser seems like it really only needs a few things, so time to take a stab at
+``parser.h``::
+
+        #ifndef __KF_PARSER_H__
+        #define __KF_PARSER_H__
+
+        #include "defs.h"
+
+A ``Token`` can be defined as just the pointer to the start of the token and
+its length. There's a limit to the maximum size of the buffer, and it'll be
+important to check the length of the token. For simplicity, I'm going to define
+the maximum length of a token as 16, and I'll put this as a ``constexpr`` in the
+``defs.h`` file.
+::
+
+        struct Token {
+                char    *token;
+                uint8_t  length;
+        };
+
+Next up is to define the function from before for matching tokens.
+::
+
+        bool    match_token(const char *, const size_t, const char *, const size_t);
+
+The meat of the parser is ``parse_next``, for which we'll also need some return codes.
+::
+
+        typedef enum _PARSE_RESULT_ : uint8_t {
+                PARSE_OK = 0,  // token now has a valid token.
+                PARSE_EOB = 1, // end of buffer, parsing a line should stop.
+                PARSE_LEN = 2, // token is too long
+                PARSE_FAIL = 3 // catch-all error
+        } PARSE_RESULT;
+
+        PARSE_RESULT    parse_next(const char *, const size_t, size_t *, struct Token *);
+
+        #endif // __KF_PARSER_H__
+
+``parser.cc``
+^^^^^^^^^^^^^^
+
+``parser.cc`` will open with a helper to reset tokens and the same
+matching code I mentioned before::
+
+        #include "defs.h"
+        #include "parser.h"
+
+        #include <string.h>
+
+        static void
+        reset(struct Token *t)
+        {
+                t->token = nullptr;
+                t->length = 0;
+        }
+
+        bool
+        match_token(const char *a, const size_t alen,
+                const char *b, const size_t blen)
+        {
+                if (alen != blen) {
+                        return false;
+                }
+
+                return memcmp(a, b, alen) == 0;
+        }
+
+At the start of the parser, I'm going to reset the token; if there's a failure,
+there shouldn't be a valid token anyhow.
+::
+
+        PARSE_RESULT
+        parse_next(const char *buf, const size_t length, size_t *offset,
+                struct Token *token)
+        {
+                size_t	 cursor = *offset;
+
+                // Clear the token.
+                reset(token);
+
+If the offset is already at the end of the buffer, there's no more work to do
+on this buffer, so I'll cut out early with ``PARSE_EOB``. If I was doing a more
+careful job of programming this, I'd *generally* try to avoid multiple returns,
+but in this case, having working code is more important than awesome code.
+::
+                
+                if (cursor == length) {
+                        return PARSE_EOB;
+                }
+
+I'm going to assume that tokens are separated by spaces or tabs. I wasn't going
+to support tabs at first, but it's easy enough to do that I just included it.
+::
+
+                while (cursor <= length) {
+                        if (buf[cursor] != ' ') {
+                                if (buf[cursor] != '\t') {
+                                        break;
+                                }
+                        }
+                        cursor++;
+                }
+
+This part might seem superfluous, but it's important in case there's trailing
+whitespace in the buffer. I haven't touched the token yet, so no need to reset
+it.
+::
+
+                if (cursor == length) {
+                        return PARSE_EOB;
+                }
+
+Now I can point the token to the buffer at the start of the next token and walk
+through the buffer until the end of the buffer or the first whitespace
+character::
+
+                token->token = (char *)buf + cursor;
+                while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) {
+                        if (buf[cursor] != ' ') {
+                                if (buf[cursor] != '\t') {
+                                        cursor++;
+                                        token->length++;
+                                        continue;
+                                }
+                        }
+
+This got me at first and took me a few minutes to figure out. If the cursor
+isn't updated at the end, the next run of the parser is going to be stuck on
+this word as the cursor doesn't point to whitespace anymore.
+::
+
+                        cursor++;
+                        break;
+                }
+
+Finally, if the token length hasn't been exceeded, the offset can be updated
+and the token returned::
+
+                if (token->length > MAX_TOKEN_LENGTH) {
+                        reset(token);
+                        return PARSE_LEN;
+                }
+
+                *offset = cursor;
+                return PARSE_OK;
+        }
+
+``kforth.cc``
+^^^^^^^^^^^^^
+
+That's all of ``parser.cc`` (at least for now), but this needs to be integrated
+into the frontend. ``kforth.cc`` now starts off with::
+
+        #include "io.h"
+        #include "parser.h"
+
+        #include <stdlib.h>
+
+        #ifdef __linux__
+        #include "linux.h"
+        #endif // __linux__
+
+        static char     ok[] = "ok.\n";
+        static char     bye[] = "bye";
+
+        static bool
+        parser(IO &interface, const char *buf, const size_t buflen)
+        {
+                static size_t           offset = 0;
+                static struct Token     token;
+                static PARSE_RESULT     result = PARSE_FAIL;
+
+                offset = 0;
+
+                // reset token
+                token.token = nullptr;
+                token.length = 0;
+
+                while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) {
+                        interface.wrbuf((char *)"token: ", 7);
+                        interface.wrbuf(token.token, token.length);
+                        interface.wrln((char *)".", 1);
+
+There's no command parser right now, so I've added in this hack so it starts to
+feel a little like a Forth.
+::
+
+                        if (match_token(token.token, token.length, bye, 3)) {
+                                interface.wrln((char *)"Goodbye!", 8);
+                                exit(0);
+                        }
+                }
+
+                switch (result) {
+                case PARSE_EOB:
+                        interface.wrbuf(ok, 4);
+                        return true;
+                case PARSE_LEN:
+                        interface.wrln((char *)"parse error: token too long", 27);
+                        return false;
+                case PARSE_FAIL:
+                        interface.wrln((char *)"parser failure", 14);
+                        return false;
+                default:
+                        interface.wrln((char *)"*** the world is broken ***", 27);
+                        exit(1);
+                }
+        }
+
+        static void
+        interpreter(IO &interface)
+        {
+                static size_t buflen = 0;
+                static char linebuf[81];
+
+                while (true) {
+                        interface.wrch('?');
+                        interface.wrch(' ');
+                        buflen = interface.rdbuf(linebuf, 80, true, '\n');
+
+The return value is being ignored right now, but later on it might be useful.
+::
+
+                        parser(interface, linebuf, buflen);
+                }
+        }
+
+But does it work?
+::
+
+        ~/code/kforth (0) $ make
+        g++ -std=c++14 -Wall -Werror -g -O0   -c -o linux/io.o linux/io.cc
+        g++ -std=c++14 -Wall -Werror -g -O0   -c -o parser.o parser.cc
+        g++ -std=c++14 -Wall -Werror -g -O0   -c -o kforth.o kforth.cc
+        g++  -o kforth linux/io.o parser.o kforth.o
+        ~/code/kforth (0) $ ./kforth 
+        kforth interpreter
+        ? 2 3 4 + * 1 SWAP  
+        token: 2.
+        token: 3.
+        token: 4.
+        token: +.
+        token: *.
+        token: 1.
+        token: SWAP.
+        ok.
+        ? thistokenistoolong!
+        parse error: token too long
+        ? bye
+        token: bye.
+        Goodbye!
+        ~/code/kforth (0) $ 
+
+Heyo! Now I'm getting somewhere. The next logical step (to me) is to add in a
+command parser and a standard vocabulary.
+
+The snapshot of the code from here is in the tag part-0x03_.
+
+.. _part-0x03: https://github.com/kisom/kforth/tree/part-0x03
--- a/kforth.cc
+++ b/kforth.cc
@ -1,11 +1,55 @@
 #include "io.h"
 #include "parser.h"

+#include <stdlib.h>
+
 #ifdef __linux__
 #include "linux.h"
 #endif // __linux__

 static char     ok[] = "ok.\n";
+static char	bye[] = "bye";
+
+static bool
+parser(IO &interface, const char *buf, const size_t buflen)
+{
+	static size_t		offset = 0;
+	static struct Token	token;
+	static PARSE_RESULT	result = PARSE_FAIL;
+
+	offset = 0;
+
+	// reset token
+	token.token = nullptr;
+	token.length = 0;
+
+	while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) {
+		interface.wrbuf((char *)"token: ", 7);
+		interface.wrbuf(token.token, token.length);
+		interface.wrln((char *)".", 1);
+
+		// Temporary hack until the interpreter is working further.
+		if (match_token(token.token, token.length, bye, 3)) {
+			interface.wrln((char *)"Goodbye!", 8);
+			exit(0);
+		}
+	}
+
+	switch (result) {
+	case PARSE_EOB:
+		interface.wrbuf(ok, 4);
+		return true;
+	case PARSE_LEN:
+		interface.wrln((char *)"parse error: token too long", 27);
+		return false;
+	case PARSE_FAIL:
+		interface.wrln((char *)"parser failure", 14);
+		return false;
+	default:
+		interface.wrln((char *)"*** the world is broken ***", 27);
+		exit(1);
+	}
+}

 static void
 interpreter(IO &interface)
@ -14,9 +58,10 @@ interpreter(IO &interface)
 	static char linebuf[81];

 	while (true) {
+		interface.wrch('?');
+		interface.wrch(' ');
 		buflen = interface.rdbuf(linebuf, 80, true, '\n');
-		interface.wrln(linebuf, buflen);
-		interface.wrbuf(ok, 4);
+		parser(interface, linebuf, buflen);
 	}
 }

--- a/parser.cc
+++ b/parser.cc
@ -1,17 +1,71 @@
 #include "defs.h"
 #include "parser.h"

-int
+#include <string.h>
+
+static void
+reset(struct Token *t)
+{
+	t->token = nullptr;
+	t->length = 0;
+}
+
+bool
+match_token(const char *a, const size_t alen,
+	    const char *b, const size_t blen)
+{
+	if (alen != blen) {
+		return false;
+	}
+
+	return memcmp(a, b, alen) == 0;
+}
+
+PARSE_RESULT
 parse_next(const char *buf, const size_t length, size_t *offset,
 	   struct Token *token)
 {
-	size_t	start = *offset;
-	bool	ok = false;
+	size_t	 cursor = *offset;

-	// TODO(skip past whitespace)
-	// TODO(find next EOC)
-	if (!ok) {
-		*offset = start;
+	// Clear the token.
+	reset(token);
+
+	if (cursor == length) {
+		return PARSE_EOB;
 	}
-	return -1;
+
+	while (cursor <= length) {
+		if (buf[cursor] != ' ') {
+			if (buf[cursor] != '\t') {
+				break;
+			}
+		}
+
+		cursor++;
+	}
+
+	if (cursor == length) {
+		return PARSE_EOB;
+	}
+
+	token->token = (char *)buf + cursor;
+	while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) {
+		if (buf[cursor] != ' ') {
+			if (buf[cursor] != '\t') {
+				cursor++;
+				token->length++;
+				continue;
+			}
+		}
+		cursor++;
+		break;
+	}
+
+	if (token->length > MAX_TOKEN_LENGTH) {
+		reset(token);
+		return PARSE_LEN;
+	}
+
+	*offset = cursor;
+	return PARSE_OK;
 }
--- a/parser.h
+++ b/parser.h
@ -8,7 +8,15 @@ struct Token {
 	uint8_t	 length;
 };

-int	parse_next(const char *, const size_t, size_t *, struct Token *);
+typedef enum _PARSE_RESULT_ : uint8_t {
+	PARSE_OK = 0,
+	PARSE_EOB = 1, // end of buffer
+	PARSE_LEN = 2, // token is too long
+	PARSE_FAIL = 3 // catch-all error
+} PARSE_RESULT;
+
+bool		match_token(const char *, const size_t, const char *, const size_t);
+PARSE_RESULT	parse_next(const char *, const size_t, size_t *, struct Token *);


 #endif // __KF_PARSER_H__