misc/kforth: Finish parsing.
This commit is contained in:
parent
5bcc016246
commit
0ae7d49593
2
Makefile
2
Makefile
|
@ -1,4 +1,4 @@
|
||||||
CXXSTD := c++11
|
CXXSTD := c++14
|
||||||
CXXFLAGS := -std=$(CXXSTD) -Wall -Werror -g -O0
|
CXXFLAGS := -std=$(CXXSTD) -Wall -Werror -g -O0
|
||||||
OBJS := linux/io.o \
|
OBJS := linux/io.o \
|
||||||
parser.o \
|
parser.o \
|
||||||
|
|
1
defs.h
1
defs.h
|
@ -5,5 +5,6 @@
|
||||||
#include "linux/defs.h"
|
#include "linux/defs.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
constexpr size_t MAX_TOKEN_LENGTH = 16;
|
||||||
|
|
||||||
#endif // __KF_DEFS_H__
|
#endif // __KF_DEFS_H__
|
|
@ -8,6 +8,7 @@ Contents:
|
||||||
|
|
||||||
part-0x01
|
part-0x01
|
||||||
part-0x02
|
part-0x02
|
||||||
|
part-0x03
|
||||||
|
|
||||||
Indices and tables
|
Indices and tables
|
||||||
==================
|
==================
|
||||||
|
|
|
@ -87,5 +87,15 @@ Stage 4
|
||||||
Next steps
|
Next steps
|
||||||
^^^^^^^^^^
|
^^^^^^^^^^
|
||||||
|
|
||||||
|
I've decided to use C++ for two reasons: it's supported by all the targets I
|
||||||
|
want (amd64, arm/arm64, msp430, avr), and I know it well enough (and
|
||||||
|
importantly, I know the tooling) to get by. Typically, the TI compilers lag
|
||||||
|
behind the others in supporting newer C++ standards, so those will be the
|
||||||
|
limiting factor. Fortunately, just a few days before I started this, the TI
|
||||||
|
wiki was updated_ to note that the latest compilers now support C++11 and
|
||||||
|
C++14, so I'll target C++14.
|
||||||
|
|
||||||
|
.. _updated: http://processors.wiki.ti.com/index.php/C%2B%2B_Support_in_TI_Compilers#Status_as_of_February_2018
|
||||||
|
|
||||||
I don't really know what I'm doing, so in the next section, I'll build out the
|
I don't really know what I'm doing, so in the next section, I'll build out the
|
||||||
basic framework and set up the build.
|
basic framework and set up the build.
|
|
@ -20,7 +20,7 @@ The project will also need a build system. For simplicity, I'll at least start
|
||||||
with a basic Makefile::
|
with a basic Makefile::
|
||||||
|
|
||||||
# Makefile
|
# Makefile
|
||||||
CXXSTD := c++11
|
CXXSTD := c++14
|
||||||
CXXFLAGS := -std=$(CXXSTD) -Werror -Wall -g -O0
|
CXXFLAGS := -std=$(CXXSTD) -Werror -Wall -g -O0
|
||||||
OBJS := linux/io.o \
|
OBJS := linux/io.o \
|
||||||
kforth.o
|
kforth.o
|
||||||
|
@ -272,3 +272,9 @@ Next steps
|
||||||
^^^^^^^^^^
|
^^^^^^^^^^
|
||||||
|
|
||||||
I guess the next thing to do will be to start parsing.
|
I guess the next thing to do will be to start parsing.
|
||||||
|
|
||||||
|
Some housekeeping: I'll keep the state of the code at each part in
|
||||||
|
the tag ``part-$PART``; this part, for example, is in the tag
|
||||||
|
`part-0x02`_.
|
||||||
|
|
||||||
|
.. _part-0x02: https://github.com/kisom/kforth/tree/part-0x02
|
|
@ -0,0 +1,293 @@
|
||||||
|
Write You a Forth, 0x03
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
:date: 2018-02-23 09:36
|
||||||
|
:tags: wyaf, forth
|
||||||
|
|
||||||
|
Today, I'm working on parsing. I was talking to `steveo
|
||||||
|
<https://github.com/steveo>`_ yesterday, and he mentioned string interning, and
|
||||||
|
it sounded like a fun thing to do (and then I started thinking about ropes and
|
||||||
|
so on).
|
||||||
|
|
||||||
|
However, I'm not going to intern strings --- at least, not yet. I'm going to do
|
||||||
|
something way more primitive::
|
||||||
|
|
||||||
|
bool match_token(const char *a, const size_t alen,
|
||||||
|
const char *b, const size_t blen)
|
||||||
|
{
|
||||||
|
if (alen != blen) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return memcmp(a, b, alen) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
I'd also like to operate on a buffer without having to store a bunch of copies
|
||||||
|
of strings. Performance may not be the number one concern here, but I think
|
||||||
|
it'll be more fun to implement, and it will be a little easier. The parser
|
||||||
|
should return the next token that we can push off to the rest of the process.
|
||||||
|
It seems like we'll want a structure for that.
|
||||||
|
|
||||||
|
``parser.h``
|
||||||
|
^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The parser seems like it really only needs a few things, so time to take a stab at
|
||||||
|
``parser.h``::
|
||||||
|
|
||||||
|
#ifndef __KF_PARSER_H__
|
||||||
|
#define __KF_PARSER_H__
|
||||||
|
|
||||||
|
#include "defs.h"
|
||||||
|
|
||||||
|
A ``Token`` can be defined as just the pointer to the start of the token and
|
||||||
|
its length. There's a limit to the maximum size of the buffer, and it'll be
|
||||||
|
important to check the length of the token. For simplicity, I'm going to define
|
||||||
|
the maximum length of a token as 16, and I'll put this as a ``constexpr`` in the
|
||||||
|
``defs.h`` file.
|
||||||
|
::
|
||||||
|
|
||||||
|
struct Token {
|
||||||
|
char *token;
|
||||||
|
uint8_t length;
|
||||||
|
};
|
||||||
|
|
||||||
|
Next up is to define the function from before for matching tokens.
|
||||||
|
::
|
||||||
|
|
||||||
|
bool match_token(const char *, const size_t, const char *, const size_t);
|
||||||
|
|
||||||
|
The meat of the parser is ``parse_next``, for which we'll also need some return codes.
|
||||||
|
::
|
||||||
|
|
||||||
|
typedef enum _PARSE_RESULT_ : uint8_t {
|
||||||
|
PARSE_OK = 0, // token now has a valid token.
|
||||||
|
PARSE_EOB = 1, // end of buffer, parsing a line should stop.
|
||||||
|
PARSE_LEN = 2, // token is too long
|
||||||
|
PARSE_FAIL = 3 // catch-all error
|
||||||
|
} PARSE_RESULT;
|
||||||
|
|
||||||
|
int parse_next(const char *, const size_t, size_t *, struct Token *);
|
||||||
|
|
||||||
|
#endif // __KF_PARSER_H__
|
||||||
|
|
||||||
|
``parser.cc``
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
``parser.cc`` will open with a helper to reset tokens and the same
|
||||||
|
matching code I mentioned before::
|
||||||
|
|
||||||
|
#include "defs.h"
|
||||||
|
#include "parser.h"
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
static void
|
||||||
|
reset(struct Token *t)
|
||||||
|
{
|
||||||
|
t->token = nullptr;
|
||||||
|
t->length = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
match_token(const char *a, const size_t alen,
|
||||||
|
const char *b, const size_t blen)
|
||||||
|
{
|
||||||
|
if (alen != blen) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return memcmp(a, b, alen) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
At the start of the parser, I'm going to reset the token; if there's a failure,
|
||||||
|
there shouldn't be a valid token anyhow.
|
||||||
|
::
|
||||||
|
|
||||||
|
PARSE_RESULT
|
||||||
|
parse_next(const char *buf, const size_t length, size_t *offset,
|
||||||
|
struct Token *token)
|
||||||
|
{
|
||||||
|
size_t cursor = *offset;
|
||||||
|
|
||||||
|
// Clear the token.
|
||||||
|
reset(token);
|
||||||
|
|
||||||
|
If the offset is already at the end of the buffer, there's no more work to do
|
||||||
|
on this buffer, so I'll cut out early with ``PARSE_EOB``. If I was doing a more
|
||||||
|
careful job of programming this, I'd *generally* try to avoid multiple returns,
|
||||||
|
but in this case, having working code is more important than awesome code.
|
||||||
|
::
|
||||||
|
|
||||||
|
if (cursor == length) {
|
||||||
|
return PARSE_EOB;
|
||||||
|
}
|
||||||
|
|
||||||
|
I'm going to assume that tokens are separated by spaces or tabs. I wasn't going
|
||||||
|
to support tabs at first, but it's easy enough to do that I just included it.
|
||||||
|
::
|
||||||
|
|
||||||
|
while (cursor <= length) {
|
||||||
|
if (buf[cursor] != ' ') {
|
||||||
|
if (buf[cursor] != '\t') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
|
||||||
|
This part might seem superfluous, but it's important in case there's trailing
|
||||||
|
whitespace in the buffer. I haven't touched the token yet, so no need to reset
|
||||||
|
it.
|
||||||
|
::
|
||||||
|
|
||||||
|
if (cursor == length) {
|
||||||
|
return PARSE_EOB;
|
||||||
|
}
|
||||||
|
|
||||||
|
Now I can point the token to the buffer at the start of the next token and walk
|
||||||
|
through the buffer until the end of the buffer or the first whitespace
|
||||||
|
character::
|
||||||
|
|
||||||
|
token->token = (char *)buf + cursor;
|
||||||
|
while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) {
|
||||||
|
if (buf[cursor] != ' ') {
|
||||||
|
if (buf[cursor] != '\t') {
|
||||||
|
cursor++;
|
||||||
|
token->length++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
This got me at first and took me a few minutes to figure out. If the cursor
|
||||||
|
isn't updated at the end, the next run of the parser is going to be stuck on
|
||||||
|
this word as the cursor doesn't point to whitespace anymore.
|
||||||
|
::
|
||||||
|
|
||||||
|
cursor++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
Finally, if the token length hasn't been exceeded, the offset can be updated
|
||||||
|
and the token returned::
|
||||||
|
|
||||||
|
if (token->length > MAX_TOKEN_LENGTH) {
|
||||||
|
reset(token);
|
||||||
|
return PARSE_LEN;
|
||||||
|
}
|
||||||
|
|
||||||
|
*offset = cursor;
|
||||||
|
return PARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
``kforth.cc``
|
||||||
|
^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
That's all of ``parser.cc`` (at least for now), but this needs to be integrated
|
||||||
|
into the frontend. ``kforth.cc`` now starts off with::
|
||||||
|
|
||||||
|
#include "io.h"
|
||||||
|
#include "parser.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#ifdef __linux__
|
||||||
|
#include "linux.h"
|
||||||
|
#endif // __linux__
|
||||||
|
|
||||||
|
static char ok[] = "ok.\n";
|
||||||
|
static char bye[] = "bye";
|
||||||
|
|
||||||
|
static bool
|
||||||
|
parser(IO &interface, const char *buf, const size_t buflen)
|
||||||
|
{
|
||||||
|
static size_t offset = 0;
|
||||||
|
static struct Token token;
|
||||||
|
static PARSE_RESULT result = PARSE_FAIL;
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
|
||||||
|
// reset token
|
||||||
|
token.token = nullptr;
|
||||||
|
token.length = 0;
|
||||||
|
|
||||||
|
while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) {
|
||||||
|
interface.wrbuf((char *)"token: ", 7);
|
||||||
|
interface.wrbuf(token.token, token.length);
|
||||||
|
interface.wrln((char *)".", 1);
|
||||||
|
|
||||||
|
There's no command parser right now, so I've added in this hack so it starts to
|
||||||
|
feel a little like a Forth.
|
||||||
|
::
|
||||||
|
|
||||||
|
if (match_token(token.token, token.length, bye, 3)) {
|
||||||
|
interface.wrln((char *)"Goodbye!", 8);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (result) {
|
||||||
|
case PARSE_EOB:
|
||||||
|
interface.wrbuf(ok, 4);
|
||||||
|
return true;
|
||||||
|
case PARSE_LEN:
|
||||||
|
interface.wrln((char *)"parse error: token too long", 27);
|
||||||
|
return false;
|
||||||
|
case PARSE_FAIL:
|
||||||
|
interface.wrln((char *)"parser failure", 14);
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
interface.wrln((char *)"*** the world is broken ***", 27);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
interpreter(IO &interface)
|
||||||
|
{
|
||||||
|
static size_t buflen = 0;
|
||||||
|
static char linebuf[81];
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
interface.wrch('?');
|
||||||
|
interface.wrch(' ');
|
||||||
|
buflen = interface.rdbuf(linebuf, 80, true, '\n');
|
||||||
|
|
||||||
|
The return value is being ignored right now, but later on it might be useful.
|
||||||
|
::
|
||||||
|
|
||||||
|
parser(interface, linebuf, buflen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
But does it work?
|
||||||
|
::
|
||||||
|
|
||||||
|
~/code/kforth (0) $ make
|
||||||
|
g++ -std=c++14 -Wall -Werror -g -O0 -c -o linux/io.o linux/io.cc
|
||||||
|
g++ -std=c++14 -Wall -Werror -g -O0 -c -o parser.o parser.cc
|
||||||
|
g++ -std=c++14 -Wall -Werror -g -O0 -c -o kforth.o kforth.cc
|
||||||
|
g++ -o kforth linux/io.o parser.o kforth.o
|
||||||
|
~/code/kforth (0) $ ./kforth
|
||||||
|
kforth interpreter
|
||||||
|
? 2 3 4 + * 1 SWAP
|
||||||
|
token: 2.
|
||||||
|
token: 3.
|
||||||
|
token: 4.
|
||||||
|
token: +.
|
||||||
|
token: *.
|
||||||
|
token: 1.
|
||||||
|
token: SWAP.
|
||||||
|
ok.
|
||||||
|
? thistokenistoolong!
|
||||||
|
parse error: token too long
|
||||||
|
bye
|
||||||
|
token: bye.
|
||||||
|
Goodbye!
|
||||||
|
~/code/kforth (0) $
|
||||||
|
|
||||||
|
Heyo! Now I'm getting somewhere. The next logical step (to me) is to add in a
|
||||||
|
command parser and a standard vocabulary.
|
||||||
|
|
||||||
|
The snapshot of the code from here is in the tag part-0x03_.
|
||||||
|
|
||||||
|
.. _part-0x03: https://github.com/kisom/kforth/tree/part-0x03
|
49
kforth.cc
49
kforth.cc
|
@ -1,11 +1,55 @@
|
||||||
#include "io.h"
|
#include "io.h"
|
||||||
#include "parser.h"
|
#include "parser.h"
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
#include "linux.h"
|
#include "linux.h"
|
||||||
#endif // __linux__
|
#endif // __linux__
|
||||||
|
|
||||||
static char ok[] = "ok.\n";
|
static char ok[] = "ok.\n";
|
||||||
|
static char bye[] = "bye";
|
||||||
|
|
||||||
|
static bool
|
||||||
|
parser(IO &interface, const char *buf, const size_t buflen)
|
||||||
|
{
|
||||||
|
static size_t offset = 0;
|
||||||
|
static struct Token token;
|
||||||
|
static PARSE_RESULT result = PARSE_FAIL;
|
||||||
|
|
||||||
|
offset = 0;
|
||||||
|
|
||||||
|
// reset token
|
||||||
|
token.token = nullptr;
|
||||||
|
token.length = 0;
|
||||||
|
|
||||||
|
while ((result = parse_next(buf, buflen, &offset, &token)) == PARSE_OK) {
|
||||||
|
interface.wrbuf((char *)"token: ", 7);
|
||||||
|
interface.wrbuf(token.token, token.length);
|
||||||
|
interface.wrln((char *)".", 1);
|
||||||
|
|
||||||
|
// Temporary hack until the interpreter is working further.
|
||||||
|
if (match_token(token.token, token.length, bye, 3)) {
|
||||||
|
interface.wrln((char *)"Goodbye!", 8);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (result) {
|
||||||
|
case PARSE_EOB:
|
||||||
|
interface.wrbuf(ok, 4);
|
||||||
|
return true;
|
||||||
|
case PARSE_LEN:
|
||||||
|
interface.wrln((char *)"parse error: token too long", 27);
|
||||||
|
return false;
|
||||||
|
case PARSE_FAIL:
|
||||||
|
interface.wrln((char *)"parser failure", 14);
|
||||||
|
return false;
|
||||||
|
default:
|
||||||
|
interface.wrln((char *)"*** the world is broken ***", 27);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
interpreter(IO &interface)
|
interpreter(IO &interface)
|
||||||
|
@ -14,9 +58,10 @@ interpreter(IO &interface)
|
||||||
static char linebuf[81];
|
static char linebuf[81];
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
interface.wrch('?');
|
||||||
|
interface.wrch(' ');
|
||||||
buflen = interface.rdbuf(linebuf, 80, true, '\n');
|
buflen = interface.rdbuf(linebuf, 80, true, '\n');
|
||||||
interface.wrln(linebuf, buflen);
|
parser(interface, linebuf, buflen);
|
||||||
interface.wrbuf(ok, 4);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
70
parser.cc
70
parser.cc
|
@ -1,17 +1,71 @@
|
||||||
#include "defs.h"
|
#include "defs.h"
|
||||||
#include "parser.h"
|
#include "parser.h"
|
||||||
|
|
||||||
int
|
#include <string.h>
|
||||||
|
|
||||||
|
static void
|
||||||
|
reset(struct Token *t)
|
||||||
|
{
|
||||||
|
t->token = nullptr;
|
||||||
|
t->length = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
match_token(const char *a, const size_t alen,
|
||||||
|
const char *b, const size_t blen)
|
||||||
|
{
|
||||||
|
if (alen != blen) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return memcmp(a, b, alen) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
PARSE_RESULT
|
||||||
parse_next(const char *buf, const size_t length, size_t *offset,
|
parse_next(const char *buf, const size_t length, size_t *offset,
|
||||||
struct Token *token)
|
struct Token *token)
|
||||||
{
|
{
|
||||||
size_t start = *offset;
|
size_t cursor = *offset;
|
||||||
bool ok = false;
|
|
||||||
|
|
||||||
// TODO(skip past whitespace)
|
// Clear the token.
|
||||||
// TODO(find next EOC)
|
reset(token);
|
||||||
if (!ok) {
|
|
||||||
*offset = start;
|
if (cursor == length) {
|
||||||
|
return PARSE_EOB;
|
||||||
}
|
}
|
||||||
return -1;
|
|
||||||
|
while (cursor <= length) {
|
||||||
|
if (buf[cursor] != ' ') {
|
||||||
|
if (buf[cursor] != '\t') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cursor == length) {
|
||||||
|
return PARSE_EOB;
|
||||||
|
}
|
||||||
|
|
||||||
|
token->token = (char *)buf + cursor;
|
||||||
|
while ((token->length <= MAX_TOKEN_LENGTH) && (cursor < length)) {
|
||||||
|
if (buf[cursor] != ' ') {
|
||||||
|
if (buf[cursor] != '\t') {
|
||||||
|
cursor++;
|
||||||
|
token->length++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cursor++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token->length > MAX_TOKEN_LENGTH) {
|
||||||
|
reset(token);
|
||||||
|
return PARSE_LEN;
|
||||||
|
}
|
||||||
|
|
||||||
|
*offset = cursor;
|
||||||
|
return PARSE_OK;
|
||||||
}
|
}
|
10
parser.h
10
parser.h
|
@ -8,7 +8,15 @@ struct Token {
|
||||||
uint8_t length;
|
uint8_t length;
|
||||||
};
|
};
|
||||||
|
|
||||||
int parse_next(const char *, const size_t, size_t *, struct Token *);
|
typedef enum _PARSE_RESULT_ : uint8_t {
|
||||||
|
PARSE_OK = 0,
|
||||||
|
PARSE_EOB = 1, // end of buffer
|
||||||
|
PARSE_LEN = 2, // token is too long
|
||||||
|
PARSE_FAIL = 3 // catch-all error
|
||||||
|
} PARSE_RESULT;
|
||||||
|
|
||||||
|
bool match_token(const char *, const size_t, const char *, const size_t);
|
||||||
|
PARSE_RESULT parse_next(const char *, const size_t, size_t *, struct Token *);
|
||||||
|
|
||||||
|
|
||||||
#endif // __KF_PARSER_H__
|
#endif // __KF_PARSER_H__
|
||||||
|
|
Loading…
Reference in New Issue