UTF-8 support.

This commit is contained in:
2025-11-24 13:11:24 -08:00
parent e345b55595
commit 3782880062

181
main.c
View File

@@ -19,6 +19,8 @@
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <locale.h>
#include <wchar.h>
#include <termios.h> #include <termios.h>
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
@@ -313,15 +315,54 @@ int
erow_render_to_cursor(struct erow *row, int cx) erow_render_to_cursor(struct erow *row, int cx)
{ {
int rx = 0; int rx = 0;
int j; size_t j = 0;
for (j = 0; j < cx; j++) { wchar_t wc;
if (row->line[j] == '\t') { mbstate_t st;
memset(&st, 0, sizeof(st));
while (j < (size_t)cx && j < (size_t)row->size) {
unsigned char b = (unsigned char)row->line[j];
if (b == '\t') {
rx += (TAB_STOP - 1) - (rx % TAB_STOP); rx += (TAB_STOP - 1) - (rx % TAB_STOP);
} else if (row->line[j] < 0x20) {
rx += 2;
}
rx++; rx++;
j++;
continue;
}
if (b < 0x20) {
/* render as \xx -> width 3 */
rx += 3;
j++;
continue;
}
size_t rem = (size_t)row->size - j;
size_t n = mbrtowc(&wc, &row->line[j], rem, &st);
if (n == (size_t)-2) {
/* incomplete sequence at end of row; consume one byte and reset state */
rx += 1;
j += 1;
memset(&st, 0, sizeof(st));
}
else if (n == (size_t)-1) {
/* invalid byte; consume one and reset state */
rx += 1;
j += 1;
memset(&st, 0, sizeof(st));
}
else if (n == 0) {
/* null character */
rx += 0;
j += 1;
}
else {
int w = wcwidth(wc);
if (w < 0) w = 1; /* non-printable -> treat as width 1 */
rx += w;
j += n;
}
} }
return rx; return rx;
@@ -332,22 +373,59 @@ int
erow_cursor_to_render(struct erow *row, int rx) erow_cursor_to_render(struct erow *row, int rx)
{ {
int cur_rx = 0; int cur_rx = 0;
int curx = 0; size_t j = 0;
for (curx = 0; curx < row->size; curx++) { wchar_t wc;
if (row->line[curx] == '\t') { mbstate_t st;
cur_rx += (TAB_STOP - 1) - (cur_rx % TAB_STOP);
} else if (row->line[curx] < 0x20) { memset(&st, 0, sizeof(st));
cur_rx += 2;
while (j < (size_t)row->size) {
int w = 0;
size_t adv = 1;
unsigned char b = (unsigned char)row->line[j];
if (b == '\t') {
int add = (TAB_STOP - 1) - (cur_rx % TAB_STOP);
w = add + 1;
adv = 1;
/* tabs are single byte */
} }
cur_rx++; else if (b < 0x20) {
w = 3; /* "\\xx" */
adv = 1;
}
else {
size_t rem = (size_t)row->size - j;
size_t n = mbrtowc(&wc, &row->line[j], rem, &st);
if (cur_rx > rx) { if (n == (size_t)-2 || n == (size_t)-1) {
/* invalid/incomplete */
w = 1;
adv = 1;
memset(&st, 0, sizeof(st));
}
else if (n == 0) {
w = 0;
adv = 1;
}
else {
int ww = wcwidth(wc);
if (ww < 0) ww = 1;
w = ww;
adv = n;
}
}
if (cur_rx + w > rx) {
break; break;
} }
cur_rx += w;
j += adv;
} }
return curx; return (int)j;
} }
@@ -364,7 +442,9 @@ erow_update(struct erow *row)
for (j = 0; j < row->size; j++) { for (j = 0; j < row->size; j++) {
if (row->line[j] == '\t') { if (row->line[j] == '\t') {
tabs++; tabs++;
} else if (!isprint(row->line[j])) { }
else if ((unsigned char)row->line[j] < 0x20) {
/* treat only ASCII control characters as non-printable */
ctrl++; ctrl++;
} }
} }
@@ -381,12 +461,16 @@ erow_update(struct erow *row)
if (row->line[j] == '\t') { if (row->line[j] == '\t') {
do { do {
row->render[i++] = ' '; row->render[i++] = ' ';
} while ((i % TAB_STOP) != 0); }
} else if (!isprint(row->line[j])) { while ((i % TAB_STOP) != 0);
}
else if ((unsigned char)row->line[j] < 0x20) {
row->render[i++] = '\\'; row->render[i++] = '\\';
row->render[i++] = nibble_to_hex(row->line[j] >> 4); row->render[i++] = nibble_to_hex(row->line[j] >> 4);
row->render[i++] = nibble_to_hex(row->line[j] & 0x0f); row->render[i++] = nibble_to_hex(row->line[j] & 0x0f);
} else { }
else {
/* leave UTF-8 multibyte bytes untouched so terminal can render */
row->render[i++] = row->line[j]; row->render[i++] = row->line[j];
} }
} }
@@ -703,7 +787,9 @@ insertch(int16_t c)
erow_insert(editor.nrows, "", 0); erow_insert(editor.nrows, "", 0);
} }
row_insert_ch(&editor.row[editor.cury], editor.curx, (char)(c & 0xff)); /* Ensure we pass a non-negative byte value to avoid assert(c > 0). */
row_insert_ch(&editor.row[editor.cury], editor.curx,
(int16_t)(c & 0xff));
editor.curx++; editor.curx++;
editor.dirty++; editor.dirty++;
} }
@@ -904,12 +990,16 @@ int16_t
get_keypress(void) get_keypress(void)
{ {
char seq[3]; char seq[3];
char c = -1; /* read raw byte so UTF-8 bytes (>=0x80) are not sign-extended */
unsigned char uc = 0;
int16_t c;
if (read(STDIN_FILENO, &c, 1) == -1) { if (read(STDIN_FILENO, &uc, 1) == -1) {
die("get_keypress:read"); die("get_keypress:read");
} }
c = (int16_t)uc;
if (c == 0x1b) { if (c == 0x1b) {
if (read(STDIN_FILENO, &seq[0], 1) != 1) return c; if (read(STDIN_FILENO, &seq[0], 1) != 1) return c;
if (read(STDIN_FILENO, &seq[1], 1) != 1) return c; if (read(STDIN_FILENO, &seq[1], 1) != 1) return c;
@@ -929,7 +1019,8 @@ get_keypress(void)
case '8': return END_KEY; case '8': return END_KEY;
} }
} }
} else { }
else {
switch (seq[1]) { switch (seq[1]) {
case 'A': return ARROW_UP; case 'A': return ARROW_UP;
case 'B': return ARROW_DOWN; case 'B': return ARROW_DOWN;
@@ -942,7 +1033,8 @@ get_keypress(void)
/* nada */ ; /* nada */ ;
} }
} }
} else if (seq[0] == 'O') { }
else if (seq[0] == 'O') {
switch (seq[1]) { switch (seq[1]) {
case 'F': return END_KEY; case 'F': return END_KEY;
case 'H': return HOME_KEY; case 'H': return HOME_KEY;
@@ -989,14 +1081,14 @@ editor_prompt(char *prompt, void (*cb)(char *, int16_t))
} }
return buf; return buf;
} }
} else if (!iscntrl(c) && c < 128) { } else if ((c == TAB_KEY) || (c >= 0x20 && c != 0x7f)) {
if (buflen == bufsz - 1) { if (buflen == bufsz - 1) {
bufsz *= 2; bufsz *= 2;
buf = realloc(buf, bufsz); buf = realloc(buf, bufsz);
assert(buf != NULL); assert(buf != NULL);
} }
buf[buflen++] = c; buf[buflen++] = (char)(c & 0xff);
buf[buflen] = '\0'; buf[buflen] = '\0';
} }
@@ -1130,7 +1222,13 @@ move_cursor(int16_t c)
case CTRL_KEY('f'): case CTRL_KEY('f'):
if (row && editor.curx < row->size) { if (row && editor.curx < row->size) {
editor.curx++; editor.curx++;
} else if (row && editor.curx == row->size) { /* skip over UTF-8 continuation bytes */
while (row && editor.curx < row->size &&
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
editor.curx++;
}
}
else if (row && editor.curx == row->size) {
editor.cury++; editor.cury++;
editor.curx = 0; editor.curx = 0;
} }
@@ -1139,16 +1237,29 @@ move_cursor(int16_t c)
case CTRL_KEY('b'): case CTRL_KEY('b'):
if (editor.curx > 0) { if (editor.curx > 0) {
editor.curx--; editor.curx--;
} else if (editor.cury > 0) { /* move to the start byte if we landed on a continuation */
while (editor.curx > 0 &&
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
editor.curx--;
}
}
else if (editor.cury > 0) {
editor.cury--; editor.cury--;
editor.curx = editor.row[editor.cury].size; editor.curx = editor.row[editor.cury].size;
/* ensure at a codepoint boundary at end of previous line */
row = &editor.row[editor.cury];
while (editor.curx > 0 &&
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
editor.curx--;
}
} }
break; break;
case PG_UP: case PG_UP:
case PG_DN: { case PG_DN:
if (c == PG_UP) { if (c == PG_UP) {
editor.cury = editor.rowoffs; editor.cury = editor.rowoffs;
} else if (c == PG_DN) { }
else if (c == PG_DN) {
editor.cury = editor.rowoffs + editor.rows - 1; editor.cury = editor.rowoffs + editor.rows - 1;
if (editor.cury > editor.nrows) { if (editor.cury > editor.nrows) {
editor.cury = editor.nrows; editor.cury = editor.nrows;
@@ -1161,7 +1272,6 @@ move_cursor(int16_t c)
} }
break; break;
}
case HOME_KEY: case HOME_KEY:
case CTRL_KEY('a'): case CTRL_KEY('a'):
@@ -1179,8 +1289,7 @@ move_cursor(int16_t c)
} }
row = (editor.cury >= editor.nrows) ? row = (editor.cury >= editor.nrows) ? NULL : &editor.row[editor.cury];
NULL : &editor.row[editor.cury];
reps = row ? row->size : 0; reps = row ? row->size : 0;
if (editor.curx > reps) { if (editor.curx > reps) {
editor.curx = reps; editor.curx = reps;
@@ -1312,7 +1421,8 @@ process_normal(int16_t c)
editor.mode = MODE_ESCAPE; editor.mode = MODE_ESCAPE;
break; break;
default: default:
if (isprint(c) || c == TAB_KEY) { /* Insert any printable byte: ASCII 0x20-0x7E and all bytes >=0x80. */
if ((c == TAB_KEY) || (c >= 0x20 && c != 0x7f)) {
insertch(c); insertch(c);
} }
break; break;
@@ -1662,6 +1772,9 @@ loop(void)
int int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
// Set locale for proper UTF-8 handling
setlocale(LC_ALL, "");
setup_terminal(); setup_terminal();
init_editor(); init_editor();