UTF-8 support.
This commit is contained in:
199
main.c
199
main.c
@@ -19,6 +19,8 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <locale.h>
|
||||||
|
#include <wchar.h>
|
||||||
#include <termios.h>
|
#include <termios.h>
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
@@ -312,16 +314,55 @@ nibble_to_hex(char c)
|
|||||||
int
|
int
|
||||||
erow_render_to_cursor(struct erow *row, int cx)
|
erow_render_to_cursor(struct erow *row, int cx)
|
||||||
{
|
{
|
||||||
int rx = 0;
|
int rx = 0;
|
||||||
int j;
|
size_t j = 0;
|
||||||
|
|
||||||
for (j = 0; j < cx; j++) {
|
wchar_t wc;
|
||||||
if (row->line[j] == '\t') {
|
mbstate_t st;
|
||||||
rx += (TAB_STOP-1) - (rx%TAB_STOP);
|
|
||||||
} else if (row->line[j] < 0x20) {
|
memset(&st, 0, sizeof(st));
|
||||||
rx += 2;
|
|
||||||
|
while (j < (size_t)cx && j < (size_t)row->size) {
|
||||||
|
unsigned char b = (unsigned char)row->line[j];
|
||||||
|
if (b == '\t') {
|
||||||
|
rx += (TAB_STOP - 1) - (rx % TAB_STOP);
|
||||||
|
rx++;
|
||||||
|
j++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (b < 0x20) {
|
||||||
|
/* render as \xx -> width 3 */
|
||||||
|
rx += 3;
|
||||||
|
j++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t rem = (size_t)row->size - j;
|
||||||
|
size_t n = mbrtowc(&wc, &row->line[j], rem, &st);
|
||||||
|
|
||||||
|
if (n == (size_t)-2) {
|
||||||
|
/* incomplete sequence at end; treat one byte */
|
||||||
|
rx += 1;
|
||||||
|
j += 1;
|
||||||
|
memset(&st, 0, sizeof(st));
|
||||||
|
}
|
||||||
|
else if (n == (size_t)-1) {
|
||||||
|
/* invalid byte; consume one and reset state */
|
||||||
|
rx += 1;
|
||||||
|
j += 1;
|
||||||
|
memset(&st, 0, sizeof(st));
|
||||||
|
}
|
||||||
|
else if (n == 0) {
|
||||||
|
/* null character */
|
||||||
|
rx += 0;
|
||||||
|
j += 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int w = wcwidth(wc);
|
||||||
|
if (w < 0) w = 1; /* non-printable -> treat as width 1 */
|
||||||
|
rx += w;
|
||||||
|
j += n;
|
||||||
}
|
}
|
||||||
rx++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return rx;
|
return rx;
|
||||||
@@ -331,23 +372,60 @@ erow_render_to_cursor(struct erow *row, int cx)
|
|||||||
int
|
int
|
||||||
erow_cursor_to_render(struct erow *row, int rx)
|
erow_cursor_to_render(struct erow *row, int rx)
|
||||||
{
|
{
|
||||||
int cur_rx = 0;
|
int cur_rx = 0;
|
||||||
int curx = 0;
|
size_t j = 0;
|
||||||
|
|
||||||
for (curx = 0; curx < row->size; curx++) {
|
wchar_t wc;
|
||||||
if (row->line[curx] == '\t') {
|
mbstate_t st;
|
||||||
cur_rx += (TAB_STOP - 1) - (cur_rx % TAB_STOP);
|
|
||||||
} else if (row->line[curx] < 0x20) {
|
memset(&st, 0, sizeof(st));
|
||||||
cur_rx += 2;
|
|
||||||
|
while (j < (size_t)row->size) {
|
||||||
|
int w = 0;
|
||||||
|
size_t adv = 1;
|
||||||
|
|
||||||
|
unsigned char b = (unsigned char)row->line[j];
|
||||||
|
if (b == '\t') {
|
||||||
|
int add = (TAB_STOP - 1) - (cur_rx % TAB_STOP);
|
||||||
|
w = add + 1;
|
||||||
|
adv = 1;
|
||||||
|
/* tabs are single byte */
|
||||||
}
|
}
|
||||||
cur_rx++;
|
else if (b < 0x20) {
|
||||||
|
w = 3; /* "\\xx" */
|
||||||
|
adv = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
size_t rem = (size_t)row->size - j;
|
||||||
|
size_t n = mbrtowc(&wc, &row->line[j], rem, &st);
|
||||||
|
|
||||||
if (cur_rx > rx) {
|
if (n == (size_t)-2 || n == (size_t)-1) {
|
||||||
|
/* invalid/incomplete */
|
||||||
|
w = 1;
|
||||||
|
adv = 1;
|
||||||
|
memset(&st, 0, sizeof(st));
|
||||||
|
}
|
||||||
|
else if (n == 0) {
|
||||||
|
w = 0;
|
||||||
|
adv = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int ww = wcwidth(wc);
|
||||||
|
if (ww < 0) ww = 1;
|
||||||
|
w = ww;
|
||||||
|
adv = n;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cur_rx + w > rx) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cur_rx += w;
|
||||||
|
j += adv;
|
||||||
}
|
}
|
||||||
|
|
||||||
return curx;
|
return (int)j;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -364,7 +442,9 @@ erow_update(struct erow *row)
|
|||||||
for (j = 0; j < row->size; j++) {
|
for (j = 0; j < row->size; j++) {
|
||||||
if (row->line[j] == '\t') {
|
if (row->line[j] == '\t') {
|
||||||
tabs++;
|
tabs++;
|
||||||
} else if (!isprint(row->line[j])) {
|
}
|
||||||
|
else if ((unsigned char)row->line[j] < 0x20) {
|
||||||
|
/* treat only ASCII control characters as non-printable */
|
||||||
ctrl++;
|
ctrl++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -374,19 +454,23 @@ erow_update(struct erow *row)
|
|||||||
row->rsize = 0;
|
row->rsize = 0;
|
||||||
}
|
}
|
||||||
row->render = NULL;
|
row->render = NULL;
|
||||||
row->render = malloc(row->size + (tabs * (TAB_STOP-1)) + (ctrl * 3) + 1);
|
row->render = malloc(row->size + (tabs * (TAB_STOP - 1)) + (ctrl * 3) + 1);
|
||||||
assert(row->render != NULL);
|
assert(row->render != NULL);
|
||||||
|
|
||||||
for (j = 0; j < row->size; j++) {
|
for (j = 0; j < row->size; j++) {
|
||||||
if (row->line[j] == '\t') {
|
if (row->line[j] == '\t') {
|
||||||
do {
|
do {
|
||||||
row->render[i++] = ' ';
|
row->render[i++] = ' ';
|
||||||
} while ((i % TAB_STOP) != 0);
|
}
|
||||||
} else if (!isprint(row->line[j])) {
|
while ((i % TAB_STOP) != 0);
|
||||||
|
}
|
||||||
|
else if ((unsigned char)row->line[j] < 0x20) {
|
||||||
row->render[i++] = '\\';
|
row->render[i++] = '\\';
|
||||||
row->render[i++] = nibble_to_hex(row->line[j] >> 4);
|
row->render[i++] = nibble_to_hex(row->line[j] >> 4);
|
||||||
row->render[i++] = nibble_to_hex(row->line[j] & 0x0f);
|
row->render[i++] = nibble_to_hex(row->line[j] & 0x0f);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
|
/* leave UTF-8 multibyte bytes untouched so terminal can render */
|
||||||
row->render[i++] = row->line[j];
|
row->render[i++] = row->line[j];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -703,7 +787,9 @@ insertch(int16_t c)
|
|||||||
erow_insert(editor.nrows, "", 0);
|
erow_insert(editor.nrows, "", 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
row_insert_ch(&editor.row[editor.cury], editor.curx, (char)(c & 0xff));
|
/* Ensure we pass a non-negative byte value to avoid assert(c > 0). */
|
||||||
|
row_insert_ch(&editor.row[editor.cury], editor.curx,
|
||||||
|
(int16_t)(c & 0xff));
|
||||||
editor.curx++;
|
editor.curx++;
|
||||||
editor.dirty++;
|
editor.dirty++;
|
||||||
}
|
}
|
||||||
@@ -903,13 +989,17 @@ is_arrow_key(int16_t c)
|
|||||||
int16_t
|
int16_t
|
||||||
get_keypress(void)
|
get_keypress(void)
|
||||||
{
|
{
|
||||||
char seq[3];
|
char seq[3];
|
||||||
char c = -1;
|
/* read raw byte so UTF-8 bytes (>=0x80) are not sign-extended */
|
||||||
|
unsigned char uc = 0;
|
||||||
|
int16_t c;
|
||||||
|
|
||||||
if (read(STDIN_FILENO, &c, 1) == -1) {
|
if (read(STDIN_FILENO, &uc, 1) == -1) {
|
||||||
die("get_keypress:read");
|
die("get_keypress:read");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c = (int16_t)uc;
|
||||||
|
|
||||||
if (c == 0x1b) {
|
if (c == 0x1b) {
|
||||||
if (read(STDIN_FILENO, &seq[0], 1) != 1) return c;
|
if (read(STDIN_FILENO, &seq[0], 1) != 1) return c;
|
||||||
if (read(STDIN_FILENO, &seq[1], 1) != 1) return c;
|
if (read(STDIN_FILENO, &seq[1], 1) != 1) return c;
|
||||||
@@ -917,7 +1007,7 @@ get_keypress(void)
|
|||||||
if (seq[0] == '[') {
|
if (seq[0] == '[') {
|
||||||
if (seq[1] < 'A') {
|
if (seq[1] < 'A') {
|
||||||
if (read(STDIN_FILENO, &seq[2], 1) != 1)
|
if (read(STDIN_FILENO, &seq[2], 1) != 1)
|
||||||
return c;
|
return c;
|
||||||
if (seq[2] == '~') {
|
if (seq[2] == '~') {
|
||||||
switch (seq[1]) {
|
switch (seq[1]) {
|
||||||
case '1': return HOME_KEY;
|
case '1': return HOME_KEY;
|
||||||
@@ -929,7 +1019,8 @@ get_keypress(void)
|
|||||||
case '8': return END_KEY;
|
case '8': return END_KEY;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
switch (seq[1]) {
|
switch (seq[1]) {
|
||||||
case 'A': return ARROW_UP;
|
case 'A': return ARROW_UP;
|
||||||
case 'B': return ARROW_DOWN;
|
case 'B': return ARROW_DOWN;
|
||||||
@@ -942,7 +1033,8 @@ get_keypress(void)
|
|||||||
/* nada */ ;
|
/* nada */ ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (seq[0] == 'O') {
|
}
|
||||||
|
else if (seq[0] == 'O') {
|
||||||
switch (seq[1]) {
|
switch (seq[1]) {
|
||||||
case 'F': return END_KEY;
|
case 'F': return END_KEY;
|
||||||
case 'H': return HOME_KEY;
|
case 'H': return HOME_KEY;
|
||||||
@@ -989,14 +1081,14 @@ editor_prompt(char *prompt, void (*cb)(char *, int16_t))
|
|||||||
}
|
}
|
||||||
return buf;
|
return buf;
|
||||||
}
|
}
|
||||||
} else if (!iscntrl(c) && c < 128) {
|
} else if ((c == TAB_KEY) || (c >= 0x20 && c != 0x7f)) {
|
||||||
if (buflen == bufsz - 1) {
|
if (buflen == bufsz - 1) {
|
||||||
bufsz *= 2;
|
bufsz *= 2;
|
||||||
buf = realloc(buf, bufsz);
|
buf = realloc(buf, bufsz);
|
||||||
assert(buf != NULL);
|
assert(buf != NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
buf[buflen++] = c;
|
buf[buflen++] = (char)(c & 0xff);
|
||||||
buf[buflen] = '\0';
|
buf[buflen] = '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1108,8 +1200,8 @@ editor_openfile(void)
|
|||||||
void
|
void
|
||||||
move_cursor(int16_t c)
|
move_cursor(int16_t c)
|
||||||
{
|
{
|
||||||
struct erow *row;
|
struct erow* row;
|
||||||
int reps;
|
int reps;
|
||||||
|
|
||||||
row = (editor.cury >= editor.nrows) ? NULL : &editor.row[editor.cury];
|
row = (editor.cury >= editor.nrows) ? NULL : &editor.row[editor.cury];
|
||||||
|
|
||||||
@@ -1130,7 +1222,13 @@ move_cursor(int16_t c)
|
|||||||
case CTRL_KEY('f'):
|
case CTRL_KEY('f'):
|
||||||
if (row && editor.curx < row->size) {
|
if (row && editor.curx < row->size) {
|
||||||
editor.curx++;
|
editor.curx++;
|
||||||
} else if (row && editor.curx == row->size) {
|
/* skip over UTF-8 continuation bytes */
|
||||||
|
while (row && editor.curx < row->size &&
|
||||||
|
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
|
||||||
|
editor.curx++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (row && editor.curx == row->size) {
|
||||||
editor.cury++;
|
editor.cury++;
|
||||||
editor.curx = 0;
|
editor.curx = 0;
|
||||||
}
|
}
|
||||||
@@ -1139,17 +1237,30 @@ move_cursor(int16_t c)
|
|||||||
case CTRL_KEY('b'):
|
case CTRL_KEY('b'):
|
||||||
if (editor.curx > 0) {
|
if (editor.curx > 0) {
|
||||||
editor.curx--;
|
editor.curx--;
|
||||||
} else if (editor.cury > 0) {
|
/* move to the start byte if we landed on a continuation */
|
||||||
|
while (editor.curx > 0 &&
|
||||||
|
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
|
||||||
|
editor.curx--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (editor.cury > 0) {
|
||||||
editor.cury--;
|
editor.cury--;
|
||||||
editor.curx = editor.row[editor.cury].size;
|
editor.curx = editor.row[editor.cury].size;
|
||||||
|
/* ensure at a codepoint boundary at end of previous line */
|
||||||
|
row = &editor.row[editor.cury];
|
||||||
|
while (editor.curx > 0 &&
|
||||||
|
((unsigned char)row->line[editor.curx] & 0xC0) == 0x80) {
|
||||||
|
editor.curx--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case PG_UP:
|
case PG_UP:
|
||||||
case PG_DN: {
|
case PG_DN:
|
||||||
if (c == PG_UP) {
|
if (c == PG_UP) {
|
||||||
editor.cury = editor.rowoffs;
|
editor.cury = editor.rowoffs;
|
||||||
} else if (c == PG_DN) {
|
}
|
||||||
editor.cury = editor.rowoffs + editor.rows-1;
|
else if (c == PG_DN) {
|
||||||
|
editor.cury = editor.rowoffs + editor.rows - 1;
|
||||||
if (editor.cury > editor.nrows) {
|
if (editor.cury > editor.nrows) {
|
||||||
editor.cury = editor.nrows;
|
editor.cury = editor.nrows;
|
||||||
}
|
}
|
||||||
@@ -1161,7 +1272,6 @@ move_cursor(int16_t c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
|
||||||
|
|
||||||
case HOME_KEY:
|
case HOME_KEY:
|
||||||
case CTRL_KEY('a'):
|
case CTRL_KEY('a'):
|
||||||
@@ -1179,8 +1289,7 @@ move_cursor(int16_t c)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
row = (editor.cury >= editor.nrows) ?
|
row = (editor.cury >= editor.nrows) ? NULL : &editor.row[editor.cury];
|
||||||
NULL : &editor.row[editor.cury];
|
|
||||||
reps = row ? row->size : 0;
|
reps = row ? row->size : 0;
|
||||||
if (editor.curx > reps) {
|
if (editor.curx > reps) {
|
||||||
editor.curx = reps;
|
editor.curx = reps;
|
||||||
@@ -1312,7 +1421,8 @@ process_normal(int16_t c)
|
|||||||
editor.mode = MODE_ESCAPE;
|
editor.mode = MODE_ESCAPE;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (isprint(c) || c == TAB_KEY) {
|
/* Insert any printable byte: ASCII 0x20–0x7E and all bytes >=0x80. */
|
||||||
|
if ((c == TAB_KEY) || (c >= 0x20 && c != 0x7f)) {
|
||||||
insertch(c);
|
insertch(c);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -1662,6 +1772,9 @@ loop(void)
|
|||||||
int
|
int
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
|
// Set locale for proper UTF-8 handling
|
||||||
|
setlocale(LC_ALL, "");
|
||||||
|
|
||||||
setup_terminal();
|
setup_terminal();
|
||||||
init_editor();
|
init_editor();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user