terminal: Make utf-8 state machine assemble unicode code point value
This commit is contained in:
@@ -106,6 +106,7 @@ struct utf8_state_machine {
|
|||||||
enum utf8_state state;
|
enum utf8_state state;
|
||||||
int len;
|
int len;
|
||||||
union utf8_char s;
|
union utf8_char s;
|
||||||
|
uint32_t unicode;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -132,6 +133,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
/* single byte, accept */
|
/* single byte, accept */
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
machine->state = utf8state_accept;
|
machine->state = utf8state_accept;
|
||||||
|
machine->unicode = c;
|
||||||
} else if((c & 0xC0) == 0x80) {
|
} else if((c & 0xC0) == 0x80) {
|
||||||
/* parser out of sync, ignore byte */
|
/* parser out of sync, ignore byte */
|
||||||
machine->state = utf8state_start;
|
machine->state = utf8state_start;
|
||||||
@@ -139,14 +141,17 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
/* start of two byte sequence */
|
/* start of two byte sequence */
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
machine->state = utf8state_expect1;
|
machine->state = utf8state_expect1;
|
||||||
|
machine->unicode = c & 0x1f;
|
||||||
} else if((c & 0xF0) == 0xE0) {
|
} else if((c & 0xF0) == 0xE0) {
|
||||||
/* start of three byte sequence */
|
/* start of three byte sequence */
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
machine->state = utf8state_expect2;
|
machine->state = utf8state_expect2;
|
||||||
|
machine->unicode = c & 0x0f;
|
||||||
} else if((c & 0xF8) == 0xF0) {
|
} else if((c & 0xF8) == 0xF0) {
|
||||||
/* start of four byte sequence */
|
/* start of four byte sequence */
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
machine->state = utf8state_expect3;
|
machine->state = utf8state_expect3;
|
||||||
|
machine->unicode = c & 0x07;
|
||||||
} else {
|
} else {
|
||||||
/* overlong encoding, reject */
|
/* overlong encoding, reject */
|
||||||
machine->state = utf8state_reject;
|
machine->state = utf8state_reject;
|
||||||
@@ -154,6 +159,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
break;
|
break;
|
||||||
case utf8state_expect3:
|
case utf8state_expect3:
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
|
machine->unicode = (machine->unicode << 6) | (c & 0x3f);
|
||||||
if((c & 0xC0) == 0x80) {
|
if((c & 0xC0) == 0x80) {
|
||||||
/* all good, continue */
|
/* all good, continue */
|
||||||
machine->state = utf8state_expect2;
|
machine->state = utf8state_expect2;
|
||||||
@@ -164,6 +170,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
break;
|
break;
|
||||||
case utf8state_expect2:
|
case utf8state_expect2:
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
|
machine->unicode = (machine->unicode << 6) | (c & 0x3f);
|
||||||
if((c & 0xC0) == 0x80) {
|
if((c & 0xC0) == 0x80) {
|
||||||
/* all good, continue */
|
/* all good, continue */
|
||||||
machine->state = utf8state_expect1;
|
machine->state = utf8state_expect1;
|
||||||
@@ -174,6 +181,7 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
break;
|
break;
|
||||||
case utf8state_expect1:
|
case utf8state_expect1:
|
||||||
machine->s.byte[machine->len++] = c;
|
machine->s.byte[machine->len++] = c;
|
||||||
|
machine->unicode = (machine->unicode << 6) | (c & 0x3f);
|
||||||
if((c & 0xC0) == 0x80) {
|
if((c & 0xC0) == 0x80) {
|
||||||
/* all good, accept */
|
/* all good, accept */
|
||||||
machine->state = utf8state_accept;
|
machine->state = utf8state_accept;
|
||||||
@@ -190,6 +198,26 @@ utf8_next_char(struct utf8_state_machine *machine, unsigned char c)
|
|||||||
return machine->state;
|
return machine->state;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint32_t
|
||||||
|
get_unicode(union utf8_char utf8)
|
||||||
|
{
|
||||||
|
struct utf8_state_machine machine;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
init_state_machine(&machine);
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
utf8_next_char(&machine, utf8.byte[i]);
|
||||||
|
if (machine.state == utf8state_accept ||
|
||||||
|
machine.state == utf8state_reject)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (machine.state == utf8state_reject)
|
||||||
|
return 0xfffd;
|
||||||
|
|
||||||
|
return machine.unicode;
|
||||||
|
}
|
||||||
|
|
||||||
struct char_sub {
|
struct char_sub {
|
||||||
union utf8_char match;
|
union utf8_char match;
|
||||||
union utf8_char replace;
|
union utf8_char replace;
|
||||||
|
|||||||
Reference in New Issue
Block a user