WIP: improve colr emoji detection and testing#620
Conversation
a236fbc to
288fa99
Compare
288fa99 to
d7b53ec
Compare
|
I tried using Ragel/Colm to generate Rust code, but the generated code cannot be used directly. The quality is much worse than C. It has not been updated for a long time. |
|
The hand-written C port (https://github.com/chansen/c-emoji) referenced in google/emoji-segmenter#17 may also be worth looking at. |
Can you elaborate please? We use the Ragel Rust output in HarfRust and it seems to work fine, except for a minor issue: https://github.com/harfbuzz/harfrust/blob/main/docs/ragel.md#the-universal-state-machine |
|
I'm using this version: emoji-segmenter on main
λ ~/.colm-suite/bin/ragel-rust -v
Ragel State Machine Compiler version 7.1.0-pre.1 April 2026
Copyright (c) 2001-2026 by Dr. Adrian D. Thurston et al.
Try to generate Rust code: λ ~/.colm-suite/bin/ragel-rust -F1 -e emoji_presentation_scanner.rl
Details
#include <stdbool.h>
#ifndef EMOJI_LINKAGE
#define EMOJI_LINKAGE static
#endif
static _emoji_presentation_trans_keys: [unsigned char ; 28] = [ 0, 13, 14, 15, 0, 13, 9, 12, 10, 12, 10, 10, 4, 12, 4, 12, 6, 6, 8, 12, 8, 8, 8, 10, 9, 14, 0, 0 ];
static _emoji_presentation_char_class: [i8 ; 19] = [ 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0 , 0 ];
static _emoji_presentation_index_offsets: [i8 ; 15] = [ 0, 14, 16, 30, 34, 37, 38, 47, 56, 57, 62, 63, 66, 0 , 0 ];
static _emoji_presentation_indices: [i8 ; 74] = [ 1, 1, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 4, 5, 7, 8, 9, 10, 5, 11, 12, 13, 11, 11, 11, 11, 11, 14, 5, 15, 16, 17, 15, 18, 19, 15, 19, 18, 18, 18, 18, 18, 15, 18, 19, 19, 0, 0, 0, 0, 5, 15, 16, 17, 5, 21, 5, 15, 22, 23, 16, 26, 25, 15, 5, 15, 16, 17, 18, 4, 0 , 0 ];
static _emoji_presentation_index_defaults: [i8 ; 15] = [ 0, 3, 11, 0, 18, 0, 18, 0, 18, 20, 24, 25, 18, 0 , 0 ];
static _emoji_presentation_cond_targs: [i8 ; 29] = [ 2, 4, 6, 2, 1, 2, 2, 3, 3, 7, 7, 2, 8, 9, 12, 0, 2, 5, 2, 5, 2, 2, 10, 11, 2, 2, 2, 0 , 0 ];
static _emoji_presentation_cond_actions: [i8 ; 29] = [ 1, 2, 2, 3, 0, 4, 0, 7, 2, 7, 2, 8, 0, 7, 2, 0, 9, 10, 11, 2, 12, 13, 0, 10, 14, 15, 16, 0 , 0 ];
static _emoji_presentation_to_state_actions: [i8 ; 15] = [ 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0 ];
static _emoji_presentation_from_state_actions: [i8 ; 15] = [ 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0 ];
static _emoji_presentation_eof_trans: [i8 ; 15] = [ 1, 4, 7, 1, 19, 1, 19, 1, 19, 21, 25, 26, 19, 0 , 0 ];
static emoji_presentation_start : i32 = 2;
static emoji_presentation_en_text_and_emoji_run : i32 = 2;
EMOJI_LINKAGE emoji_text_iter_t
scan_emoji_presentation (emoji_text_iter_t p,
const emoji_text_iter_t pe,
bool* is_emoji,
bool* has_vs)
{
emoji_text_iter_t ts;
emoji_text_iter_t te;
const emoji_text_iter_t eof = pe;
(void)ts;
unsigned act;
int cs;
{
cs = ( emoji_presentation_start ) as i32;
ts = 0;
te = 0;
act = 0;
}
{
let mut _trans = 0;
let mut _keys :i32= 0;
let mut _inds :i32= 0;
let mut _ic = 0;
'_resume: while ( p != pe || p == eof ) {
'_again: while ( true ) {
match ( _emoji_presentation_from_state_actions[(cs) as usize] ) {
6 => {
{{ts = p;
}}
}
_ => {}
}
if ( p == eof ) {
{
if ( _emoji_presentation_eof_trans[(cs) as usize]> 0 ) {
{
_trans = ( _emoji_presentation_eof_trans ) as u32[(cs) as usize]- 1;
}
}
}
}
else {
{
_keys = ( (cs<<1) ) as i32;
_inds = ( _emoji_presentation_index_offsets[(cs) as usize] ) as i32;
if ( ( data[(p ) as usize]
) <= 16 ) {
{
_ic = ( _emoji_presentation_char_class ) as i32[(( ( data[(p ) as usize]
) ) as i32- 0) as usize];
if ( _ic <= ( _emoji_presentation_trans_keys[(_keys+1 ) as usize]
) as i32&& _ic >= ( _emoji_presentation_trans_keys[(_keys ) as usize]
) as i32 ) {
_trans = ( _emoji_presentation_indices[(_inds + ( ( _ic - ( _emoji_presentation_trans_keys[(_keys ) as usize]
) as i32) ) as i32) as usize]
) as u32;
}
else {
_trans = ( _emoji_presentation_index_defaults ) as u32[(cs) as usize];
}
}
}
else {
{
_trans = ( _emoji_presentation_index_defaults ) as u32[(cs) as usize];
}
}
}
}
cs = ( _emoji_presentation_cond_targs ) as i32[(_trans) as usize];
if ( _emoji_presentation_cond_actions[(_trans) as usize]!= 0 ) {
{
match ( _emoji_presentation_cond_actions[(_trans) as usize] ) {
9 => {
{{te = p+1;
{*is_emoji = false; *has_vs = true; return te; }
}}
}
16 => {
{{te = p+1;
{*is_emoji = true; *has_vs = true; return te; }
}}
}
4 => {
{{te = p+1;
{*is_emoji = true; *has_vs = false; return te; }
}}
}
13 => {
{{te = p+1;
{*is_emoji = true; *has_vs = false; return te; }
}}
}
8 => {
{{te = p+1;
{*is_emoji = false; *has_vs = false; return te; }
}}
}
14 => {
{{te = p;
p = p - 1;
{*is_emoji = false; *has_vs = true; return te; }
}}
}
15 => {
{{te = p;
p = p - 1;
{*is_emoji = true; *has_vs = true; return te; }
}}
}
11 => {
{{te = p;
p = p - 1;
{*is_emoji = true; *has_vs = false; return te; }
}}
}
12 => {
{{te = p;
p = p - 1;
{*is_emoji = false; *has_vs = false; return te; }
}}
}
3 => {
{{p = ((te))-1;
{*is_emoji = true; *has_vs = false; return te; }
}}
}
1 => {
{{match ( act ) {
2 => {
p = ((te))-1;
{*is_emoji = true; *has_vs = true; return te; }
}
3 => {
p = ((te))-1;
{*is_emoji = true; *has_vs = false; return te; }
}
5 => {
p = ((te))-1;
{*is_emoji = false; *has_vs = false; return te; }
}
_ => {}
}
}
}
}
10 => {
{{te = p+1;
}}
{{act = 2;
}}
}
2 => {
{{te = p+1;
}}
{{act = 3;
}}
}
7 => {
{{te = p+1;
}}
{{act = 5;
}}
}
_ => {}
}
}
}
break '_again;
}
if ( p == eof ) {
{
if ( cs >= 2 ) {
break '_resume;
}
}
}
else {
{
match ( _emoji_presentation_to_state_actions[(cs) as usize] ) {
5 => {
{{ts = 0;
}}
}
_ => {}
}
p += 1;
continue '_resume;
}
}
break '_resume;
}
}
*is_emoji = false;
*has_vs = false;
return p;
}
It looks like the generated code is not quite correct. BTW: I pulled the latest colm-suite source code and added a patch: Details |
That looks bad indeed. Thanks for the steps. In HarfBuzz / HarfRust we rely heavily on Ragel, and historically its latest versions have indeed been flaky. That's unfortunate. I'll take a look and see if I can make this work (changing emoji-segmenter to move the C/C++ out of the .rl at least). |
emoji module and implemented scan_emoji_presentation; it is based on emoji-segmenter's Ragel grammar. By the way, this PR should help #492.
Related to #617.