Skip to content

WIP: improve colr emoji detection and testing#620

Open
fundon wants to merge 39 commits into
linebender:mainfrom
fundon:improve-emoji-detection-and-testing
Open

WIP: improve colr emoji detection and testing#620
fundon wants to merge 39 commits into
linebender:mainfrom
fundon:improve-emoji-detection-and-testing

Conversation

@fundon
Copy link
Copy Markdown
Contributor

@fundon fundon commented May 6, 2026

  • Added an emoji module and implemented scan_emoji_presentation, which is based on emoji-segmenter's Ragel grammar.
  • Added test cases for scanning emoji presentation.

By the way, this PR should help #492.

Related to #617.

@fundon fundon force-pushed the improve-emoji-detection-and-testing branch from a236fbc to 288fa99 Compare May 6, 2026 06:54
@fundon fundon force-pushed the improve-emoji-detection-and-testing branch from 288fa99 to d7b53ec Compare May 6, 2026 07:00
@fundon fundon marked this pull request as draft May 6, 2026 11:44
@fundon
Copy link
Copy Markdown
Contributor Author

fundon commented May 9, 2026

I tried using Ragel/Colm to generate Rust code, but the generated code cannot be used directly. The quality is much worse than C. It has not been updated for a long time.

@fundon fundon marked this pull request as ready for review May 10, 2026 03:46
@fundon
Copy link
Copy Markdown
Contributor Author

fundon commented May 10, 2026

@conor-93 @DJMcNab Please take a look. Thanks.

@nicoburns
Copy link
Copy Markdown
Collaborator

nicoburns commented May 10, 2026

The hand-written C port (https://github.com/chansen/c-emoji) referenced in google/emoji-segmenter#17 may also be worth looking at.

@fundon fundon changed the title improve colr emoji detection and testing WIP: improve colr emoji detection and testing May 11, 2026
@behdad
Copy link
Copy Markdown

behdad commented May 18, 2026

I tried using Ragel/Colm to generate Rust code, but the generated code cannot be used directly. The quality is much worse than C. It has not been updated for a long time.

Can you elaborate please? We use the ragel Rust output in HarfRust and it seems to work fine, except for a minor issue:

https://github.com/harfbuzz/harfrust/blob/main/docs/ragel.md#the-universal-state-machine

@fundon
Copy link
Copy Markdown
Contributor Author

fundon commented May 18, 2026

I'm using this version:

emoji-segmenter on main
λ ~/.colm-suite/bin/ragel-rust -v
Ragel State Machine Compiler version 7.1.0-pre.1 April 2026
Copyright (c) 2001-2026 by Dr. Adrian D. Thurston et al.

Trying to generate Rust code:

λ ~/.colm-suite/bin/ragel-rust -F1 -e emoji_presentation_scanner.rl
Details
#include <stdbool.h>

#ifndef EMOJI_LINKAGE
#define EMOJI_LINKAGE static
#endif

static _emoji_presentation_trans_keys: [unsigned char ; 28] = [ 0, 13, 14, 15, 0, 13, 9, 12, 10, 12, 10, 10, 4, 12, 4, 12, 6, 6, 8, 12, 8, 8, 8, 10, 9, 14, 0, 0 ];
static _emoji_presentation_char_class: [i8 ; 19] = [ 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0 , 0 ];
static _emoji_presentation_index_offsets: [i8 ; 15] = [ 0, 14, 16, 30, 34, 37, 38, 47, 56, 57, 62, 63, 66, 0 , 0 ];
static _emoji_presentation_indices: [i8 ; 74] = [ 1, 1, 2, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 4, 5, 7, 8, 9, 10, 5, 11, 12, 13, 11, 11, 11, 11, 11, 14, 5, 15, 16, 17, 15, 18, 19, 15, 19, 18, 18, 18, 18, 18, 15, 18, 19, 19, 0, 0, 0, 0, 5, 15, 16, 17, 5, 21, 5, 15, 22, 23, 16, 26, 25, 15, 5, 15, 16, 17, 18, 4, 0 , 0 ];
static _emoji_presentation_index_defaults: [i8 ; 15] = [ 0, 3, 11, 0, 18, 0, 18, 0, 18, 20, 24, 25, 18, 0 , 0 ];
static _emoji_presentation_cond_targs: [i8 ; 29] = [ 2, 4, 6, 2, 1, 2, 2, 3, 3, 7, 7, 2, 8, 9, 12, 0, 2, 5, 2, 5, 2, 2, 10, 11, 2, 2, 2, 0 , 0 ];
static _emoji_presentation_cond_actions: [i8 ; 29] = [ 1, 2, 2, 3, 0, 4, 0, 7, 2, 7, 2, 8, 0, 7, 2, 0, 9, 10, 11, 2, 12, 13, 0, 10, 14, 15, 16, 0 , 0 ];
static _emoji_presentation_to_state_actions: [i8 ; 15] = [ 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0 ];
static _emoji_presentation_from_state_actions: [i8 ; 15] = [ 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0 ];
static _emoji_presentation_eof_trans: [i8 ; 15] = [ 1, 4, 7, 1, 19, 1, 19, 1, 19, 21, 25, 26, 19, 0 , 0 ];
static emoji_presentation_start : i32 = 2;
static emoji_presentation_en_text_and_emoji_run : i32 = 2;
EMOJI_LINKAGE emoji_text_iter_t
scan_emoji_presentation (emoji_text_iter_t p,
const emoji_text_iter_t pe,
bool* is_emoji,
bool* has_vs)
{
	emoji_text_iter_t ts;
	emoji_text_iter_t te;
	const emoji_text_iter_t eof = pe;
	
	(void)ts;
	
	unsigned act;
	int cs;
	
	
	{
		cs = ( emoji_presentation_start ) as i32;
		ts = 0;
		te = 0;
		act = 0;
	}
	
	{
		let mut _trans  = 0;
		let mut _keys :i32= 0;
		let mut _inds :i32= 0;
		let mut _ic = 0;
		'_resume: while ( p != pe || p == eof  ) {
			'_again: while ( true  ) {
				match ( _emoji_presentation_from_state_actions[(cs) as usize] ) {
					6  => {
						{{ts = p;
							}}
						
					}
					
					_ => {}
				}
				if ( p == eof  ) {
					{
						if ( _emoji_presentation_eof_trans[(cs) as usize]> 0  ) {
							{
								_trans = ( _emoji_presentation_eof_trans ) as u32[(cs) as usize]- 1;
							}
							
						}
					}
					
				}
				else {
					{
						_keys = ( (cs<<1)  ) as i32;
						_inds = ( _emoji_presentation_index_offsets[(cs) as usize] ) as i32;
						if ( ( data[(p ) as usize]
						) <= 16  ) {
							{
								_ic = ( _emoji_presentation_char_class ) as i32[(( ( data[(p ) as usize]
								)  ) as i32- 0) as usize];
								if ( _ic <= ( _emoji_presentation_trans_keys[(_keys+1 ) as usize]
								) as i32&& _ic >= ( _emoji_presentation_trans_keys[(_keys ) as usize]
								) as i32 ) {
									_trans = ( _emoji_presentation_indices[(_inds + ( ( _ic - ( _emoji_presentation_trans_keys[(_keys ) as usize]
									) as i32)   ) as i32) as usize]
									) as u32;
									
								}
								else {
									_trans = ( _emoji_presentation_index_defaults ) as u32[(cs) as usize];
									
								}
							}
							
						}
						else {
							{
								_trans = ( _emoji_presentation_index_defaults ) as u32[(cs) as usize];
							}
							
							
						}
					}
					
				}
				cs = ( _emoji_presentation_cond_targs ) as i32[(_trans) as usize];
				if ( _emoji_presentation_cond_actions[(_trans) as usize]!= 0  ) {
					{
					
						match ( _emoji_presentation_cond_actions[(_trans) as usize] ) {
							9  => {
								{{te = p+1;
										{*is_emoji = false; *has_vs = true; return te; }
									}}
								
							}
							16  => {
								{{te = p+1;
										{*is_emoji = true; *has_vs = true; return te; }
									}}
								
							}
							4  => {
								{{te = p+1;
										{*is_emoji = true; *has_vs = false; return te; }
									}}
								
							}
							13  => {
								{{te = p+1;
										{*is_emoji = true;  *has_vs = false; return te; }
									}}
								
							}
							8  => {
								{{te = p+1;
										{*is_emoji = false; *has_vs = false; return te; }
									}}
								
							}
							14  => {
								{{te = p;
										p = p - 1;
										{*is_emoji = false; *has_vs = true; return te; }
									}}
								
							}
							15  => {
								{{te = p;
										p = p - 1;
										{*is_emoji = true; *has_vs = true; return te; }
									}}
								
							}
							11  => {
								{{te = p;
										p = p - 1;
										{*is_emoji = true; *has_vs = false; return te; }
									}}
								
							}
							12  => {
								{{te = p;
										p = p - 1;
										{*is_emoji = false; *has_vs = false; return te; }
									}}
								
							}
							3  => {
								{{p = ((te))-1;
										{*is_emoji = true; *has_vs = false; return te; }
									}}
								
							}
							1  => {
								{{match ( act  ) {
											2  => {
												p = ((te))-1;
												{*is_emoji = true; *has_vs = true; return te; }
												
											}
											3  => {
												p = ((te))-1;
												{*is_emoji = true; *has_vs = false; return te; }
												
											}
											5  => {
												p = ((te))-1;
												{*is_emoji = false; *has_vs = false; return te; }
												
											}
											
											_ => {}
										}
									}
								}
								
							}
							10  => {
								{{te = p+1;
									}}
								{{act = 2;
									}}
								
							}
							2  => {
								{{te = p+1;
									}}
								{{act = 3;
									}}
								
							}
							7  => {
								{{te = p+1;
									}}
								{{act = 5;
									}}
								
							}
							
							_ => {}
						}
					}
					
				}
				break '_again;
				
			}
			if ( p == eof  ) {
				{
					if ( cs >= 2  ) {
						break '_resume;
						
					}
				}
				
			}
			else {
				{
					match ( _emoji_presentation_to_state_actions[(cs) as usize] ) {
						5  => {
							{{ts = 0;
								}}
							
						}
						
						_ => {}
					}
					p += 1;
					continue '_resume;
				}
				
			}
			break '_resume;
			
		}
	}
	*is_emoji = false;
	*has_vs = false;
	return p;
}

It looks like the generated code is not quite correct.

BTW: I pulled the latest colm-suite source code and added a patch:

Details
diff --git a/configure.ac b/configure.ac
index 60e95fd3..876407f9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -20,7 +20,7 @@ dnl LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM
 dnl OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 dnl SOFTWARE.
   
-AC_INIT([colm-suite],[0.15.0-pre.1])
+AC_INIT([colm-suite],[0.14.7])
 PUBDATE="April 2026"
 
 AM_INIT_AUTOMAKE([foreign])
diff --git a/release.Dockerfile b/release.Dockerfile
index 2cbaa499..c883f499 100644
--- a/release.Dockerfile
+++ b/release.Dockerfile
@@ -13,7 +13,7 @@ RUN apt-get update && apt-get install -y \
 RUN curl https://www.colm.net/files/thurston.asc | gpg --import -
 
 WORKDIR /build
-ENV COLM_SUITE_VERSION=0.15.0-pre.1
+ENV COLM_SUITE_VERSION=0.14.7
 RUN curl -O https://www.colm.net/files/colm-suite/colm-suite-${COLM_SUITE_VERSION}.tar.gz
 RUN curl -O https://www.colm.net/files/colm-suite/colm-suite-${COLM_SUITE_VERSION}.tar.gz.asc
 RUN gpg --verify colm-suite-${COLM_SUITE_VERSION}.tar.gz.asc colm-suite-${COLM_SUITE_VERSION}.tar.gz
diff --git a/src/ragel/host-rust/main.cc b/src/ragel/host-rust/main.cc
index 106d3566..f904dd87 100644
--- a/src/ragel/host-rust/main.cc
+++ b/src/ragel/host-rust/main.cc
@@ -36,13 +36,14 @@ const char *defaultOutFnRust( const char *inputFileName )
 
 HostType hostTypesRust[] =
 {
-	{ "u8",    0,  "byte",      false,   true,  false,  0, 0,  0, UCHAR_MAX, 1 },
+	{ "char",      0,      "char",    (CHAR_MIN != 0), true,  false,  SCHAR_MIN, SCHAR_MAX,  0, UCHAR_MAX,  sizeof(char) },
+	{ "unsigned", "char",  "char",   false,           true,  false,  0, 0,                  0, UCHAR_MAX,  sizeof(unsigned char) },
 };
 
 const HostLang hostLangRust =
 {
 	hostTypesRust,
-	1,
+	2,
 	0,
 	false,
 	true,     /* loopLabels */

@behdad
Copy link
Copy Markdown

behdad commented May 18, 2026

It looks like the generated code is not quite correct.

That looks bad indeed. Thanks for the steps.

In HarfBuzz / HarfRust we heavily rely on ragel, and historically its latest versions have indeed been flaky. That's unfortunate. I'll take a look and see if I can make this work (changing emoji-segmenter to remove the C/C++ from the .rl at least).

cc @drott @adrian-thurston

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants