diff --git a/fontique/src/collection/mod.rs b/fontique/src/collection/mod.rs index 08ad5af76..ddbd53f66 100644 --- a/fontique/src/collection/mod.rs +++ b/fontique/src/collection/mod.rs @@ -318,22 +318,19 @@ impl Inner { /// Returns the family object for the given family identifier. pub fn family(&mut self, id: FamilyId) -> Option { self.sync_shared(); + if let Some(family) = self.data.families.get(&id) { - family.as_ref().cloned() - } else { - #[cfg(feature = "system")] - if let Some(system) = &self.system { - let family = system.fonts.lock().unwrap().family(id); - self.data.families.insert(id, family.clone()); - family - } else { - None - } - #[cfg(not(feature = "system"))] - { - None - } + return family.as_ref().cloned(); } + + #[cfg(feature = "system")] + if let Some(system) = &self.system { + let family = system.fonts.lock().unwrap().family(id); + self.data.families.insert(id, family.clone()); + return family; + } + + None } /// Returns the family object for the given name. diff --git a/parley/src/analysis/cluster.rs b/parley/src/analysis/cluster.rs index c2d6228e3..9890faec0 100644 --- a/parley/src/analysis/cluster.rs +++ b/parley/src/analysis/cluster.rs @@ -4,7 +4,7 @@ use alloc::vec::Vec; use icu_normalizer::properties::Decomposed; -use crate::analysis::AnalysisDataSources; +use crate::{analysis::AnalysisDataSources, emoji::EmojiPresentationStyle}; /// The maximum number of characters in a single cluster. const MAX_CLUSTER_SIZE: usize = 32; @@ -12,11 +12,11 @@ const MAX_CLUSTER_SIZE: usize = 32; #[derive(Debug, Default)] pub(crate) struct CharCluster { pub chars: Vec, - pub is_emoji: bool, pub map_len: u8, pub start: u32, pub end: u32, pub force_normalize: bool, + pub emoji_presentation_style: EmojiPresentationStyle, comp: Form, decomp: Form, form: FormKind, @@ -52,6 +52,8 @@ pub(crate) struct Char { /// Indexes into the list of styles for the containing text run, to find the style applicable /// to this character. 
pub style_index: u16, + /// Whether this character is an emoji presentation selector (VS15 or VS16). + pub is_emoji_presentation_selector: bool, } pub(crate) type GlyphId = u16; @@ -93,7 +95,6 @@ pub(crate) enum Status { impl CharCluster { pub(crate) fn clear(&mut self) { self.chars.clear(); - self.is_emoji = false; self.map_len = 0; self.start = 0; self.end = 0; @@ -102,6 +103,7 @@ impl CharCluster { self.decomp.clear(); self.form = FormKind::Original; self.best_ratio = 0.; + self.emoji_presentation_style = EmojiPresentationStyle::Default; } #[inline(always)] @@ -351,17 +353,23 @@ impl<'a> Mapper<'a> { } let mut mapped = 0; for (c, g) in self.chars.iter().zip(glyphs.iter_mut()) { - if !c.contributes_to_shaping { - *g = f(c.ch); - if self.map_len == 1 { - mapped += 1; - } - } else { - let gid = f(c.ch); - *g = gid; - if gid != 0 { + *g = f(c.ch); + + // A presentation selector always counts as mapped, whatever glyph the font returned for it. + if c.is_emoji_presentation_selector { + mapped += 1; + continue; + } + + if c.contributes_to_shaping { + if *g != 0 { mapped += 1; } + continue; + } + + if self.map_len == 1 { + mapped += 1; } } let ratio = mapped as f32 / self.map_len as f32; diff --git a/parley/src/analysis/mod.rs b/parley/src/analysis/mod.rs index 460e78d96..789cfd3e0 100644 --- a/parley/src/analysis/mod.rs +++ b/parley/src/analysis/mod.rs @@ -22,7 +22,7 @@ use icu_segmenter::{ GraphemeClusterSegmenter, GraphemeClusterSegmenterBorrowed, LineSegmenter, LineSegmenterBorrowed, WordSegmenter, WordSegmenterBorrowed, }; -use parley_data::Properties; +use parley_data::{Properties, emoji::EmojiProperties}; pub(crate) struct AnalysisDataSources; @@ -92,6 +92,11 @@ impl AnalysisDataSources { fn brackets(&self) -> CodePointMapDataBorrowed<'_, BidiMirroringGlyph> { const { CodePointMapData::new() } } + + #[inline(always)] + pub(crate) const fn emoji_properties(&self, c: char) -> EmojiProperties { + EmojiProperties::get(c) + } } #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -106,6 +111,8 @@ pub(crate) struct 
CharInfo { pub bidi_class: icu_properties::props::BidiClass, /// Whether or not the character is a bracket, plus mirror data if so. pub bracket: BidiMirroringGlyph, + /// The emoji properties of this character. + pub emoji_properties: EmojiProperties, flags: u8, } @@ -139,6 +146,7 @@ impl CharInfo { grapheme_cluster_break: GraphemeClusterBreak, bidi_class: icu_properties::props::BidiClass, bracket: BidiMirroringGlyph, + emoji_properties: EmojiProperties, is_variation_selector: bool, is_region_indicator: bool, is_control: bool, @@ -152,6 +160,7 @@ impl CharInfo { grapheme_cluster_break, bidi_class, bracket, + emoji_properties, flags: (is_variation_selector as u8) << Self::VARIATION_SELECTOR_SHIFT | (is_region_indicator as u8) << Self::REGION_INDICATOR_SHIFT | (is_control as u8) << Self::CONTROL_SHIFT @@ -429,6 +438,7 @@ pub(crate) fn analyze_text(lcx: &mut LayoutContext, mut text: &str) }); let properties = |c| lcx.analysis_data_sources.properties(c); + let emoji_properties = |c| lcx.analysis_data_sources.emoji_properties(c); let mut needs_bidi_resolution = false; @@ -448,6 +458,11 @@ pub(crate) fn analyze_text(lcx: &mut LayoutContext, mut text: &str) let is_variation_selector = properties.is_variation_selector(); let is_region_indicator = properties.is_region_indicator(); let next_mandatory_linebreak = properties.is_mandatory_linebreak(); + let emoji_properties = if is_emoji_or_pictograph { + emoji_properties(ch) + } else { + EmojiProperties::ZERO + }; let boundary = if is_mandatory_linebreak { Boundary::Mandatory @@ -479,6 +494,7 @@ pub(crate) fn analyze_text(lcx: &mut LayoutContext, mut text: &str) grapheme_cluster_break, bidi_class, bracket, + emoji_properties, is_variation_selector, is_region_indicator, general_category == GeneralCategory::Control, diff --git a/parley/src/emoji/dfa.rs b/parley/src/emoji/dfa.rs new file mode 100644 index 000000000..1275eef6d --- /dev/null +++ b/parley/src/emoji/dfa.rs @@ -0,0 +1,256 @@ +// Copyright 2026 the Parley Authors +// 
SPDX-License-Identifier: Apache-2.0 OR MIT + +use super::types::{EmojiPresentationStyle, EmojiSegmentationCategory, EmojiSequence, EmojiState}; + +/// The transition table for the emoji DFA, indexed as `[state][category]`; unset entries are `Reject` (0). +/// +/// +static DFA_TRANS: [[u8; 14]; 13] = { + use EmojiSegmentationCategory as Category; + use EmojiState as State; + + let mut t = [[0; 14]; 13]; // 13 states x 14 categories; the `None` column stays all-`Reject` + + /// Adds a state transition to the DFA transition table. + macro_rules! add { + ($state:expr, $category:expr, $next_state:expr) => { + t[$state.as_usize()][$category.as_usize()] = $next_state.as_u8() + }; + } + + // Text and Emoji presentation sequences + { + add!(State::Start, Category::Emoji, State::Emoji); + + add!(State::Start, Category::EmojiPresentation, State::Emoji); + + // Text presentation sequence + // + // + add!(State::Emoji, Category::Vs15, State::Terminal); + + // Emoji presentation sequence + // + // + add!(State::Emoji, Category::Vs16, State::OptionalZwj); + + // ZWJ + add!(State::Emoji, Category::Zwj, State::Zwj); + } + + // Emoji modifier sequence + // + // + { + add!( + State::Start, + Category::EmojiModifierBase, + State::EmojiModifierBase + ); + + add!(State::EmojiModifierBase, Category::Vs16, State::OptionalZwj); + add!(State::EmojiModifierBase, Category::Zwj, State::Zwj); + add!( + State::EmojiModifierBase, + Category::EmojiModifier, + State::OptionalZwj + ); + + // other + add!(State::Start, Category::EmojiModifier, State::Terminal); + } + + // Emoji flag sequence -- A sequence of two Regional Indicator characters. + // + // + { + add!(State::Start, Category::Ri, State::Ri); + + add!(State::Ri, Category::Ri, State::Terminal); + } + + // Emoji tag sequence (ETS). 
+ // + // + { + add!(State::Start, Category::TagBase, State::TagBase); + + add!(State::TagBase, Category::Vs15, State::Terminal); + add!(State::TagBase, Category::Vs16, State::OptionalZwj); + add!(State::TagBase, Category::TagSpec, State::TagSpec); + add!(State::TagBase, Category::TagEnd, State::TagEmpty); // without any `TagSpec` + add!(State::TagBase, Category::Zwj, State::Zwj); + + // (seq)+ + add!(State::TagSpec, Category::TagSpec, State::TagSpec); + add!(State::TagSpec, Category::TagEnd, State::Terminal); + } + + // Emoji keycap sequence. + // + // + { + add!(State::Start, Category::KeycapBase, State::KeycapBase); + + add!(State::KeycapBase, Category::KeycapEnd, State::Terminal); + add!(State::KeycapBase, Category::Vs15, State::KeycapVs); + add!(State::KeycapBase, Category::Vs16, State::KeycapVs); + + add!(State::KeycapVs, Category::KeycapEnd, State::Terminal); + } + + // Emoji ZWJ sequence. + // + // + { + add!(State::OptionalZwj, Category::Zwj, State::Zwj); + + // (zwj emoji_zwj_element)+ + add!(State::Zwj, Category::Emoji, State::Emoji); + add!(State::Zwj, Category::EmojiPresentation, State::Emoji); + add!( + State::Zwj, + Category::EmojiModifierBase, + State::EmojiModifierBase + ); + } + + t +}; + +#[derive(Clone, Copy, Debug)] +pub(crate) struct EmojiDFA { + state: EmojiState, + // Bitsets of the (states, categories) recorded by `step_record`, one bit per discriminant. + recorded: (u16, u16), +} + +impl EmojiDFA { + const DEFAULT: Self = Self { + state: EmojiState::Start, + recorded: (0, 0), + }; + + #[inline] + pub(crate) const fn new() -> Self { + Self::DEFAULT + } + + #[inline] + pub(crate) const fn step(&mut self, category: EmojiSegmentationCategory) { + self.state = EmojiState::from_u8(DFA_TRANS[self.state.as_usize()][category.as_usize()]); + } + + #[inline] + pub(crate) const fn step_record(&mut self, category: EmojiSegmentationCategory) { + self.step(category); + + if self.is_rejected() || self.is_started() { + return; + } + + self.recorded.0 |= 1 << self.state.as_u8(); + self.recorded.1 |= 1 << category.as_u8(); + } 
+ + #[inline] + pub(crate) const fn is_rejected(self) -> bool { + self.state.eq(EmojiState::Reject) + } + + #[inline] + pub(crate) const fn is_started(self) -> bool { + self.state.eq(EmojiState::Start) + } + + #[allow(unused)] + #[inline] + pub(crate) const fn is_accepting(self) -> bool { + const START: u8 = EmojiState::Terminal.as_u8(); + const END: u8 = EmojiState::Ri.as_u8(); + + let cur = self.state.as_u8(); + + START <= cur && cur <= END + } + + #[inline] + pub(crate) const fn contains_state(self, state: EmojiState) -> bool { + self.recorded.0 & (1 << state.as_u8()) != 0 + } + + #[inline] + pub(crate) const fn contains_category(self, category: EmojiSegmentationCategory) -> bool { + self.recorded.1 & (1 << category.as_u8()) != 0 + } + + #[inline] + pub(crate) const fn sequence(self) -> EmojiSequence { + if self.contains_category(EmojiSegmentationCategory::Zwj) { + return EmojiSequence::Zwj; + } + + if self.contains_state(EmojiState::TagBase) + && self.contains_state(EmojiState::Terminal) + && !self.contains_category(EmojiSegmentationCategory::Vs15) + { + return EmojiSequence::Tag; + } + + if self.contains_state(EmojiState::Ri) && self.contains_state(EmojiState::Terminal) { + return EmojiSequence::Flag; + } + + if self.contains_category(EmojiSegmentationCategory::EmojiModifierBase) + && self.contains_category(EmojiSegmentationCategory::EmojiModifier) + { + return EmojiSequence::Modifier; + } + + if self.contains_category(EmojiSegmentationCategory::KeycapBase) + && self.contains_category(EmojiSegmentationCategory::Vs16) + && self.contains_category(EmojiSegmentationCategory::KeycapEnd) + { + return EmojiSequence::Keycap; + } + + if self.contains_category(EmojiSegmentationCategory::KeycapEnd) + && self.contains_category(EmojiSegmentationCategory::Vs16) + { + return EmojiSequence::Keycap; + } + + EmojiSequence::Basic + } + + #[inline] + pub(crate) const fn presentation_style(self) -> EmojiPresentationStyle { + if 
self.contains_category(EmojiSegmentationCategory::Vs15) { + return EmojiPresentationStyle::Text; + } + if self.contains_category(EmojiSegmentationCategory::Vs16) { + return EmojiPresentationStyle::Emoji; + } + + if self.contains_category(EmojiSegmentationCategory::EmojiPresentation) { + return EmojiPresentationStyle::Emoji; + } + + if !self.sequence().eq(EmojiSequence::Basic) { + return EmojiPresentationStyle::Emoji; + } + + // single emoji modifier; e.g. 🏻 + if self.contains_category(EmojiSegmentationCategory::EmojiModifier) { + return EmojiPresentationStyle::Emoji; + } + + // single emoji modifier base; e.g. ☝. NOTE(review): emoji-default bases such as 👦 also reach this branch (`from_codepoint` tests modifier-base first) -- confirm `Text` is intended. + if self.contains_category(EmojiSegmentationCategory::EmojiModifierBase) { + return EmojiPresentationStyle::Text; + } + + EmojiPresentationStyle::Default + } +} diff --git a/parley/src/emoji/mod.rs b/parley/src/emoji/mod.rs new file mode 100644 index 000000000..634b73d95 --- /dev/null +++ b/parley/src/emoji/mod.rs @@ -0,0 +1,15 @@ +// Copyright 2026 the Parley Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! This implementation is based on [emoji segmenter]'s Ragel grammar (Apache-2.0). +//! +//! It follows [UTS51] (Unicode Technical Standard #51). +//! +//! [emoji segmenter]: +//! 
[UTS51]: + +mod dfa; +mod types; + +pub(crate) use dfa::EmojiDFA; +pub(crate) use types::{EmojiPresentationStyle, EmojiSegmentationCategory}; diff --git a/parley/src/emoji/types.rs b/parley/src/emoji/types.rs new file mode 100644 index 000000000..0b796e174 --- /dev/null +++ b/parley/src/emoji/types.rs @@ -0,0 +1,221 @@ +// Copyright 2026 the Parley Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use parley_data::emoji::EmojiProperties; + +#[repr(u8)] +#[derive(Clone, Copy, Debug, PartialEq)] +pub(crate) enum EmojiState { + Reject = 0, + Start, + + Terminal, + Emoji, + EmojiModifierBase, + OptionalZwj, + KeycapVs, + TagBase, + /// `RegionalIndicator` + Ri, + + TagSpec, + TagEmpty, + KeycapBase, + Zwj, +} + +impl EmojiState { + #[inline] + pub(crate) const fn from_u8(value: u8) -> Self { + match value { + 1 => Self::Start, + 2 => Self::Terminal, + 3 => Self::Emoji, + 4 => Self::EmojiModifierBase, + 5 => Self::OptionalZwj, + 6 => Self::KeycapVs, + 7 => Self::TagBase, + 8 => Self::Ri, + 9 => Self::TagSpec, + 10 => Self::TagEmpty, + 11 => Self::KeycapBase, + 12 => Self::Zwj, + _ => Self::Reject, + } + } + + #[inline] + pub(crate) const fn as_usize(self) -> usize { + self as usize + } + + #[inline] + pub(crate) const fn as_u8(self) -> u8 { + self as u8 + } + + #[inline] + pub(crate) const fn eq(self, other: Self) -> bool { + self.as_u8() == other.as_u8() + } +} + +impl core::ops::Index for [T] { + type Output = T; + + #[inline] + fn index(&self, index: EmojiState) -> &T { + &self[index.as_usize()] + } +} + +impl core::ops::IndexMut for [T] { + #[inline] + fn index_mut(&mut self, index: EmojiState) -> &mut T { + &mut self[index.as_usize()] + } +} + +/// Represents the category of an emoji segmentation. 
+#[repr(u8)] +#[derive(Clone, Copy, Debug, PartialEq)] +pub(crate) enum EmojiSegmentationCategory { + Emoji = 0, + EmojiPresentation, + EmojiModifier, + EmojiModifierBase, + KeycapBase, + KeycapEnd, + TagBase, + TagSpec, + TagEnd, + /// `RegionalIndicator` + Ri, + Vs15, + Vs16, + Zwj, + None, +} + +impl EmojiSegmentationCategory { + /// Returns the category of the given codepoint and flags. + /// + /// + #[inline] + pub(crate) fn from_codepoint(cp: u32, properties: EmojiProperties) -> Self { + match cp { + // '0'..'9', '#', '*' + 0x30..=0x39 | 0x23 | 0x2A => Self::KeycapBase, + 0x200D => Self::Zwj, + 0x20E3 => Self::KeycapEnd, + 0xFE0E => Self::Vs15, + 0xFE0F => Self::Vs16, + 0x1F3F4 => Self::TagBase, + 0xE0030..=0xE0039 | 0xE0061..=0xE007A => Self::TagSpec, + 0xE007F => Self::TagEnd, + _ => { + if properties.is_regional_indicator() { + return Self::Ri; + } + + if properties.is_emoji_modifier_base() { + return Self::EmojiModifierBase; + } + + if properties.is_emoji_modifier() { + return Self::EmojiModifier; + } + + if properties.is_emoji_presentation() { + return Self::EmojiPresentation; + } + + if properties.is_emoji() { + return Self::Emoji; + } + + Self::None + } + } + } + + #[inline] + pub(crate) const fn as_usize(self) -> usize { + self as usize + } + + #[inline] + pub(crate) const fn as_u8(self) -> u8 { + self as u8 + } + + #[inline] + pub(crate) const fn eq(self, other: Self) -> bool { + self.as_u8() == other.as_u8() + } +} + +impl core::ops::Index for [T] { + type Output = T; + + #[inline] + fn index(&self, index: EmojiSegmentationCategory) -> &T { + &self[index.as_usize()] + } +} + +impl core::ops::IndexMut for [T] { + #[inline] + fn index_mut(&mut self, index: EmojiSegmentationCategory) -> &mut T { + &mut self[index.as_usize()] + } +} + +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Debug)] +pub(crate) enum EmojiSequence { + Basic, + Keycap, + Modifier, + Flag, + Zwj, + Tag, +} + +impl EmojiSequence { + #[inline] + pub(crate) const fn as_u8(self) -> u8 { 
+ self as u8 + } + + #[inline] + pub(crate) const fn eq(self, other: Self) -> bool { + self.as_u8() == other.as_u8() + } +} + +#[repr(u8)] +#[derive(Clone, Copy, PartialEq, Default, Debug)] +pub(crate) enum EmojiPresentationStyle { + Emoji, + Text, + #[default] + Default, +} + +impl EmojiPresentationStyle { + #[inline] + pub(crate) const fn is_emoji(self) -> bool { + self.eq(Self::Emoji) + } + + #[inline] + pub(crate) const fn as_u8(self) -> u8 { + self as u8 + } + + #[inline] + pub(crate) const fn eq(self, other: Self) -> bool { + self.as_u8() == other.as_u8() + } +} diff --git a/parley/src/lib.rs b/parley/src/lib.rs index ac8269ed1..70226720e 100644 --- a/parley/src/lib.rs +++ b/parley/src/lib.rs @@ -113,6 +113,7 @@ mod bidi; mod builder; mod context; mod convert; +mod emoji; mod font; mod inline_box; mod lru_cache; diff --git a/parley/src/shape/mod.rs b/parley/src/shape/mod.rs index cd451001e..569309477 100644 --- a/parley/src/shape/mod.rs +++ b/parley/src/shape/mod.rs @@ -14,6 +14,7 @@ use super::style::{Brush, FontFeature, FontVariation}; use crate::analysis::cluster::{Char, CharCluster, Status}; use crate::analysis::{AnalysisDataSources, CharInfo}; use crate::convert::script_to_harfrust; +use crate::emoji::{EmojiDFA, EmojiSegmentationCategory}; use crate::inline_box::InlineBox; use crate::lru_cache::LruCache; use crate::util::nearly_eq; @@ -234,36 +235,40 @@ fn fill_cluster_in_place( char_cluster.clear(); let mut force_normalize = false; - let mut is_emoji_or_pictograph = false; let mut map_len: u8 = 0; let start = *code_unit_offset_in_string as u32; + let mut is_emoji = false; + let mut emoji_dfa = EmojiDFA::new(); + for ((_, ch), (info, style_index)) in segment_text.char_indices().zip(item_infos_iter.by_ref()) { + *code_unit_offset_in_string += ch.len_utf8(); force_normalize |= info.force_normalize(); + // TODO - make emoji detection more complete, as per (except using composite Trie tables as // much as possible: // 
https://github.com/conor-93/parley/blob/4637d826732a1a82bbb3c904c7f47a16a21cceec/parley/src/shape/mod.rs#L221-L269 - is_emoji_or_pictograph |= info.is_emoji_or_pictograph(); - *code_unit_offset_in_string += ch.len_utf8(); // TODO: Explore ignoring other modifiers in determining `contributes_to_shaping`: // regional indicators, subdivision flag tag sequences, skin tone modifiers // See also: https://github.com/google/emoji-segmenter - // If the color emoji has a non-printing variation selector, ignore the variation selector. - // Its presentation depends on the platform and font. - // - // e.g. - // - `U+270C + U+FE0F`: `โœŒ`, force basic presentation - // - `U+270C + U+FE0F`: `โœŒ๏ธ`, force emoji presentation - // - // - let is_emoji_with_non_printing_variation_selector = - is_emoji_or_pictograph && info.is_variation_selector(); - - let contributes_to_shaping = - info.contributes_to_shaping() && !is_emoji_with_non_printing_variation_selector; + is_emoji |= info.is_emoji_or_pictograph(); + + let mut is_emoji_presentation_selector = false; + + if is_emoji { + let category = + EmojiSegmentationCategory::from_codepoint(ch as u32, info.emoji_properties); + + is_emoji_presentation_selector = category.eq(EmojiSegmentationCategory::Vs16) + || category.eq(EmojiSegmentationCategory::Vs15); + + emoji_dfa.step_record(category); + } + + let contributes_to_shaping = info.contributes_to_shaping(); if contributes_to_shaping { map_len += 1; } @@ -274,16 +279,20 @@ fn fill_cluster_in_place( glyph_id: 0, style_index: *style_index, is_control_character: info.is_control(), + is_emoji_presentation_selector, }); } // Finalize cluster metadata let end = *code_unit_offset_in_string as u32; - char_cluster.is_emoji = is_emoji_or_pictograph; char_cluster.map_len = map_len; char_cluster.start = start; char_cluster.end = end; char_cluster.force_normalize = force_normalize; + + if is_emoji { + char_cluster.emoji_presentation_style = emoji_dfa.presentation_style(); + } } fn shape_item<'a, B: 
Brush>( @@ -570,7 +579,7 @@ impl<'a, 'b, B: Brush> FontSelector<'a, 'b, B> { analysis_data_sources: &AnalysisDataSources, ) -> Option { let style_index = cluster.style_index(); - let is_emoji = cluster.is_emoji; + let is_emoji = cluster.emoji_presentation_style.is_emoji(); if style_index != self.style_index || is_emoji || self.fonts_id.is_none() { self.style_index = style_index; let style = &self.styles[style_index as usize]; diff --git a/parley/src/tests/mod.rs b/parley/src/tests/mod.rs index 52b77f872..f6f46b89d 100644 --- a/parley/src/tests/mod.rs +++ b/parley/src/tests/mod.rs @@ -3,4 +3,5 @@ mod test_analysis; mod test_builders; +mod test_emoji_segmenters; mod utils; diff --git a/parley/src/tests/test_analysis.rs b/parley/src/tests/test_analysis.rs index 202e4f11a..1e80d97d4 100644 --- a/parley/src/tests/test_analysis.rs +++ b/parley/src/tests/test_analysis.rs @@ -1180,7 +1180,7 @@ fn test_whitespace_contiguous_interspersed_in_latin_mixed() { } #[test] -fn test_color_emoji_with_non_printing_variation_selector() { +fn test_color_emoji_with_presentation() { verify_analysis("\u{270c}\u{fe0f}", |_| {}) .expect_is_emoji_or_pictograph_list(vec![true, false]) .expect_is_variation_selector_list(vec![false, true]); diff --git a/parley/src/tests/test_emoji_segmenters.rs b/parley/src/tests/test_emoji_segmenters.rs new file mode 100644 index 000000000..34aa85ae3 --- /dev/null +++ b/parley/src/tests/test_emoji_segmenters.rs @@ -0,0 +1,615 @@ +// Copyright 2026 the Parley Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Tests extracted from the [emoji segmenter]. +//! +//! 
[emoji segmenter]: + +use alloc::vec::Vec; +use core::char; + +use crate::{ + analysis::AnalysisDataSources, + emoji::{EmojiDFA, EmojiPresentationStyle, EmojiSegmentationCategory}, +}; + +struct TestEntity<'a> { + sequence: &'a [u32], + categories: &'a [EmojiSegmentationCategory], + style: EmojiPresentationStyle, +} + +fn assert_emoji_segmenters_produce_same_result(entity: TestEntity<'_>) { + let analysis = AnalysisDataSources::new(); + + let mut emoji_dfa = EmojiDFA::new(); + + let result = entity + .sequence + .iter() + .copied() + .map(|cp| { + let ch = char::from_u32(cp).expect("test sequences contain only valid scalar values"); + let emoji_properties = analysis.emoji_properties(ch); + + let category = EmojiSegmentationCategory::from_codepoint(cp, emoji_properties); + + emoji_dfa.step_record(category); + + category + }) + .collect::<Vec<_>>(); + + assert_eq!(result, entity.categories); + assert_eq!(emoji_dfa.presentation_style(), entity.style); +} + +// Emoji presentation default; Encoded: 😀 +#[test] +fn emoji_presentation_default() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F600, // GRINNING FACE + ], + categories: &[EmojiSegmentationCategory::EmojiPresentation], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Text presentation default (copyright); Encoded: © +#[test] +fn text_presentation_default() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x00A9, // COPYRIGHT SIGN + ], + categories: &[EmojiSegmentationCategory::Emoji], + style: EmojiPresentationStyle::Default, + }); +} + +// Lone keycap base; Encoded: 1 +#[test] +fn lone_keycap_base() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[0x0031], // DIGIT ONE + categories: &[EmojiSegmentationCategory::KeycapBase], + style: EmojiPresentationStyle::Default, + }); +} + +// Keycap base + VS-15 (no term); Encoded: 1︎ +#[test] +fn keycap_base_vs15() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x0031, // DIGIT ONE + 0xFE0E, // 
VARIATION SELECTOR-15 + ], + categories: &[ + EmojiSegmentationCategory::KeycapBase, + EmojiSegmentationCategory::Vs15, + ], + style: EmojiPresentationStyle::Text, + }); +} + +// Keycap base + VS-16 (no term); Encoded: 1๏ธ +#[test] +fn keycap_base_vs16() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x0031, // DIGIT ONE + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::KeycapBase, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Unqualified keycap; Encoded: #โƒฃ +#[test] +fn unqualified_keycap() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x0023, // NUMBER SIGN + 0x20E3, // COMBINING ENCLOSING KEYCAP + ], + categories: &[ + EmojiSegmentationCategory::KeycapBase, + EmojiSegmentationCategory::KeycapEnd, + ], + style: EmojiPresentationStyle::Default, + }); +} + +// Keycap + VS-15 + term; Encoded: 1๏ธŽโƒฃ +#[test] +fn keycap_vs15_term() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x0031, // DIGIT ONE + 0xFE0E, // VARIATION SELECTOR-15 + 0x20E3, // COMBINING ENCLOSING KEYCAP + ], + categories: &[ + EmojiSegmentationCategory::KeycapBase, + EmojiSegmentationCategory::Vs15, + EmojiSegmentationCategory::KeycapEnd, + ], + style: EmojiPresentationStyle::Text, + }); +} + +// Qualified keycap; Encoded: *๏ธโƒฃ +#[test] +fn qualified_keycap() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x002A, // ASTERISK + 0xFE0F, // VARIATION SELECTOR-16 + 0x20E3, // COMBINING ENCLOSING KEYCAP + ], + categories: &[ + EmojiSegmentationCategory::KeycapBase, + EmojiSegmentationCategory::Vs16, + EmojiSegmentationCategory::KeycapEnd, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Lone emoji modifier (Fitzpatrick); Encoded: ๐Ÿป +#[test] +fn lone_emoji_modifier() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3FB, // EMOJI MODIFIER 
FITZPATRICK TYPE-1-2 + ], + categories: &[EmojiSegmentationCategory::EmojiModifier], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Bare modifier base, text default; Encoded: โ˜ +#[test] +fn bare_modifier_base_text_default() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x261D, // WHITE UP POINTING INDEX + ], + categories: &[EmojiSegmentationCategory::EmojiModifierBase], + style: EmojiPresentationStyle::Text, + }); +} + +// Modifier base (text default) + VS-16; Encoded: โ˜๏ธ +#[test] +fn modifier_base_text_default_vs16() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x261D, // WHITE UP POINTING INDEX + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Modifier base (text default) + skin tone; Encoded: โ˜๐Ÿป +#[test] +fn modifier_base_text_default_skin_tone() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x261D, // WHITE UP POINTING INDEX + 0x1F3FB, // EMOJI MODIFIER FITZPATRICK TYPE-1-2 + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::EmojiModifier, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Modifier base (emoji default) + skin tone; Encoded: ๐Ÿ‘ฆ๐Ÿป +#[test] +fn modifier_base_emoji_default_skin_tone() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F466, // BOY + 0x1F3FB, // EMOJI MODIFIER FITZPATRICK TYPE-1-2 + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::EmojiModifier, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Lone regional indicator; Encoded: ๐Ÿ‡บ +#[test] +fn lone_regional_indicator() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F1FA, // REGIONAL INDICATOR SYMBOL LETTER U + ], + categories: 
&[EmojiSegmentationCategory::Ri], + style: EmojiPresentationStyle::Default, + }); +} + +// Flag sequence (US); Encoded: ๐Ÿ‡บ๐Ÿ‡ธ +#[test] +fn flag_sequence_us() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F1FA, // REGIONAL INDICATOR SYMBOL LETTER U + 0x1F1F8, // REGIONAL INDICATOR SYMBOL LETTER S + ], + categories: &[EmojiSegmentationCategory::Ri, EmojiSegmentationCategory::Ri], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Double lone regional indicator + Flag sequence (US); Encoded: ๐Ÿ‡บ๐Ÿ‡บ๐Ÿ‡ธ +// +// FIXME: segmented clusters are incorrect +// โœ–๏ธ, [[0x1F1FA, 0x1F1FA], [0x1F1F8]] +// โœ”๏ธ, [[0x1F1FA], [0x1F1FA, 0x1F1F8]] +#[test] +#[ignore] +fn double_lone_regional_indicator_flag_sequence_us() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F1FA, // REGIONAL INDICATOR SYMBOL LETTER U + 0x1F1FA, // REGIONAL INDICATOR SYMBOL LETTER U + 0x1F1F8, // REGIONAL INDICATOR SYMBOL LETTER S + ], + categories: &[ + EmojiSegmentationCategory::Ri, + EmojiSegmentationCategory::Ri, + EmojiSegmentationCategory::Ri, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Text-default emoji + VS-15; Encoded: โ˜บ๏ธŽ +#[test] +fn text_default_emoji_vs15() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x263A, // WHITE SMILING FACE + 0xFE0E, // VARIATION SELECTOR-15 + ], + categories: &[ + EmojiSegmentationCategory::Emoji, + EmojiSegmentationCategory::Vs15, + ], + style: EmojiPresentationStyle::Text, + }); +} + +// Text-default emoji + VS-16; Encoded: โ˜บ๏ธ +#[test] +fn text_default_emoji_vs16() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x263A, // WHITE SMILING FACE + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::Emoji, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Emoji-default emoji + VS-15; Encoded: ๐Ÿ˜€๏ธŽ +#[test] +fn 
emoji_default_emoji_vs15() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F600, // GRINNING FACE + 0xFE0E, // VARIATION SELECTOR-15 + ], + categories: &[ + EmojiSegmentationCategory::EmojiPresentation, + EmojiSegmentationCategory::Vs15, + ], + style: EmojiPresentationStyle::Text, + }); +} + +// Emoji-default emoji + VS-16; Encoded: ๐Ÿ˜€๏ธ +#[test] +fn emoji_default_emoji_vs16() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F600, // GRINNING FACE + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::EmojiPresentation, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ family; Encoded: ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘ง +#[test] +fn zwj_family() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0x200D, // ZERO WIDTH JOINER + 0x1F469, // WOMAN + 0x200D, // ZERO WIDTH JOINER + 0x1F467, // GIRL + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Long ZWJ family (4 members); Encoded: ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ +#[test] +fn long_zwj_family() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0x200D, // ZERO WIDTH JOINER + 0x1F469, // WOMAN + 0x200D, // ZERO WIDTH JOINER + 0x1F467, // GIRL + 0x200D, // ZERO WIDTH JOINER + 0x1F466, // BOY + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ 
couple; Encoded: ๐Ÿ‘จโ€โคโ€๐Ÿ‘จ +#[test] +fn zwj_couple() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0x200D, // ZERO WIDTH JOINER + 0x2764, // HEAVY BLACK HEART + 0x200D, // ZERO WIDTH JOINER + 0x1F468, // MAN + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::Emoji, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ with VS-16 element; Encoded: ๐Ÿ‘จ๏ธโ€๐Ÿ‘ฉ +#[test] +fn zwj_with_vs16_element() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0xFE0F, // VARIATION SELECTOR-16 + 0x200D, // ZERO WIDTH JOINER + 0x1F469, // WOMAN + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Vs16, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ with VS-16 on both elements; Encoded: ๐Ÿ‘จ๏ธโ€๐Ÿ‘ฉ๏ธ +#[test] +fn zwj_with_vs16_on_both_elements() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0xFE0F, // VARIATION SELECTOR-16 + 0x200D, // ZERO WIDTH JOINER + 0x1F469, // WOMAN + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Vs16, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ after modifier sequence; Encoded: ๐Ÿ‘ฆ๐Ÿปโ€๐Ÿ’ป +#[test] +fn zwj_after_modifier_sequence() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F466, // BOY + 0x1F3FB, // EMOJI MODIFIER FITZPATRICK TYPE-1-2 + 0x200D, // ZERO WIDTH JOINER + 0x1F4BB, // PERSONAL COMPUTER + ], + categories: &[ + 
EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::EmojiModifier, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiPresentation, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// ZWJ technologist with skin tone; Encoded: ๐Ÿ‘จ๐Ÿปโ€๐Ÿ’ป +#[test] +fn zwj_technologist_with_skin_tone() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F468, // MAN + 0x1F3FB, // EMOJI MODIFIER FITZPATRICK TYPE-1-2 + 0x200D, // ZERO WIDTH JOINER + 0x1F4BB, // PERSONAL COMPUTER + ], + categories: &[ + EmojiSegmentationCategory::EmojiModifierBase, + EmojiSegmentationCategory::EmojiModifier, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiPresentation, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// VS-16 enables ZWJ continuation; Encoded: โ˜บ๏ธโ€๐Ÿ‘ฉ +#[test] +fn vs16_enables_zwj_continuation() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x263A, // WHITE SMILING FACE + 0xFE0F, // VARIATION SELECTOR-16 + 0x200D, // ZERO WIDTH JOINER + 0x1F469, // WOMAN + ], + categories: &[ + EmojiSegmentationCategory::Emoji, + EmojiSegmentationCategory::Vs16, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiModifierBase, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// Tag sequence (England); Encoded: ๐Ÿด๓ ง๓ ข๓ ฅ๓ ฎ๓ ง๓ ฟ +#[test] +fn tag_sequence_england() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3F4, // WAVING BLACK FLAG + 0xE0067, // TAG LATIN SMALL LETTER G + 0xE0062, // TAG LATIN SMALL LETTER B + 0xE0065, // TAG LATIN SMALL LETTER E + 0xE006E, // TAG LATIN SMALL LETTER N + 0xE0067, // TAG LATIN SMALL LETTER G + 0xE007F, // CANCEL TAG + ], + categories: &[ + EmojiSegmentationCategory::TagBase, + EmojiSegmentationCategory::TagSpec, + EmojiSegmentationCategory::TagSpec, + EmojiSegmentationCategory::TagSpec, + EmojiSegmentationCategory::TagSpec, + 
EmojiSegmentationCategory::TagSpec, + EmojiSegmentationCategory::TagEnd, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// TAG_BASE as ZWJ element; Encoded: ๐Ÿดโ€๐Ÿ˜€" +#[test] +fn tag_base_as_zwj_element() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3F4, // WAVING BLACK FLAG + 0x200D, // ZERO WIDTH JOINER + 0x1F600, // GRINNING FACE + ], + categories: &[ + EmojiSegmentationCategory::TagBase, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiPresentation, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// TAG_BASE + VS-16 + ZWJ; Encoded: ๐Ÿด๏ธโ€๐Ÿ˜€", +#[test] +fn tag_base_vs16_as_zwj() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3F4, // WAVING BLACK FLAG + 0xFE0F, // VARIATION SELECTOR-16 + 0x200D, // ZERO WIDTH JOINER + 0x1F600, // GRINNING FACE + ], + categories: &[ + EmojiSegmentationCategory::TagBase, + EmojiSegmentationCategory::Vs16, + EmojiSegmentationCategory::Zwj, + EmojiSegmentationCategory::EmojiPresentation, + ], + style: EmojiPresentationStyle::Emoji, + }); +} + +// TAG_BASE + VS-15; Encoded: ๐Ÿด๏ธŽ +#[test] +fn tag_base_vs15() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3F4, // WAVING BLACK FLAG + 0xFE0E, // VARIATION SELECTOR-15 + ], + categories: &[ + EmojiSegmentationCategory::TagBase, + EmojiSegmentationCategory::Vs15, + ], + style: EmojiPresentationStyle::Text, + }); +} + +// TAG_BASE + VS-16; Encoded: ๐Ÿด๏ธ +#[test] +fn tag_base_vs16() { + assert_emoji_segmenters_produce_same_result(TestEntity { + sequence: &[ + 0x1F3F4, // WAVING BLACK FLAG + 0xFE0F, // VARIATION SELECTOR-16 + ], + categories: &[ + EmojiSegmentationCategory::TagBase, + EmojiSegmentationCategory::Vs16, + ], + style: EmojiPresentationStyle::Emoji, + }); +} diff --git a/parley_data/src/emoji.rs b/parley_data/src/emoji.rs new file mode 100644 index 000000000..a356e9d80 --- /dev/null +++ b/parley_data/src/emoji.rs @@ -0,0 
+1,104 @@ +// Copyright 2026 the Parley Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +/// Emoji character properties relevant for text analysis. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub struct EmojiProperties(u32); + +impl EmojiProperties { + const EMOJI_SHIFT: u32 = 0; + const EXTENDED_PICTOGRAPHIC_SHIFT: u32 = 1; + const EMOJI_COMPONENT_SHIFT: u32 = 2; + const EMOJI_PRESENTATION_SHIFT: u32 = 3; + const EMOJI_MODIFIER_SHIFT: u32 = 4; + const EMOJI_MODIFIER_BASE_SHIFT: u32 = 5; + const REGIONAL_INDICATOR_SHIFT: u32 = 6; + + const EMOJI_MASK: u32 = 1 << Self::EMOJI_SHIFT; + const EXTENDED_PICTOGRAPHIC_MASK: u32 = 1 << Self::EXTENDED_PICTOGRAPHIC_SHIFT; + const EMOJI_COMPONENT_MASK: u32 = 1 << Self::EMOJI_COMPONENT_SHIFT; + const EMOJI_PRESENTATION_MASK: u32 = 1 << Self::EMOJI_PRESENTATION_SHIFT; + const EMOJI_MODIFIER_MASK: u32 = 1 << Self::EMOJI_MODIFIER_SHIFT; + const EMOJI_MODIFIER_BASE_MASK: u32 = 1 << Self::EMOJI_MODIFIER_BASE_SHIFT; + const REGIONAL_INDICATOR_MASK: u32 = 1 << Self::REGIONAL_INDICATOR_SHIFT; + + /// All zeroes. + pub const ZERO: Self = Self(0); + + #[cfg(feature = "baked")] + #[inline] + /// Returns the properties for a given character. 
+ pub const fn get(ch: char) -> Self { + Self(crate::generated::emoji_composite_get(ch as u32)) + } + + /// Creates a new [`EmojiProperties`] from the given properties + #[inline] + pub const fn new( + is_emoji: bool, + is_extended_pictographic: bool, + is_emoji_component: bool, + is_emoji_presentation: bool, + is_emoji_modifier: bool, + is_emoji_modifier_base: bool, + is_regional_indicator: bool, + ) -> Self { + Self( + (is_emoji as u32) << Self::EMOJI_SHIFT + | (is_extended_pictographic as u32) << Self::EXTENDED_PICTOGRAPHIC_SHIFT + | (is_emoji_component as u32) << Self::EMOJI_COMPONENT_SHIFT + | (is_emoji_presentation as u32) << Self::EMOJI_PRESENTATION_SHIFT + | (is_emoji_modifier as u32) << Self::EMOJI_MODIFIER_SHIFT + | (is_emoji_modifier_base as u32) << Self::EMOJI_MODIFIER_BASE_SHIFT + | (is_regional_indicator as u32) << Self::REGIONAL_INDICATOR_SHIFT, + ) + } + + /// Returns whether the character is an emoji. + #[inline] + pub const fn is_emoji(self) -> bool { + self.0 & Self::EMOJI_MASK != 0 + } + + /// Returns whether the character is an extended pictographic. + #[inline] + pub const fn is_extended_pictographic(self) -> bool { + self.0 & Self::EXTENDED_PICTOGRAPHIC_MASK != 0 + } + + /// Returns whether the character is an emoji component. + #[inline] + pub const fn is_emoji_component(self) -> bool { + self.0 & Self::EMOJI_COMPONENT_MASK != 0 + } + + /// Returns whether the character is an emoji presentation. + #[inline] + pub const fn is_emoji_presentation(self) -> bool { + self.0 & Self::EMOJI_PRESENTATION_MASK != 0 + } + + /// Returns whether the character is a modifier. + #[inline] + pub const fn is_emoji_modifier(self) -> bool { + self.0 & Self::EMOJI_MODIFIER_MASK != 0 + } + + /// Returns whether the character is a modifier base. + #[inline] + pub const fn is_emoji_modifier_base(self) -> bool { + self.0 & Self::EMOJI_MODIFIER_BASE_MASK != 0 + } + + /// Returns whether the character is a region indicator. 
+ #[inline] + pub const fn is_regional_indicator(self) -> bool { + self.0 & Self::REGIONAL_INDICATOR_MASK != 0 + } +} + +impl From for u32 { + fn from(value: EmojiProperties) -> Self { + value.0 + } +} diff --git a/parley_data/src/generated/mod.rs b/parley_data/src/generated/mod.rs index 0de3bf471..63b84e743 100644 --- a/parley_data/src/generated/mod.rs +++ b/parley_data/src/generated/mod.rs @@ -999,3 +999,311 @@ pub(crate) fn composite_packtab_get(u: usize) -> u32 { pub fn composite_get(cp: u32) -> u32 { composite_packtab_get(cp as usize) } + +static EMOJI_COMPOSITE_U8: [u8; 10] = [2, 3, 4, 5, 11, 15, 29, 35, 43, 77]; + +#[allow(missing_docs, reason = "generated code")] +#[inline] +pub const fn emoji_composite_get(cp: u32) -> u32 { + let idx = match cp { + 0x1F02C..=0x1F02F + | 0x1F094..=0x1F09F + | 0x1F0AF..=0x1F0B0 + | 0x1F0C0 + | 0x1F0D0 + | 0x1F0F6..=0x1F0FF + | 0x1F1AE..=0x1F1E5 + | 0x1F203..=0x1F20F + | 0x1F23C..=0x1F23F + | 0x1F249..=0x1F24F + | 0x1F252..=0x1F25F + | 0x1F266..=0x1F2FF + | 0x1F6D9..=0x1F6DB + | 0x1F6ED..=0x1F6EF + | 0x1F6FD..=0x1F6FF + | 0x1F7DA..=0x1F7DF + | 0x1F7EC..=0x1F7EF + | 0x1F7F1..=0x1F7FF + | 0x1F80C..=0x1F80F + | 0x1F848..=0x1F84F + | 0x1F85A..=0x1F85F + | 0x1F888..=0x1F88F + | 0x1F8AE..=0x1F8AF + | 0x1F8BC..=0x1F8BF + | 0x1F8C2..=0x1F8CF + | 0x1F8D9..=0x1F8FF + | 0x1FA58..=0x1FA5F + | 0x1FA6E..=0x1FA6F + | 0x1FA7D..=0x1FA7F + | 0x1FA8B..=0x1FA8D + | 0x1FAC7 + | 0x1FAC9..=0x1FACC + | 0x1FADD..=0x1FADE + | 0x1FAEB..=0x1FAEE + | 0x1FAF9..=0x1FAFF + | 0x1FC00..=0x1FFFD => 0, + 0xA9 + | 0xAE + | 0x203C + | 0x2049 + | 0x2122 + | 0x2139 + | 0x2194..=0x2199 + | 0x21A9..=0x21AA + | 0x2328 + | 0x23CF + | 0x23ED..=0x23EF + | 0x23F1..=0x23F2 + | 0x23F8..=0x23FA + | 0x24C2 + | 0x25AA..=0x25AB + | 0x25B6 + | 0x25C0 + | 0x25FB..=0x25FC + | 0x2600..=0x2604 + | 0x260E + | 0x2611 + | 0x2618 + | 0x2620 + | 0x2622..=0x2623 + | 0x2626 + | 0x262A + | 0x262E..=0x262F + | 0x2638..=0x263A + | 0x2640 + | 0x2642 + | 0x265F..=0x2660 + | 0x2663 + | 
0x2665..=0x2666 + | 0x2668 + | 0x267B + | 0x267E + | 0x2692 + | 0x2694..=0x2697 + | 0x2699 + | 0x269B..=0x269C + | 0x26A0 + | 0x26A7 + | 0x26B0..=0x26B1 + | 0x26C8 + | 0x26CF + | 0x26D1 + | 0x26D3 + | 0x26E9 + | 0x26F0..=0x26F1 + | 0x26F4 + | 0x26F7..=0x26F8 + | 0x2702 + | 0x2708..=0x2709 + | 0x270F + | 0x2712 + | 0x2714 + | 0x2716 + | 0x271D + | 0x2721 + | 0x2733..=0x2734 + | 0x2744 + | 0x2747 + | 0x2763..=0x2764 + | 0x27A1 + | 0x2934..=0x2935 + | 0x2B05..=0x2B07 + | 0x3030 + | 0x303D + | 0x3297 + | 0x3299 + | 0x1F170..=0x1F171 + | 0x1F17E..=0x1F17F + | 0x1F202 + | 0x1F237 + | 0x1F321 + | 0x1F324..=0x1F32C + | 0x1F336 + | 0x1F37D + | 0x1F396..=0x1F397 + | 0x1F399..=0x1F39B + | 0x1F39E..=0x1F39F + | 0x1F3CD..=0x1F3CE + | 0x1F3D4..=0x1F3DF + | 0x1F3F3 + | 0x1F3F5 + | 0x1F3F7 + | 0x1F43F + | 0x1F441 + | 0x1F4FD + | 0x1F549..=0x1F54A + | 0x1F56F..=0x1F570 + | 0x1F573 + | 0x1F576..=0x1F579 + | 0x1F587 + | 0x1F58A..=0x1F58D + | 0x1F5A5 + | 0x1F5A8 + | 0x1F5B1..=0x1F5B2 + | 0x1F5BC + | 0x1F5C2..=0x1F5C4 + | 0x1F5D1..=0x1F5D3 + | 0x1F5DC..=0x1F5DE + | 0x1F5E1 + | 0x1F5E3 + | 0x1F5E8 + | 0x1F5EF + | 0x1F5F3 + | 0x1F5FA + | 0x1F6CB + | 0x1F6CD..=0x1F6CF + | 0x1F6E0..=0x1F6E5 + | 0x1F6E9 + | 0x1F6F0 + | 0x1F6F3 => 1, + 0x200D | 0x20E3 | 0xFE0F | 0xE0020..=0xE007F => 2, + 0x23 | 0x2A | 0x30..=0x39 => 3, + 0x231A..=0x231B + | 0x23E9..=0x23EC + | 0x23F0 + | 0x23F3 + | 0x25FD..=0x25FE + | 0x2614..=0x2615 + | 0x2648..=0x2653 + | 0x267F + | 0x2693 + | 0x26A1 + | 0x26AA..=0x26AB + | 0x26BD..=0x26BE + | 0x26C4..=0x26C5 + | 0x26CE + | 0x26D4 + | 0x26EA + | 0x26F2..=0x26F3 + | 0x26F5 + | 0x26FA + | 0x26FD + | 0x2705 + | 0x2728 + | 0x274C + | 0x274E + | 0x2753..=0x2755 + | 0x2757 + | 0x2795..=0x2797 + | 0x27B0 + | 0x27BF + | 0x2B1B..=0x2B1C + | 0x2B50 + | 0x2B55 + | 0x1F004 + | 0x1F0CF + | 0x1F18E + | 0x1F191..=0x1F19A + | 0x1F201 + | 0x1F21A + | 0x1F22F + | 0x1F232..=0x1F236 + | 0x1F238..=0x1F23A + | 0x1F250..=0x1F251 + | 0x1F300..=0x1F320 + | 0x1F32D..=0x1F335 + | 0x1F337..=0x1F37C + 
| 0x1F37E..=0x1F384 + | 0x1F386..=0x1F393 + | 0x1F3A0..=0x1F3C1 + | 0x1F3C5..=0x1F3C6 + | 0x1F3C8..=0x1F3C9 + | 0x1F3CF..=0x1F3D3 + | 0x1F3E0..=0x1F3F0 + | 0x1F3F4 + | 0x1F3F8..=0x1F3FA + | 0x1F400..=0x1F43E + | 0x1F440 + | 0x1F444..=0x1F445 + | 0x1F451..=0x1F465 + | 0x1F479..=0x1F47B + | 0x1F47D..=0x1F480 + | 0x1F484 + | 0x1F488..=0x1F48E + | 0x1F490 + | 0x1F492..=0x1F4A9 + | 0x1F4AB..=0x1F4FC + | 0x1F4FF..=0x1F53D + | 0x1F54B..=0x1F54E + | 0x1F550..=0x1F567 + | 0x1F5A4 + | 0x1F5FB..=0x1F644 + | 0x1F648..=0x1F64A + | 0x1F680..=0x1F6A2 + | 0x1F6A4..=0x1F6B3 + | 0x1F6B7..=0x1F6BF + | 0x1F6C1..=0x1F6C5 + | 0x1F6D0..=0x1F6D2 + | 0x1F6D5..=0x1F6D8 + | 0x1F6DC..=0x1F6DF + | 0x1F6EB..=0x1F6EC + | 0x1F6F4..=0x1F6FC + | 0x1F7E0..=0x1F7EB + | 0x1F7F0 + | 0x1F90D..=0x1F90E + | 0x1F910..=0x1F917 + | 0x1F920..=0x1F925 + | 0x1F927..=0x1F92F + | 0x1F93A + | 0x1F93F..=0x1F945 + | 0x1F947..=0x1F976 + | 0x1F978..=0x1F9AF + | 0x1F9B4 + | 0x1F9B7 + | 0x1F9BA + | 0x1F9BC..=0x1F9CC + | 0x1F9D0 + | 0x1F9DE..=0x1F9FF + | 0x1FA70..=0x1FA7C + | 0x1FA80..=0x1FA8A + | 0x1FA8E..=0x1FAC2 + | 0x1FAC6 + | 0x1FAC8 + | 0x1FACD..=0x1FADC + | 0x1FADF..=0x1FAEA + | 0x1FAEF => 4, + 0x1F9B0..=0x1F9B3 => 5, + 0x1F3FB..=0x1F3FF => 6, + 0x261D | 0x26F9 | 0x270C..=0x270D | 0x1F3CB..=0x1F3CC | 0x1F574..=0x1F575 | 0x1F590 => 7, + 0x270A..=0x270B + | 0x1F385 + | 0x1F3C2..=0x1F3C4 + | 0x1F3C7 + | 0x1F3CA + | 0x1F442..=0x1F443 + | 0x1F446..=0x1F450 + | 0x1F466..=0x1F478 + | 0x1F47C + | 0x1F481..=0x1F483 + | 0x1F485..=0x1F487 + | 0x1F48F + | 0x1F491 + | 0x1F4AA + | 0x1F57A + | 0x1F595..=0x1F596 + | 0x1F645..=0x1F647 + | 0x1F64B..=0x1F64F + | 0x1F6A3 + | 0x1F6B4..=0x1F6B6 + | 0x1F6C0 + | 0x1F6CC + | 0x1F90C + | 0x1F90F + | 0x1F918..=0x1F91F + | 0x1F926 + | 0x1F930..=0x1F939 + | 0x1F93C..=0x1F93E + | 0x1F977 + | 0x1F9B5..=0x1F9B6 + | 0x1F9B8..=0x1F9B9 + | 0x1F9BB + | 0x1F9CD..=0x1F9CF + | 0x1F9D1..=0x1F9DD + | 0x1FAC3..=0x1FAC5 + | 0x1FAF0..=0x1FAF8 => 8, + 0x1F1E6..=0x1F1FF => 9, + _ => return 0, + }; + + 
EMOJI_COMPOSITE_U8[idx as usize] as u32 +} diff --git a/parley_data/src/lib.rs b/parley_data/src/lib.rs index b9dca64d6..ae48da4dc 100644 --- a/parley_data/src/lib.rs +++ b/parley_data/src/lib.rs @@ -12,6 +12,9 @@ use icu_properties::props::{BidiClass, GeneralCategory, GraphemeClusterBreak, Sc #[cfg(feature = "baked")] pub mod generated; +/// Emoji character properties relevant for text analysis. +pub mod emoji; + /// Unicode character properties relevant for text analysis. #[derive(Copy, Clone, Debug)] pub struct Properties(u32); diff --git a/parley_data_gen/Cargo.toml b/parley_data_gen/Cargo.toml index dae6a49ce..ec979d6a2 100644 --- a/parley_data_gen/Cargo.toml +++ b/parley_data_gen/Cargo.toml @@ -11,7 +11,7 @@ publish = false [dependencies] icu_properties = { workspace = true, features = ["compiled_data"] } packtab = { workspace = true } -parley_data = { workspace = true } +parley_data = { workspace = true, default-features = false } [lints] workspace = true diff --git a/parley_data_gen/src/lib.rs b/parley_data_gen/src/lib.rs index 534dc983f..e56a4c6ba 100644 --- a/parley_data_gen/src/lib.rs +++ b/parley_data_gen/src/lib.rs @@ -3,16 +3,20 @@ //! See `./main.rs`. 
-use icu_properties::props::{GeneralCategory, GraphemeClusterBreak, Script}; use icu_properties::{ CodePointMapData, CodePointSetData, props::{ - BidiClass, Emoji, ExtendedPictographic, LineBreak, RegionalIndicator, VariationSelector, + BidiClass, Emoji, EmojiComponent, EmojiModifier, EmojiModifierBase, EmojiPresentation, + ExtendedPictographic, GeneralCategory, GraphemeClusterBreak, LineBreak, RegionalIndicator, + Script, VariationSelector, }, }; -use parley_data::Properties; -use std::fmt::Write as _; -use std::io::{BufWriter, Write}; +use parley_data::{Properties, emoji::EmojiProperties}; +use std::{collections::BTreeMap, fmt::Write as _}; +use std::{ + io::{BufWriter, Write}, + ops::Range, +}; const COPYRIGHT_HEADER: &str = "// Copyright 2025 the Parley Authors\n// SPDX-License-Identifier: Apache-2.0 OR MIT\n"; @@ -29,35 +33,52 @@ pub struct Config { /// Exports ICU data as `PackTab` lookup tables + generated Rust code into the `out` directory. pub fn generate(out: std::path::PathBuf, config: &Config) { // Generate the data required for `CompositeProps`. - let values = { - // Dense values table for 0..=0x10FFFF - let mut values = Vec::::with_capacity(0x110000); - for cp in 0_u32..=0x10FFFF { - let v = Properties::new( - CodePointMapData::