diff --git a/lib/Horde/String.php b/lib/Horde/String.php index 2e1268c..eaea434 100644 --- a/lib/Horde/String.php +++ b/lib/Horde/String.php @@ -183,7 +183,10 @@ protected static function _convertCharset($input, $from, $to) if (class_exists('UConverter')) { $attemptedMethods[] = 'UConverter'; try { - $conv = new UConverter($to, $from); + $conv = new UConverter( + CharacterSets::toUConverter($to), + CharacterSets::toUConverter($from) + ); $out = $conv->convert($input); if ($out !== false && $out !== '') { return $out; diff --git a/src/CharacterSets.php b/src/CharacterSets.php index f018f7e..160537f 100644 --- a/src/CharacterSets.php +++ b/src/CharacterSets.php @@ -35,6 +35,24 @@ class CharacterSets 'unknown-8bit' => 'iso-8859-1', ]; + /** + * Map charset aliases to unambiguous ICU canonical names for UConverter. + * + * PHP's UConverter emits "Ambiguous encoding specified" warnings for names + * that map to multiple ICU converters (e.g. windows-1258 → ibm-5354 or + * cp1258). Use the canonical name PHP would pick anyway. + * + * @see https://www.php.net/manual/en/class.uconverter.php + */ + private static array $uconverterMap = [ + 'big5-hkscs' => 'ibm-1375_P100-2008', + 'shift_jis' => 'ibm-943_P15A-2003', + 'tis-620' => 'windows-874-2000', + 'windows-1258' => 'cp1258', + 'windows-936' => 'windows-936-2000', + 'windows-950' => 'windows-950-2000', + ]; + /** * Normalize a character set identifier to a canonical name. * @@ -65,4 +83,19 @@ public static function toMbstring(string $identifier): string // TODO: Check against mb_list_encoding return self::normalize($identifier); } + + /** + * Convert charset identifier to an unambiguous UConverter/ICU name. + * + * @param string $identifier The charset identifier. + * + * @return string The UConverter-compatible charset name. + */ + public static function toUConverter(string $identifier): string + { + $identifier = self::normalize($identifier); + $lower = strtolower($identifier); + + return self::$uconverterMap[$lower] ?? $identifier; + } } diff --git a/src/HordeString.php b/src/HordeString.php index 71ebaf7..3dc9387 100644 --- a/src/HordeString.php +++ b/src/HordeString.php @@ -199,7 +199,10 @@ protected static function _convertCharset($input, $from, $to) if (class_exists('UConverter')) { $attemptedMethods[] = 'UConverter'; try { - $conv = new UConverter($to, $from); + $conv = new UConverter( + CharacterSets::toUConverter($to), + CharacterSets::toUConverter($from) + ); $out = $conv->convert($input); if ($out !== false && $out !== '') { return $out; diff --git a/test/CharacterSetsTest.php b/test/CharacterSetsTest.php index e55c969..99cb2d3 100644 --- a/test/CharacterSetsTest.php +++ b/test/CharacterSetsTest.php @@ -78,4 +78,16 @@ public function testToMbstringPreservesOtherCharsets(): void // Non-normalized charsets should pass through $this->assertEquals('iso-8859-1', CharacterSets::toMbstring('iso-8859-1')); } + + public function testToUConverterWindows1258(): void + { + $this->assertEquals('cp1258', CharacterSets::toUConverter('windows-1258')); + $this->assertEquals('cp1258', CharacterSets::toUConverter('Windows-1258')); + } + + public function testToUConverterPreservesUnambiguousCharsets(): void + { + $this->assertEquals('iso-8859-1', CharacterSets::toUConverter('iso-8859-1')); + $this->assertEquals('utf-8', CharacterSets::toUConverter('utf8mb4')); + } }